From 8c19e3c3d9b4440ac7b3867890ea30cfe5b9a27e Mon Sep 17 00:00:00 2001
From: Irina Levit
Date: Tue, 23 Dec 2025 15:50:10 -0500
Subject: [PATCH] Remove .history directory from git tracking

---
 .gitignore | 1 +
 .history/eohi1/BS_means_20250922131352.vb | 118 --
 .history/eohi1/BS_means_20250922131356.vb | 118 --
 .history/eohi1/BS_means_20250922131406.vb | 118 --
 ... 01 - domain mean totals _20251007232436.r | 25 -
 ... 01 - domain mean totals _20251007232440.r | 25 -
 ... 01 - domain mean totals _20251007232447.r | 25 -
 ... 01 - domain mean totals _20251007232757.r | 25 -
 ...ixed anova domain means_20251004193424.rmd | 82 --
 ...ixed anova domain means_20251004193431.rmd | 121 --
 ...ixed anova domain means_20251004193438.rmd | 175 ---
 ...ixed anova domain means_20251004193443.rmd | 205 ----
 ...ixed anova domain means_20251004193454.rmd | 310 -----
 ...ixed anova domain means_20251004193500.rmd | 348 ------
 ...ixed anova domain means_20251004193511.rmd | 430 -------
 ...ixed anova domain means_20251004193523.rmd | 497 --------
 ...ixed anova domain means_20251004193543.rmd | 573 ---------
 ...ixed anova domain means_20251004193554.rmd | 660 ----------
 ...ixed anova domain means_20251004193559.rmd | 660 ----------
 ...ixed anova domain means_20251004193613.rmd | 660 ----------
 ...ixed anova domain means_20251004193658.rmd | 660 ----------
 .history/eohi1/Untitled-1_20251020173338.r | 0
 .history/eohi1/Untitled-1_20251020173353.r | 15 -
 ...on_checks_before_cronbach_20250917154857.r | 162 ---
 ...on_checks_before_cronbach_20250917154901.r | 162 ---
 ...on_checks_before_cronbach_20250918115459.r | 164 ---
 .history/eohi1/brierVARS_20250922124859.vb | 0
 .history/eohi1/brierVARS_20250922124900.vb | 82 --
 .history/eohi1/brierVARS_20250922125338.vb | 87 --
 .history/eohi1/brierVARS_20250922125353.vb | 87 --
 .history/eohi1/brierVARS_20250922125648.vb | 87 --
 .history/eohi1/brierVARS_20250922125759.vb | 87 --
 .history/eohi1/brierVARS_20250922125807.vb | 87 --
 .history/eohi1/brierVARS_20250922125821.vb | 87 --
 .history/eohi1/brierVARS_20250922130226.vb | 141 ---
 .history/eohi1/brierVARS_20250922130242.vb | 141 ---
 .history/eohi1/brierVARS_20250922130423.vb | 141 ---
 .history/eohi1/brierVARS_20250922130433.vb | 141 ---
 .history/eohi1/brierVARS_20250922130435.vb | 141 ---
 .history/eohi1/brierVARS_20250922131020.vb | 141 ---
 .../eohi1/correlation matrix_20251027115053.r | 0
 .../eohi1/correlation matrix_20251027115054.r | 40 -
 .../eohi1/correlation matrix_20251027115900.r | 50 -
 .../eohi1/correlation matrix_20251027115902.r | 50 -
 .../eohi1/correlation matrix_20251027115905.r | 50 -
 .../eohi1/correlation matrix_20251027120022.r | 50 -
 .../eohi1/correlation matrix_20251027120056.r | 67 -
 .../eohi1/correlation matrix_20251027120100.r | 67 -
 .../eohi1/correlation matrix_20251027120122.r | 67 -
 .../eohi1/correlation matrix_20251027120345.r | 67 -
 .../eohi1/correlation matrix_20251027120348.r | 67 -
 .../eohi1/correlation matrix_20251027120351.r | 67 -
 .../eohi1/correlation matrix_20251027120448.r | 76 --
 .../eohi1/correlation matrix_20251027120450.r | 76 --
 .../eohi1/correlation matrix_20251027120505.r | 76 --
 .../eohi1/correlation matrix_20251027120720.r | 97 --
 .../eohi1/correlation matrix_20251027120722.r | 97 --
 .../eohi1/correlation matrix_20251027120752.r | 99 --
 .../eohi1/correlation matrix_20251027120754.r | 99 --
 .../eohi1/correlation matrix_20251027120804.r | 99 --
 .../eohi1/correlation matrix_20251027120919.r | 111 --
 .../eohi1/correlation matrix_20251027120930.r | 111 --
 .../eohi1/correlation matrix_20251027120933.r | 111 --
 .../eohi1/correlation matrix_20251027120955.r | 105 --
 .../eohi1/correlation matrix_20251027120958.r | 105 --
 .../eohi1/correlation matrix_20251027121016.r | 105 --
 .../eohi1/correlation matrix_20251027134544.r | 105 --
 .../eohi1/correlation matrix_20251029115844.r | 103 --
 ...rier score x eohi and cal_20250922132833.r | 67 -
 ...rier score x eohi and cal_20250922132837.r | 67 -
 ...rier score x eohi and cal_20250922132923.r | 74 --
 ...rier score x eohi and cal_20250922132932.r | 74 --
 ...rier score x eohi and cal_20250922132938.r | 74 --
 ...rier score x eohi and cal_20250922133020.r | 75 --
 ...rier score x eohi and cal_20250922133026.r | 75 --
 ...rier score x eohi and cal_20250922133028.r | 75 --
 ...rier score x eohi and cal_20250922133336.r | 86 --
 ...rier score x eohi and cal_20250922133339.r | 86 --
 ...rier score x eohi and cal_20250922133341.r | 86 --
 ...rier score x eohi and cal_20250922134044.r | 86 --
 ...rier score x eohi and cal_20250922135638.r | 86 --
 ...rier score x eohi and cal_20251008160336.r | 86 --
 ...rier score x eohi and cal_20251008160455.r | 86 --
 ...rier score x eohi and cal_20251008160500.r | 86 --
 ...rier score x eohi and cal_20251008160505.r | 86 --
 ...rier score x eohi and cal_20251008160539.r | 76 --
 ...rier score x eohi and cal_20251008160544.r | 76 --
 ...rier score x eohi and cal_20251008160550.r | 76 --
 ...rier score x eohi and cal_20251008162952.r | 76 --
 ...rier score x eohi and cal_20251008185636.r | 81 --
 ...rier score x eohi and cal_20251008185645.r | 81 --
 ...rier score x eohi and cal_20251008185658.r | 81 --
 ...ions - eohi x calibration_20250915134710.r | 304 -----
 ...ions - eohi x calibration_20250915134720.r | 304 -----
 ...ions - eohi x calibration_20250915134733.r | 304 -----
 ...ions - eohi x calibration_20250915134827.r | 306 -----
 ...ions - eohi x calibration_20250915134916.r | 307 -----
 ...ions - eohi x calibration_20250915142201.r | 307 -----
 ...ions - eohi x calibration_20250915142559.r | 307 -----
 ...ions - eohi x calibration_20250915142603.r | 307 -----
 ...ions - eohi x calibration_20250916091406.r | 280 -----
 ...ions - eohi x calibration_20250916091630.r | 280 -----
 ...ions - eohi x calibration_20250916092909.r | 307 -----
 ...ions - eohi x calibration_20250916092913.r | 303 -----
 ...ions - eohi x calibration_20250916092918.r | 299 -----
 ...ions - eohi x calibration_20250916092925.r | 299 -----
 ...ions - eohi x calibration_20250916092959.r | 288 -----
 ...ions - eohi x calibration_20250916093530.r | 288 -----
 ...ions - eohi x calibration_20250916095303.r | 288 -----
 ...ions - eohi x calibration_20250916112614.r | 303 -----
 ...ions - eohi x calibration_20250916112923.r | 303 -----
 ...ions - eohi x calibration_20250916113008.r | 303 -----
 ...ions - eohi x calibration_20250929153154.r | 303 -----
 .../correlations - scales_20251007231341.r | 0
 .../correlations - scales_20251007232519.r | 4 -
 .../correlations - scales_20251007232812.r | 2 -
 .../correlations - scales_20251007233020.r | 74 --
 .../correlations - scales_20251007233026.r | 74 --
 .../correlations - scales_20251007233050.r | 71 --
 .../correlations - scales_20251007233059.r | 71 --
 .../correlations - scales_20251007233106.r | 71 --
 .../correlations - scales_20251007233151.r | 72 --
 .../correlations - scales_20251007233159.r | 80 --
 .../correlations - scales_20251007233210.r | 87 --
 .../correlations - scales_20251007233219.r | 95 --
 .../correlations - scales_20251007233224.r | 95 --
 .../correlations - scales_20251007233307.r | 77 --
 .../correlations - scales_20251007233311.r | 66 -
 .../correlations - scales_20251007233319.r | 66 -
 .../correlations - scales_20251007233424.r | 66 -
 .../correlations - scales_20251007233541.r | 171 ---
 .../correlations - scales_20251007233548.r | 171 ---
 .../correlations - scales_20251007233652.r | 171 ---
 .../correlations - scales_20251007233731.r | 167 ---
 .../correlations - scales_20251007233734.r | 166 ---
 .../correlations - scales_20251007233736.r | 165 ---
 .../correlations - scales_20251007233739.r | 160 ---
 .../correlations - scales_20251007233744.r | 160 ---
 .../correlations - scales_20251007233749.r | 160 ---
 .../correlations - scales_20251007234152.r | 194 ---
 .../correlations - scales_20251007234202.r | 194 ---
 .../correlations - scales_20251007234339.r | 194 ---
 .../correlations - scales_20251007234406.r | 194 ---
 .../correlations - scales_20251008001047.r | 194 ---
 .../correlations - scales_20251008001054.r | 194 ---
 .../correlations - scales_20251008001102.r | 194 ---
 .../correlations - scales_20251008005438.r | 194 ---
 .../correlations - scales_20251008154804.r | 194 ---
 .../correlations - scales_20251008154934.r | 195 ---
 .../correlations - scales_20251008154940.r | 191 ---
 .../correlations - scales_20251008154947.r | 184 ---
 .../correlations - scales_20251008154952.r | 183 ---
 .../correlations - scales_20251008155000.r | 183 ---
 .../correlations - scales_20251008155103.r | 183 ---
 .../correlations - scales_20251008155139.r | 183 ---
 .../correlations - scales_20251008155218.r | 171 ---
 .../correlations - scales_20251008155221.r | 171 ---
 .../correlations - scales_20251008155223.r | 171 ---
 .../correlations - scales_20251008155225.r | 171 ---
 .../correlations - scales_20251008171710.r | 171 ---
 ... average over time frames_20251008000048.r | 45 -
 ... average over time frames_20251008000055.r | 45 -
 ... average over time frames_20251008000150.r | 36 -
 ... average over time frames_20251008000158.r | 31 -
 ... average over time frames_20251008000203.r | 31 -
 ... average over time frames_20251008000212.r | 31 -
 ... average over time frames_20251008000542.r | 38 -
 ... average over time frames_20251008000547.r | 45 -
 ... average over time frames_20251008000552.r | 45 -
 ... average over time frames_20251008000600.r | 45 -
 ... average over time frames_20251008000956.r | 45 -
 ... domain specific EHI vars_20251008152448.r | 0
 ... domain specific EHI vars_20251008152542.r | 2 -
 ... domain specific EHI vars_20251008153309.r | 104 --
 ... domain specific EHI vars_20251008153323.r | 104 --
 ... domain specific EHI vars_20251008153333.r | 104 --
 ... domain specific EHI vars_20251008153816.r | 104 --
 ... domain specific EHI vars_20251008162958.r | 104 --
 ...- CORRECT ehi var means_20251008153939.txt | 0
 ...- CORRECT ehi var means_20251008153946.txt | 5 -
 ...4 - CORRECT ehi var means_20251008153958.r | 0
 ...4 - CORRECT ehi var means_20251008153959.r | 5 -
 ...4 - CORRECT ehi var means_20251008154513.r | 167 ---
 ...4 - CORRECT ehi var means_20251008154517.r | 167 ---
 ...4 - CORRECT ehi var means_20251008154531.r | 167 ---
 ...4 - CORRECT ehi var means_20251008162957.r | 167 ---
 ... recoded 3 ordinal levels_20251027113023.r | 0
 ... recoded 3 ordinal levels_20251027113024.r | 5 -
 ... recoded 3 ordinal levels_20251027113338.r | 13 -
 ... recoded 3 ordinal levels_20251027113347.r | 13 -
 ... recoded 3 ordinal levels_20251027113517.r | 12 -
 ... recoded 3 ordinal levels_20251027113654.r | 37 -
 ... recoded 3 ordinal levels_20251027113658.r | 37 -
 ... recoded 3 ordinal levels_20251027113726.r | 37 -
 ... recoded 3 ordinal levels_20251027113730.r | 37 -
 ... recoded 3 ordinal levels_20251027113735.r | 37 -
 ... recoded 3 ordinal levels_20251027113937.r | 39 -
 ... recoded 3 ordinal levels_20251027113941.r | 39 -
 ... recoded 3 ordinal levels_20251027113947.r | 38 -
 ... recoded 3 ordinal levels_20251027114131.r | 38 -
 ... recoded 3 ordinal levels_20251027115643.r | 49 -
 ... recoded 3 ordinal levels_20251027115647.r | 49 -
 ... recoded 3 ordinal levels_20251027115649.r | 49 -
 ... recoded 3 ordinal levels_20251027115718.r | 38 -
 ... recoded 3 ordinal levels_20251027115829.r | 50 -
 ... recoded 3 ordinal levels_20251027115832.r | 50 -
 ... recoded 3 ordinal levels_20251027115834.r | 50 -
 ... recoded 3 ordinal levels_20251027115845.r | 38 -
 ... recoded 3 ordinal levels_20251027134607.r | 38 -
 ...- gen knowledge questions_20250918115552.r | 0
 ...- gen knowledge questions_20250918115553.r | 32 -
 ...- gen knowledge questions_20250918115703.r | 34 -
 ...- gen knowledge questions_20250918120055.r | 39 -
 ...- gen knowledge questions_20250918120100.r | 39 -
 ...- gen knowledge questions_20250918120102.r | 39 -
 ...- gen knowledge questions_20250918120515.r | 35 -
 ...- gen knowledge questions_20250918120600.r | 64 -
 ...- gen knowledge questions_20250918120656.r | 64 -
 ...- gen knowledge questions_20250918120727.r | 63 -
 ...- gen knowledge questions_20250918122358.r | 55 -
 ...- gen knowledge questions_20250918122401.r | 55 -
 ...- gen knowledge questions_20250918122413.r | 55 -
 ...- gen knowledge questions_20250918122634.r | 70 --
 ...- gen knowledge questions_20250918122637.r | 70 --
 ...- gen knowledge questions_20250918122638.r | 70 --
 ...- gen knowledge questions_20250918123114.r | 88 --
 ...- gen knowledge questions_20250918123117.r | 88 --
 ...- gen knowledge questions_20250918123133.r | 88 --
 ...- gen knowledge questions_20250918124915.r | 88 --
 ...- gen knowledge questions_20250918145603.r | 101 --
 ...- gen knowledge questions_20250918145606.r | 101 --
 ...- gen knowledge questions_20250918145728.r | 101 --
 ...- gen knowledge questions_20250918155602.r | 106 --
 ...- gen knowledge questions_20250918155604.r | 106 --
 ...- gen knowledge questions_20250918155605.r | 106 --
 ...- gen knowledge questions_20250918155636.r | 107 --
 .../e1 - reliability ehi_20251029093310.r | 0
 .../e1 - reliability ehi_20251029093311.r | 56 -
 .../e1 - reliability ehi_20251029093545.r | 56 -
 .../e1 - reliability ehi_20251029094220.r | 80 --
 .../e1 - reliability ehi_20251029094235.r | 80 --
 .../e1 - reliability ehi_20251029094336.r | 88 --
 .../e1 - reliability ehi_20251029094344.r | 88 --
 .../e1 - reliability ehi_20251029094408.r | 88 --
 .../eohi1/minimal_test_20251004194428.rmd | 16 -
 .../eohi1/minimal_test_20251004194431.rmd | 16 -
 .../eohi1/minimal_test_20251004194608.rmd | 17 -
 .../eohi1/minimal_test_20251004194638.rmd | 1 -
 .../eohi1/mixed anova - DGEN_20251003132154.r | 0
 .../eohi1/mixed anova - DGEN_20251003132235.r | 21 -
 .../eohi1/mixed anova - DGEN_20251003132528.r | 671 ----------
 .../eohi1/mixed anova - DGEN_20251003132534.r | 671 ----------
 .../eohi1/mixed anova - DGEN_20251003132751.r | 671 ----------
 .../eohi1/mixed anova - DGEN_20251006125959.r | 671 ----------
 .../eohi1/mixed anova - DGEN_20251006150203.r | 673 ----------
 .../eohi1/mixed anova - DGEN_20251006150325.r | 667 ----------
 .../eohi1/mixed anova - DGEN_20251006150338.r | 662 ----------
 .../eohi1/mixed anova - DGEN_20251006150343.r | 661 ----------
 .../eohi1/mixed anova - DGEN_20251006150351.r | 654 ----------
 .../eohi1/mixed anova - DGEN_20251006150356.r | 654 ----------
 .../eohi1/mixed anova - DGEN_20251006150414.r | 642 ----------
 .../eohi1/mixed anova - DGEN_20251006150433.r | 725 -----------
 .../eohi1/mixed anova - DGEN_20251006150451.r | 725 -----------
 .../eohi1/mixed anova - DGEN_20251006150515.r | 725 -----------
 .../eohi1/mixed anova - DGEN_20251006151447.r | 774 ------------
 .../eohi1/mixed anova - DGEN_20251006151454.r | 774 ------------
 .../eohi1/mixed anova - DGEN_20251006151507.r | 736 -----------
 .../eohi1/mixed anova - DGEN_20251006151514.r | 736 -----------
 .../eohi1/mixed anova - DGEN_20251006151518.r | 736 -----------
 .../eohi1/mixed anova - DGEN_20251006152215.r | 743 -----------
 .../eohi1/mixed anova - DGEN_20251006152226.r | 743 -----------
 .../eohi1/mixed anova - DGEN_20251006152312.r | 743 -----------
 .../eohi1/mixed anova - DGEN_20251006152851.r | 738 -----------
 .../eohi1/mixed anova - DGEN_20251006152926.r | 738 -----------
 .../eohi1/mixed anova - DGEN_20251006152934.r | 738 -----------
 .../eohi1/mixed anova - DGEN_20251006153823.r | 752 ------------
 .../eohi1/mixed anova - DGEN_20251006153831.r | 752 ------------
 .../eohi1/mixed anova - DGEN_20251006153845.r | 752 ------------
 .../eohi1/mixed anova - DGEN_20251006154345.r | 742 -----------
 .../eohi1/mixed anova - DGEN_20251006154352.r | 742 -----------
 .../eohi1/mixed anova - DGEN_20251006154403.r | 742 -----------
 .../eohi1/mixed anova - DGEN_20251006155006.r | 756 ------------
 .../eohi1/mixed anova - DGEN_20251006155018.r | 756 ------------
 .../eohi1/mixed anova - DGEN_20251006155049.r | 756 ------------
 .../eohi1/mixed anova - DGEN_20251006155243.r | 748 -----------
 .../eohi1/mixed anova - DGEN_20251006155250.r | 748 -----------
 .../eohi1/mixed anova - DGEN_20251006155253.r | 748 -----------
 .../eohi1/mixed anova - DGEN_20251006155346.r | 749 -----------
 .../eohi1/mixed anova - DGEN_20251006155349.r | 749 -----------
 .../eohi1/mixed anova - DGEN_20251006155353.r | 749 -----------
 .../eohi1/mixed anova - DGEN_20251006155709.r | 749 -----------
 .../eohi1/mixed anova - DGEN_20251006172448.r | 818 ------------
 .../eohi1/mixed anova - DGEN_20251006172451.r | 818 ------------
 .../eohi1/mixed anova - DGEN_20251006172503.r | 818 ------------
 .../eohi1/mixed anova - DGEN_20251006172600.r | 818 ------------
 .../eohi1/mixed anova - DGEN_20251006172604.r | 818 ------------
 .../eohi1/mixed anova - DGEN_20251006172636.r | 818 ------------
 .../eohi1/mixed anova - DGEN_20251006181939.r | 818 ------------
 .../eohi1/mixed anova - DGEN_20251006191125.r | 818 ------------
 .../eohi1/mixed anova - DGEN_20251006200121.r | 892 --------------
 .../eohi1/mixed anova - DGEN_20251006200138.r | 892 --------------
 .../eohi1/mixed anova - DGEN_20251006200320.r | 818 ------------
 .../eohi1/mixed anova - DGEN_20251006201108.r | 818 ------------
 .../eohi1/mixed anova - DGEN_20251006201257.r | 818 ------------
 .../eohi1/mixed anova - DGEN_20251007162848.r | 818 ------------
 .../eohi1/mixed anova - DGEN_20251007162926.r | 875 -------------
 .../eohi1/mixed anova - DGEN_20251007162939.r | 875 -------------
 .../eohi1/mixed anova - DGEN_20251007162949.r | 875 -------------
 .../eohi1/mixed anova - DGEN_20251007162951.r | 875 -------------
 .../eohi1/mixed anova - DGEN_20251007180951.r | 875 -------------
 .../eohi1/mixed anova - DGEN_20251010152730.r | 875 -------------
 .../eohi1/mixed anova - DGEN_20251010165036.r | 875 -------------
 ...ova - domain means SIMPLE_20251003125947.r | 320 -----
 ...ova - domain means SIMPLE_20251003130013.r | 320 -----
 ...ixed anova - domain means_20250912153102.r | 272 ----
 ...ixed anova - domain means_20250912153232.r | 274 -----
 ...ixed anova - domain means_20250912153241.r | 274 -----
 ...ixed anova - domain means_20250912153323.r | 274 -----
 ...ixed anova - domain means_20250912153326.r | 274 -----
 ...ixed anova - domain means_20250912153327.r | 274 -----
 ...ixed anova - domain means_20250912153352.r | 274 -----
 ...ixed anova - domain means_20250912153354.r | 274 -----
 ...ixed anova - domain means_20250912153402.r | 274 -----
 ...ixed anova - domain means_20250912154157.r | 274 -----
 ...ixed anova - domain means_20250912154200.r | 274 -----
 ...ixed anova - domain means_20250912154202.r | 274 -----
 ...ixed anova - domain means_20250912154303.r | 247 ----
 ...ixed anova - domain means_20250912154304.r | 247 ----
 ...ixed anova - domain means_20250912154305.r | 247 ----
 ...ixed anova - domain means_20250912154904.r | 247 ----
 ...ixed anova - domain means_20250912154909.r | 247 ----
 ...ixed anova - domain means_20250912155010.r | 246 ----
 ...ixed anova - domain means_20250912155017.r | 246 ----
 ...ixed anova - domain means_20250912155100.r | 246 ----
 ...ixed anova - domain means_20250912155158.r | 246 ----
 ...ixed anova - domain means_20250912155205.r | 246 ----
 ...ixed anova - domain means_20250912155217.r | 263 ----
 ...ixed anova - domain means_20250912155223.r | 263 ----
 ...ixed anova - domain means_20250912155224.r | 263 ----
 ...ixed anova - domain means_20250912155246.r | 275 -----
 ...ixed anova - domain means_20250912155247.r | 275 -----
 ...ixed anova - domain means_20250912155253.r | 275 -----
 ...ixed anova - domain means_20250912155329.r | 275 -----
 ...ixed anova - domain means_20250912155333.r | 279 -----
 ...ixed anova - domain means_20250912155338.r | 279 -----
 ...ixed anova - domain means_20250912155352.r | 279 -----
 ...ixed anova - domain means_20250912155830.r | 367 ------
 ...ixed anova - domain means_20250912155832.r | 367 ------
 ...ixed anova - domain means_20250912155837.r | 367 ------
 ...ixed anova - domain means_20250912155915.r | 374 ------
 ...ixed anova - domain means_20250912155919.r | 374 ------
 ...ixed anova - domain means_20250912155922.r | 374 ------
 ...ixed anova - domain means_20250912155944.r | 378 ------
 ...ixed anova - domain means_20250912155948.r | 384 ------
 ...ixed anova - domain means_20250912155953.r | 384 ------
 ...ixed anova - domain means_20250912160029.r | 380 ------
 ...ixed anova - domain means_20250912160033.r | 380 ------
 ...ixed anova - domain means_20250912160053.r | 380 ------
 ...ixed anova - domain means_20250912160118.r | 387 ------
 ...ixed anova - domain means_20250912160125.r | 398 ------
 ...ixed anova - domain means_20250912160131.r | 398 ------
 ...ixed anova - domain means_20250912160133.r | 398 ------
 ...ixed anova - domain means_20250912160206.r | 438 -------
 ...ixed anova - domain means_20250912160213.r | 438 -------
 ...ixed anova - domain means_20250912160217.r | 438 -------
 ...ixed anova - domain means_20250912160416.r | 438 -------
 ...ixed anova - domain means_20250912160420.r | 438 -------
 ...ixed anova - domain means_20250912160432.r | 438 -------
 ...ixed anova - domain means_20250912160551.r | 438 -------
 ...ixed anova - domain means_20250912160604.r | 438 -------
 ...ixed anova - domain means_20250912160607.r | 438 -------
 ...ixed anova - domain means_20250912160610.r | 438 -------
 ...ixed anova - domain means_20250912160644.r | 438 -------
 ...ixed anova - domain means_20250912160648.r | 438 -------
 ...ixed anova - domain means_20250912160652.r | 438 -------
 ...ixed anova - domain means_20250912160655.r | 438 -------
 ...ixed anova - domain means_20250912161036.r | 427 -------
 ...ixed anova - domain means_20250912161041.r | 427 -------
 ...ixed anova - domain means_20250912161046.r | 427 -------
 ...ixed anova - domain means_20250912161558.r | 410 -------
 ...ixed anova - domain means_20250912161610.r | 392 ------
 ...ixed anova - domain means_20250912161617.r | 392 ------
 ...ixed anova - domain means_20250912161619.r | 392 ------
 ...ixed anova - domain means_20250912161932.r | 403 ------
 ...ixed anova - domain means_20250912161937.r | 403 ------
 ...ixed anova - domain means_20250912162001.r | 403 ------
 ...ixed anova - domain means_20250912162009.r | 403 ------
 ...ixed anova - domain means_20250912162037.r | 403 ------
 ...ixed anova - domain means_20250912162116.r | 403 ------
 ...ixed anova - domain means_20250912162139.r | 479 --------
 ...ixed anova - domain means_20250912162145.r | 479 --------
 ...ixed anova - domain means_20250912162151.r | 479 --------
 ...ixed anova - domain means_20250912162247.r | 484 --------
 ...ixed anova - domain means_20250912162250.r | 484 --------
 ...ixed anova - domain means_20250912162255.r | 484 --------
 ...ixed anova - domain means_20250912162832.r | 477 -------
 ...ixed anova - domain means_20250912162837.r | 467 -------
 ...ixed anova - domain means_20250912162851.r | 467 -------
 ...ixed anova - domain means_20250912162858.r | 467 -------
 ...ixed anova - domain means_20250912163014.r | 467 -------
 ...ixed anova - domain means_20250912164147.r | 471 -------
 ...ixed anova - domain means_20250915110342.r | 460 -------
 ...ixed anova - domain means_20250915110402.r | 417 -------
 ...ixed anova - domain means_20250915110435.r | 412 -------
 ...ixed anova - domain means_20250915110448.r | 405 ------
 ...ixed anova - domain means_20250915110457.r | 405 ------
 ...ixed anova - domain means_20250915110504.r | 385 ------
 ...ixed anova - domain means_20250915110508.r | 385 ------
 ...ixed anova - domain means_20250915110512.r | 381 ------
 ...ixed anova - domain means_20250915110518.r | 381 ------
 ...ixed anova - domain means_20250915110521.r | 380 ------
 ...ixed anova - domain means_20250915110535.r | 381 ------
 ...ixed anova - domain means_20250915110539.r | 380 ------
 ...ixed anova - domain means_20250915110547.r | 380 ------
 ...ixed anova - domain means_20250915111101.r | 383 ------
 ...ixed anova - domain means_20250915111110.r | 383 ------
 ...ixed anova - domain means_20250915111114.r | 383 ------
 ...ixed anova - domain means_20250915111252.r | 395 ------
 ...ixed anova - domain means_20250915111308.r | 409 ------
 ...ixed anova - domain means_20250915111314.r | 409 ------
 ...ixed anova - domain means_20250915112435.r | 409 ------
 ...ixed anova - domain means_20250915112521.r | 409 ------
 ...ixed anova - domain means_20250915112528.r | 409 ------
 ...ixed anova - domain means_20250915112533.r | 409 ------
 ...ixed anova - domain means_20250915112607.r | 412 -------
 ...ixed anova - domain means_20250915112612.r | 412 -------
 ...ixed anova - domain means_20250915112617.r | 412 -------
 ...ixed anova - domain means_20250915113543.r | 447 -------
 ...ixed anova - domain means_20250915113742.r | 477 -------
 ...ixed anova - domain means_20250915113749.r | 477 -------
 ...ixed anova - domain means_20250915114518.r | 477 -------
 ...ixed anova - domain means_20250915114619.r | 477 -------
 ...ixed anova - domain means_20250915114727.r | 477 -------
 ...ixed anova - domain means_20250915114729.r | 477 -------
 ...ixed anova - domain means_20250915114731.r | 477 -------
 ...ixed anova - domain means_20250915114817.r | 477 -------
 ...ixed anova - domain means_20250915115032.r | 477 -------
 ...ixed anova - domain means_20250915120001.r | 544 --------
 ...ixed anova - domain means_20250915120010.r | 544 --------
 ...ixed anova - domain means_20250915120029.r | 612 ---------
 ...ixed anova - domain means_20250915120039.r | 612 ---------
 ...ixed anova - domain means_20250915120050.r | 612 ---------
 ...ixed anova - domain means_20250915120848.r | 659 ----------
 ...ixed anova - domain means_20250915120855.r | 659 ----------
 ...ixed anova - domain means_20250915120900.r | 659 ----------
 ...ixed anova - domain means_20250915121033.r | 685 -----------
 ...ixed anova - domain means_20250915121043.r | 736 -----------
 ...ixed anova - domain means_20250915121049.r | 736 -----------
 ...ixed anova - domain means_20250915121051.r | 736 -----------
 ...ixed anova - domain means_20250915121136.r | 745 -----------
 ...ixed anova - domain means_20250915121141.r | 745 -----------
 ...ixed anova - domain means_20250915121152.r | 774 ------------
 ...ixed anova - domain means_20250915121201.r | 774 ------------
 ...ixed anova - domain means_20250915121212.r | 774 ------------
 ...ixed anova - domain means_20250915121317.r | 774 ------------
 ...ixed anova - domain means_20250915122220.r | 736 -----------
 ...ixed anova - domain means_20250915122231.r | 719 -----------
 ...ixed anova - domain means_20250915122243.r | 691 -----------
 ...ixed anova - domain means_20250915122255.r | 750 -----------
 ...ixed anova - domain means_20250915122301.r | 754 ------------
 ...ixed anova - domain means_20250915122305.r | 749 -----------
 ...ixed anova - domain means_20250915122307.r | 748 -----------
 ...ixed anova - domain means_20250915122312.r | 747 -----------
 ...ixed anova - domain means_20250915122315.r | 745 -----------
 ...ixed anova - domain means_20250915122318.r | 744 -----------
 ...ixed anova - domain means_20250915122328.r | 744 -----------
 ...ixed anova - domain means_20250915122355.r | 744 -----------
 ...ixed anova - domain means_20250915122357.r | 744 -----------
 ...ixed anova - domain means_20250915122438.r | 734 -----------
 ...ixed anova - domain means_20250915122442.r | 732 -----------
 ...ixed anova - domain means_20250915122447.r | 733 -----------
 ...ixed anova - domain means_20250915122455.r | 744 -----------
 ...ixed anova - domain means_20250915122501.r | 744 -----------
 ...ixed anova - domain means_20250915122534.r | 743 -----------
 ...ixed anova - domain means_20250915122540.r | 743 -----------
 ...ixed anova - domain means_20250915122551.r | 743 -----------
 ...ixed anova - domain means_20250915122806.r | 743 -----------
 ...ixed anova - domain means_20250915122811.r | 743 -----------
 ...ixed anova - domain means_20250915122819.r | 743 -----------
 ...ixed anova - domain means_20250915122909.r | 743 -----------
 ...ixed anova - domain means_20250915122915.r | 743 -----------
 ...ixed anova - domain means_20250915122919.r | 743 -----------
 ...ixed anova - domain means_20250915132646.r | 743 -----------
 ...ixed anova - domain means_20250915133137.r | 791 ------------
 ...ixed anova - domain means_20250915133143.r | 791 ------------
 ...ixed anova - domain means_20250915133150.r | 791 ------------
 ...ixed anova - domain means_20250915133657.r | 791 ------------
 ...ixed anova - domain means_20250915133739.r | 791 ------------
 ...ixed anova - domain means_20250915133916.r | 791 ------------
 ...ixed anova - domain means_20250916095306.r | 791 ------------
 ...ixed anova - domain means_20250916100623.r | 830 -------------
 ...ixed anova - domain means_20250916100629.r | 830 -------------
 ...ixed anova - domain means_20250916100739.r | 830 -------------
 ...ixed anova - domain means_20250916104746.r | 836 -------------
 ...ixed anova - domain means_20250916104757.r | 836 -------------
 ...ixed anova - domain means_20250916104830.r | 836 -------------
 ...ixed anova - domain means_20250917120959.r | 836 -------------
 ...ixed anova - domain means_20251001163157.r | 836 -------------
 ...ixed anova - domain means_20251001171405.r | 800 ------------
 ...ixed anova - domain means_20251001171456.r | 645 ----------
 ...ixed anova - domain means_20251001171513.r | 610 ---------
 ...ixed anova - domain means_20251001171529.r | 614 ---------
 ...ixed anova - domain means_20251001171548.r | 614 ---------
 ...ixed anova - domain means_20251001171605.r | 614 ---------
 ...ixed anova - domain means_20251001171616.r | 614 ---------
 ...ixed anova - domain means_20251001171654.r | 614 ---------
 ...ixed anova - domain means_20251001171824.r | 615 ----------
 ...ixed anova - domain means_20251001171840.r | 615 ----------
 ...ixed anova - domain means_20251001172039.r | 615 ----------
 ...ixed anova - domain means_20251001174736.r | 615 ----------
 ...ixed anova - domain means_20251001174748.r | 615 ----------
 ...ixed anova - domain means_20251001174749.r | 615 ----------
 ...ixed anova - domain means_20251003122510.r | 617 ----------
 ...ixed anova - domain means_20251003122522.r | 638 ----------
 ...ixed anova - domain means_20251003122534.r | 638 ----------
 ...ixed anova - domain means_20251003122601.r | 638 ----------
 ...ixed anova - domain means_20251003122606.r | 638 ----------
 ...ixed anova - domain means_20251003122621.r | 648 ----------
 ...ixed anova - domain means_20251003122630.r | 648 ----------
 ...ixed anova - domain means_20251003122640.r | 648 ----------
 ...ixed anova - domain means_20251003123628.r | 652 ----------
 ...ixed anova - domain means_20251003123637.r | 652 ----------
 ...ixed anova - domain means_20251003124045.r | 652 ----------
 ...ixed anova - domain means_20251003124352.r | 631 ----------
 ...ixed anova - domain means_20251003124400.r | 631 ----------
 ...ixed anova - domain means_20251003124418.r | 634 ----------
 ...ixed anova - domain means_20251003124429.r | 635 ----------
 ...ixed anova - domain means_20251003124453.r | 638 ----------
 ...ixed anova - domain means_20251003124504.r | 638 ----------
 ...ixed anova - domain means_20251003124512.r | 638 ----------
 ...ixed anova - domain means_20251003124527.r | 638 ----------
 ...ixed anova - domain means_20251003124539.r | 632 ----------
 ...ixed anova - domain means_20251003124544.r | 632 ----------
 ...ixed anova - domain means_20251003124545.r | 632 ----------
 ...ixed anova - domain means_20251003124614.r | 627 ----------
 ...ixed anova - domain means_20251003124632.r | 624 ----------
 ...ixed anova - domain means_20251003124639.r | 629 ----------
 ...ixed anova - domain means_20251003124643.r | 629 ----------
 ...ixed anova - domain means_20251003124942.r | 671 ----------
 ...ixed anova - domain means_20251003124948.r | 671 ----------
 ...ixed anova - domain means_20251003124952.r | 671 ----------
 ...ixed anova - domain means_20251003125302.r | 665 ----------
 ...ixed anova - domain means_20251003125335.r | 668 ----------
 ...ixed anova - domain means_20251003130013.r | 668 ----------
 ...ixed anova - domain means_20251003130111.r | 668 ----------
 ...ixed anova - domain means_20251003134651.r | 728 -----------
 ...ixed anova - domain means_20251003134713.r | 728 -----------
 ...ixed anova - domain means_20251003134732.r | 744 -----------
 ...ixed anova - domain means_20251003134803.r | 766 ------------
 ...ixed anova - domain means_20251003134813.r | 766 ------------
 ...ixed anova - domain means_20251003134814.r | 766 ------------
 ...ixed anova - domain means_20251003134857.r | 766 ------------
 ...ixed anova - domain means_20251003135506.r | 767 ------------
 ...ixed anova - domain means_20251003135510.r | 767 ------------
 ...ixed anova - domain means_20251003135514.r | 767 ------------
 ...ixed anova - domain means_20251003140001.r | 767 ------------
 ...ixed anova - domain means_20251003140127.r | 767 ------------
 ...ixed anova - domain means_20251003140135.r | 767 ------------
 ...ixed anova - domain means_20251003140812.r | 865 -------------
 ...ixed anova - domain means_20251003140821.r | 865 -------------
 ...ixed anova - domain means_20251003141902.r | 767 ------------
 ...ixed anova - domain means_20251003141926.r | 865 -------------
 ...ixed anova - domain means_20251003142537.r | 865 -------------
 ...ixed anova - domain means_20251003142630.r | 865 -------------
 ...ixed anova - domain means_20251003142635.r | 865 -------------
 ...ixed anova - domain means_20251003142645.r | 865 -------------
 ...ixed anova - domain means_20251003142752.r | 863 -------------
 ...ixed anova - domain means_20251003142806.r | 863 -------------
 ...ixed anova - domain means_20251003142812.r | 863 -------------
 ...ixed anova - domain means_20251003143103.r | 863 -------------
 ...ixed anova - domain means_20251004194541.r | 578 ---------
 ...ixed anova - domain means_20251006125951.r | 578 ---------
 ...ixed anova - domain means_20251006131233.r | 666 ----------
 ...ixed anova - domain means_20251006131245.r | 666 ----------
 ...ixed anova - domain means_20251006142529.r | 666 ----------
 ...ixed anova - domain means_20251006142637.r | 669 ----------
 ...ixed anova - domain means_20251006142646.r | 681 ----------
 ...ixed anova - domain means_20251006142658.r | 681 ----------
 ...ixed anova - domain means_20251006142703.r | 681 ----------
 ...ixed anova - domain means_20251006145249.r | 681 ----------
 ...ixed anova - domain means_20251006152026.r | 671 ----------
 ...ixed anova - domain means_20251006152038.r | 661 ----------
 ...ixed anova - domain means_20251006152054.r | 661 ----------
 ...ixed anova - domain means_20251006153456.r | 661 ----------
 ...ixed anova - domain means_20251006155838.r | 682 ----------
 ...ixed anova - domain means_20251006155859.r | 704 -----------
 ...ixed anova - domain means_20251006155915.r | 704 -----------
 ...ixed anova - domain means_20251006155938.r | 704 -----------
 ...ixed anova - domain means_20251006162736.r | 763 ------------
 ...ixed anova - domain means_20251006162743.r | 763 ------------
 ...ixed anova - domain means_20251006162748.r | 763 ------------
 ...ixed anova - domain means_20251006162833.r | 769 ------------
 ...ixed anova - domain means_20251006162839.r | 769 ------------
 ...ixed anova - domain means_20251006162940.r | 769 ------------
 ...ixed anova - domain means_20251006181905.r | 769 ------------
 ...ixed anova - domain means_20251006181908.r | 769 ------------
 ...ixed anova - domain means_20251006181914.r | 769 ------------
 ...ixed anova - domain means_20251006181923.r | 769 ------------
 ...ixed anova - domain means_20251006181934.r | 769 ------------
 ...ixed anova - domain means_20251006181935.r | 769 ------------
 ...ixed anova - domain means_20251006183415.r | 769 ------------
 ...ixed anova - domain means_20251006223538.r | 769 ------------
 ...ixed anova - domain means_20251006225002.r | 864 -------------
 ...ixed anova - domain means_20251006225006.r | 864 -------------
 ...ixed anova - domain means_20251006225025.r | 864 -------------
 ...ixed anova - domain means_20251006225127.r | 853 -------------
 ...ixed anova - domain means_20251006225134.r | 852 -------------
 ...ixed anova - domain means_20251006225138.r | 852 -------------
 ...ixed anova - domain means_20251006225148.r | 852 -------------
 ...ixed anova - domain means_20251006225428.r | 852 -------------
 ...ixed anova - domain means_20251007155303.r | 852 -------------
 ...ixed anova - domain means_20251007162057.r | 761 ------------
 ...ixed anova - domain means_20251007162101.r | 761 ------------
 ...ixed anova - domain means_20251007162111.r | 669 ----------
 ...ixed anova - domain means_20251007162115.r | 669 ----------
 ...ixed anova - domain means_20251007162122.r | 723 -----------
 ...ixed anova - domain means_20251007162130.r | 723 -----------
 ...ixed anova - domain means_20251007162138.r | 723 -----------
 ...ixed anova - domain means_20251007162843.r | 723 -----------
 ...ixed anova - domain means_20251007182023.r | 768 ------------
 ...ixed anova - domain means_20251007182033.r | 768 ------------
 ...ixed anova - domain means_20251007182343.r | 768 ------------
 ...ixed anova - domain means_20251007182949.r | 768 ------------
 ...ixed anova - domain means_20251007182951.r | 768 ------------
 ...ixed anova - domain means_20251007182953.r | 768 ------------
 ...ixed anova - domain means_20251007183630.r | 769 ------------
 ...ixed anova - domain means_20251007183634.r | 769 ------------
 ...ixed anova - domain means_20251007183638.r | 769 ------------
 ...ixed anova - domain means_20251007183824.r | 769 ------------
 ...ixed anova - domain means_20251010145938.r | 769 ------------
 ...mixed anova - personality_20250916123628.r | 765 ------------
 ...mixed anova - personality_20250916123639.r | 765 ------------
 ...mixed anova - personality_20250916123640.r | 765 ------------
 ...mixed anova - personality_20250916125522.r | 765 ------------
 ...mixed anova - personality_20250916130413.r | 765 ------------
 ...mixed anova - personality_20250917121011.r | 765 ------------
 ...mixed anova - preferences_20250916113624.r | 836 -------------
 ...mixed anova - preferences_20250916113646.r | 836 -------------
 ...mixed anova - preferences_20250916113702.r | 836 -------------
 ...mixed anova - preferences_20250916113720.r | 836 -------------
 ...mixed anova - preferences_20250916113743.r | 836 -------------
 ...mixed anova - preferences_20250916113752.r | 836 -------------
 ...mixed anova - preferences_20250916113803.r | 836 -------------
 ...mixed anova - preferences_20250916113806.r | 836 -------------
 ...mixed anova - preferences_20250916120325.r | 836 -------------
 ...mixed anova - preferences_20250916120409.r | 795 ------------
 ...mixed anova - preferences_20250916120419.r | 769 ------------
 ...mixed anova - preferences_20250916120427.r | 767 ------------
 ...mixed anova - preferences_20250916120436.r | 763 ------------
 ...mixed anova - preferences_20250916120437.r | 763 ------------
 ...mixed anova - preferences_20250916120515.r | 765 ------------
 ...mixed anova - preferences_20250916120522.r | 765 ------------
 ...mixed anova - preferences_20250916120534.r | 765 ------------
 .../mixed anova - values_20250916125551.r | 0
 .../mixed anova - values_20250916125552.r | 765 ------------
 .../mixed anova - values_20250916125857.r | 765 ------------
 .../mixed anova - values_20250916125907.r | 765 ------------
 .../mixed anova - values_20250916125940.r | 765 ------------
 ...adme_domain_mixed_anova_20251002121520.txt | Bin 69326 -> 0 bytes
 ...adme_domain_mixed_anova_20251002121755.txt | Bin 69326 -> 0 bytes
 ...regression e1 - edu x ehi_20251020100405.r | 10 -
 ...regression e1 - edu x ehi_20251020100412.r | 10 -
 ...regression e1 - edu x ehi_20251020100550.r | 11 -
 ...regression e1 - edu x ehi_20251020100952.r | 16 -
 ...regression e1 - edu x ehi_20251020101841.r | 35 -
 ...regression e1 - edu x ehi_20251020103253.r | 45 -
 ...regression e1 - edu x ehi_20251020103850.r | 47 -
 ...regression e1 - edu x ehi_20251020113946.r | 118 --
 ...regression e1 - edu x ehi_20251020133831.r | 159 ---
 ...regression e1 - edu x ehi_20251020134231.r | 159 ---
 ...regression e1 - edu x ehi_20251020173226.r | 161 ---
 ...regression e1 - edu x ehi_20251020173252.r | 161 ---
 ...regression e1 - edu x ehi_20251023140538.r | 232 ----
 ...sion e1 - ehi x sex x age_20251020173352.r | 15 -
 ...sion e1 - ehi x sex x age_20251020173438.r | 16 -
 ...sion e1 - ehi x sex x age_20251020174241.r | 26 -
 ...sion e1 - ehi x sex x age_20251020174522.r | 39 -
 ...sion e1 - ehi x sex x age_20251020175347.r | 96 --
 ...sion e1 - ehi x sex x age_20251020180330.r | 96 --
 ...sion e1 - ehi x sex x age_20251021102925.r | 100 --
 ...sion e1 - ehi x sex x age_20251021104421.r | 101 --
 ...sion e1 - ehi x sex x age_20251021111526.r | 105 --
 ...sion e1 - ehi x sex x age_20251021120315.r | 151 ---
 ...sion e1 - ehi x sex x age_20251023105759.r | 320 -----
 ...ressions e1 - assumptions_20251015154931.r | 3 -
 ...ressions e1 - assumptions_20251016134509.r | 4 -
 ...ressions e1 - assumptions_20251016142437.r | 55 -
 ...ressions e1 - assumptions_20251016142502.r | 235 ----
 ...ressions e1 - assumptions_20251016142529.r | 401 ------
 ...ressions e1 - assumptions_20251016142552.r | 401 ------
 ...ressions e1 - assumptions_20251016142605.r | 401 ------
 ...ressions e1 - assumptions_20251016142612.r | 401 ------
 ...ressions e1 - assumptions_20251016142849.r | 393 ------
 ...ressions e1 - assumptions_20251016142956.r | 392 ------
 ...ressions e1 - assumptions_20251016143059.r | 397 ------
 ...ressions e1 - assumptions_20251016143109.r | 397 ------
 ...ressions e1 - assumptions_20251016143341.r | 397 ------
 ...ressions e1 - assumptions_20251016143415.r | 397 ------
 ...ressions e1 - assumptions_20251016143441.r | 397 ------
 ...ssions e1 - assumptions_20251016144540.qmd | 463 -------
 ...ssions e1 - assumptions_20251016144554.qmd | 463 -------
 ...ssions e1 - assumptions_20251016144652.qmd | 463 -------
 ...ssions e1 - assumptions_20251016145531.qmd | 463 -------
 ...ssions e1 - assumptions_20251016145539.qmd | 463 -------
 ...ssions e1 - assumptions_20251016145845.qmd | 463 -------
 ...ssions e1 - assumptions_20251016150135.qmd | 463 -------
 ...ssions e1 - assumptions_20251016150355.qmd | 467 -------
 ...ssions e1 - assumptions_20251016150436.qmd | 467 -------
 ...ssions e1 - assumptions_20251016150455.qmd | 479 --------
 ...ssions e1 - assumptions_20251016150502.qmd | 479 --------
 ...ssions e1 - assumptions_20251016150507.qmd | 479 --------
 ...ssions e1 - assumptions_20251016150527.qmd | 485 --------
 ...ssions e1 - assumptions_20251016150545.qmd | 485 --------
 ...ssions e1 - assumptions_20251016150558.qmd | 485 --------
 ...ssions e1 - assumptions_20251016154019.qmd | 489 --------
 ...ssions e1 - assumptions_20251016154024.qmd | 490 --------
 ...ssions e1 - assumptions_20251016154032.qmd | 491 --------
 ...ssions e1 - assumptions_20251016154037.qmd | 492 --------
 ...ssions e1 - assumptions_20251016154047.qmd | 492 --------
 ...ssions e1 - assumptions_20251016154106.qmd | 492 --------
 ...ssions e1 - assumptions_20251016154110.qmd | 492 --------
 ...ssions e1 - assumptions_20251016154202.qmd | 494 --------
 ...ssions e1 - assumptions_20251016154210.qmd | 495 --------
 ...ssions e1 - assumptions_20251016154217.qmd | 495 --------
 ...ssions e1 - assumptions_20251016154250.qmd | 495 --------
 ...ssions e1 - assumptions_20251016154501.qmd | 497 --------
 ...ssions e1 - assumptions_20251016154514.qmd | 497 --------
 ...ssions e1 - assumptions_20251016154524.qmd | 497 --------
 ...ssions e1 - assumptions_20251016154558.qmd | 502 --------
 ...ssions e1 - assumptions_20251016154609.qmd | 502 --------
 ...ssions e1 - assumptions_20251016154622.qmd | 502 --------
 ...ssions e1 - assumptions_20251016154628.qmd | 502 --------
 ...ssions e1 - assumptions_20251016154636.qmd | 502 --------
 ...ssions e1 - assumptions_20251016154647.qmd | 502 --------
 ...ssions e1 - assumptions_20251016154910.qmd | 504 --------
 ...ssions e1 - assumptions_20251016154925.qmd | 516 --------
 ...ssions e1 - assumptions_20251016154946.qmd | 516 --------
 ...ssions e1 - assumptions_20251016155002.qmd | 516 --------
 ...ssions e1 - assumptions_20251016155036.qmd | 518 --------
 ...ssions e1 - assumptions_20251016155057.qmd | 518 --------
 ...ssions e1 - assumptions_20251016155747.qmd | 518 --------
 ...ssions e1 - assumptions_20251016161118.qmd | 519 --------
 ...ssions e1 - assumptions_20251016161133.qmd | 517 --------
 ...ssions e1 - assumptions_20251016161154.qmd | 517 --------
 ...ressions e1 - assumptions_20251016173803.r | 398 ------
 ...ressions e1 - assumptions_20251016173806.r | 411 -------
 ...ressions e1 - assumptions_20251016173814.r | 411 -------
 ...ressions e1 - assumptions_20251016173817.r | 397 ------
 ...ressions e1 - assumptions_20251016174209.r | 397 ------
 ...ressions e1 - assumptions_20251016174256.r | 403 ------
 ...ressions e1 - assumptions_20251016174304.r | 403 ------
 ...ressions e1 - assumptions_20251016174308.r | 403 ------
 ...ressions e1 - assumptions_20251016174332.r | 403 ------
 ...ressions e1 - assumptions_20251016174341.r | 403 ------
 ...ressions e1 - assumptions_20251016174344.r | 403 ------
 ...ressions e1 - assumptions_20251016174349.r | 403 ------
 ...ressions e1 - assumptions_20251016174357.r | 403 ------
 ...ressions e1 - assumptions_20251016174402.r | 403 ------
 ...ressions e1 - assumptions_20251016174430.r | 402 ------
 ...ressions e1 - assumptions_20251016174433.r | 401 ------
 ...ressions e1 - assumptions_20251016174436.r | 398 ------
 ...ressions e1 - assumptions_20251016174439.r | 396 ------
 ...ressions e1 - assumptions_20251016174444.r | 394 ------
 ...ressions e1 - assumptions_20251016174451.r | 392 ------
 ...ressions e1 - assumptions_20251016174501.r | 390 ------
 ...ressions e1 - assumptions_20251016174512.r | 388 ------
 ...ressions e1 - assumptions_20251016174519.r | 386 ------
 ...ressions e1 - assumptions_20251016174521.r | 384 ------
 ...ressions e1 - assumptions_20251016174524.r | 381 ------
 ...ressions e1 - assumptions_20251016174526.r | 378 ------
 ...ressions e1 - assumptions_20251016174528.r | 375 ------
 ...ressions e1 - assumptions_20251016174532.r | 372 ------
 ...ressions e1 - assumptions_20251016174537.r | 369 ------
 ...ressions e1 - assumptions_20251016174542.r | 366 ------
 ...ressions e1 - assumptions_20251016174545.r | 363 ------
 ...ressions e1 - assumptions_20251016174548.r | 360 ------
 ...ressions e1 - assumptions_20251016174550.r | 357 ------
 ...ressions e1 - assumptions_20251016174557.r | 357 ------
 ...ressions e1 - assumptions_20251016174601.r | 357 ------
 ...ressions e1 - assumptions_20251016175238.r | 367 ------
 ...ressions e1 - assumptions_20251016175247.r | 373 ------
 ...ressions e1 - assumptions_20251016175258.r | 373 ------
 ...ressions e1 - assumptions_20251016175309.r | 373 ------
 ...ressions e1 - assumptions_20251016175319.r | 374 ------
 ...ressions e1 - assumptions_20251016175325.r | 375 ------
 ...ressions e1 - assumptions_20251016175340.r | 375 ------
 ...ressions e1 - assumptions_20251016175550.r | 375 ------
 ...ressions e1 - assumptions_20251016175653.r | 383 ------
 ...ressions e1 - assumptions_20251016175707.r | 383 ------
 ...ressions e1 - assumptions_20251016175808.r | 383 ------
 ...ressions e1 - assumptions_20251016175912.r | 390 ------
 ...ressions e1 - assumptions_20251016175924.r | 390 ------
 ...ressions e1 - assumptions_20251016175945.r | 390 ------
 ...y_analysis_cronbach_alpha_20250917154720.r | 202 ---
 ...y_analysis_cronbach_alpha_20250917154729.r | 202 ---
 ...y_analysis_cronbach_alpha_20250918120701.r | 204 ---
 .history/eohi1/test_knit_20251004194422.rmd | 22 -
 .history/eohi1/test_knit_20251004194431.rmd | 22 -
 .history/eohi1/test_knit_20251004194642.rmd | 1 -
 ...EADME_Variable_Creation_20251001133606.txt | 425 -------
 ...EADME_Variable_Creation_20251001133614.txt | 425 -------
 ...EADME_Variable_Creation_20251001133615.txt | 425 -------
 ...EADME_Variable_Creation_20251001133634.txt | 425 -------
 ...EADME_Variable_Creation_20251001154337.txt | 425 -------
 ...EADME_Variable_Creation_20251001154405.txt | 503 --------
 ...EADME_Variable_Creation_20251001154412.txt | 504 --------
 ...EADME_Variable_Creation_20251001154419.txt | 512 --------
 ...EADME_Variable_Creation_20251001154424.txt | 512 --------
 ...EADME_Variable_Creation_20251001154430.txt | 512 --------
 ...EADME_Variable_Creation_20251001154444.txt | 512 --------
 ...EADME_Variable_Creation_20251001155104.txt | 512 --------
 ...EADME_Variable_Creation_20251001155126.txt | 512 --------
 ...EADME_Variable_Creation_20251008114335.txt | 650 ----------
 ...EADME_Variable_Creation_20251008114354.txt | 659 ----------
 ...EADME_Variable_Creation_20251008114414.txt | 670 ----------
 ...EADME_Variable_Creation_20251008114419.txt | 670 ----------
 ...EADME_Variable_Creation_20251008114428.txt | 670 ----------
 ...EADME_Variable_Creation_20251008114444.txt | 670 ----------
 ...EADME_Variable_Creation_20251008114508.txt | 670 ----------
 ...EADME_Variable_Creation_20251008114531.txt | 670 ----------
 ...EADME_Variable_Creation_20251008171443.txt | 921 --------------
 ...EADME_Variable_Creation_20251008171453.txt | 926 --------------
 ...EADME_Variable_Creation_20251008171510.txt | 933 --------------
 ...EADME_Variable_Creation_20251008171520.txt | 934 --------------
 ...EADME_Variable_Creation_20251008171528.txt | 936 --------------
 ...EADME_Variable_Creation_20251008171541.txt | 971 ---------------
 ...EADME_Variable_Creation_20251008171604.txt | 971 ---------------
 ...EADME_Variable_Creation_20251008171626.txt | 971 ---------------
 ...EADME_Variable_Creation_20251008171628.txt | 971 ---------------
 ...EADME_Variable_Creation_20251029133334.txt | 1044 ----------------
 ...EADME_Variable_Creation_20251029133342.txt | 1047 ----------------
 ...EADME_Variable_Creation_20251029133348.txt | 1047 ----------------
 ...EADME_Variable_Creation_20251029133355.txt | 1047 ----------------
 ...EADME_Variable_Creation_20251029133422.txt | 1047 ----------------
 ...EADME_Variable_Creation_20251029133433.txt | 1047 ----------------
 .../RMD - mixed anova DGEN_20251003190744.rmd | 0
 .../RMD - mixed anova DGEN_20251006125956.rmd | 0
 ...lation matrix 2 - std ehi_20251029124228.r | 0
 ...lation matrix 2 - std ehi_20251029124229.r | 100 --
 ...lation matrix 2 - std ehi_20251029124329.r | 100 --
 ...ons - domain general vars_20251008122234.r | 184 ---
 ...ons - domain general vars_20251008122239.r | 184 ---
 ...ons - domain general vars_20251008122254.r | 184 ---
 ...ons - domain general vars_20251008122540.r | 175 ---
 ...ons - domain general vars_20251008122543.r | 175 ---
 ...ons - domain general vars_20251008122553.r | 175 ---
 ...ons - domain general vars_20251008122555.r | 175 ---
 ...ns - domain specific vars_20251008115022.r | 197 ---
 ...ns - domain specific vars_20251008115035.r | 197 ---
 ...ns - domain specific vars_20251008115036.r | 197 ---
 ...ns - domain specific vars_20251008115149.r | 197 ---
 ...ns - domain specific vars_20251008115152.r | 197 ---
 ...ns - domain specific vars_20251008115154.r | 197 ---
 ...ns - domain specific vars_20251008121216.r | 197 ---
 ...ECT - ehi + DGEN x scales_20251008171931.r | 176 ---
 ...ECT - ehi + DGEN x scales_20251008171942.r | 176 ---
 ...ECT - ehi + DGEN x scales_20251008171945.r | 176 ---
 ...ECT - ehi + DGEN x scales_20251008172011.r | 176 ---
 ...ECT - ehi + DGEN x scales_20251008172056.r | 176 ---
 ...ECT - ehi + DGEN x scales_20251008185510.r | 176 ---
 .../eohi2/dataP - DGEN means_20251001122522.r | 183 ---
 .../eohi2/dataP - DGEN means_20251001122534.r | 183 ---
 .../eohi2/dataP - DGEN means_20251001122539.r | 183 ---
 .../eohi2/dataP - DGEN means_20251001124312.r | 183 ---
 .../eohi2/dataP - DGEN means_20251001130521.r | 183 ---
 .../dataP - recode DGEN vars_20251001100032.r | 255 ----
 .../dataP - recode DGEN vars_20251001100044.r | 256 ----
 .../dataP - recode DGEN vars_20251001100055.r | 256 ----
 .../dataP - recode DGEN vars_20251001100142.r | 256 ----
 .../dataP - recode DGEN vars_20251001100344.r | 256 ----
 .../dataP - recode DGEN vars_20251001100345.r | 256 ----
 .../dataP - recode DGEN vars_20251001100404.r | 256 ----
 .../dataP - recode DGEN vars_20251001100408.r | 256 ----
 .../dataP - recode DGEN vars_20251001100532.r | 255 ----
 .../dataP - recode DGEN vars_20251001100534.r | 255 ----
 .../dataP - recode DGEN vars_20251001100537.r | 253 ----
 .../dataP - recode DGEN vars_20251001100544.r | 253 ----
 .../dataP - recode DGEN vars_20251001100547.r | 253 ----
 .../dataP - recode DGEN vars_20251001100624.r | 253 ----
 .../dataP - recode DGEN vars_20251001105736.r | 253 ----
 ...taP - recode present VARS_20251001110617.r | 197 ---
 ...taP - recode present VARS_20251001110629.r | 197 ---
 ...taP - recode present VARS_20251001110731.r | 197 ---
 ...taP - recode present VARS_20251001110926.r | 192 ---
 ...taP - recode present VARS_20251001110934.r | 192 ---
 ...taP - recode present VARS_20251001110936.r | 192 ---
 ...taP - recode present VARS_20251001111008.r | 192 ---
 ...taP - recode present VARS_20251001111014.r | 192 ---
 ...taP - recode present VARS_20251001111106.r | 192 ---
 ...taP - recode present VARS_20251001112101.r | 192 ---
 ...ataP - recode scales VARS_20251001113407.r | 0
 ...ataP - recode scales VARS_20251001113436.r | 4 -
 ...ataP - recode scales VARS_20251001114903.r | 281 -----
 ...ataP - recode scales VARS_20251001114914.r | 281 -----
 ...ataP - recode scales VARS_20251001115233.r | 281 -----
 ...ataP - recode scales VARS_20251001120138.r | 285 -----
 ...ataP - recode scales VARS_20251001120154.r | 298 -----
 ...ataP - recode scales VARS_20251001120204.r | 298 -----
 ...ataP - recode scales VARS_20251001120217.r | 298 -----
 ...ataP - recode scales VARS_20251001120228.r | 298 -----
 ...ataP - recode scales VARS_20251001121501.r | 298 -----
 ...time interval differences_20251001130451.r | 250 ----
 ...time interval differences_20251001130503.r | 250 ----
 ...time interval differences_20251001130516.r | 250 ----
 ...time interval differences_20251001130613.r | 268 ----
 ...time interval differences_20251001130619.r | 268 ----
 ...time interval differences_20251001130649.r | 268 ----
 ...time interval differences_20251001131102.r | 278 -----
 ...time interval differences_20251001131108.r | 280 -----
 ...time interval differences_20251001131115.r | 280 -----
 ...time interval differences_20251001131121.r | 280 -----
 ...time interval differences_20251001131239.r | 280 -----
 ...time interval differences_20251001131304.r | 280 -----
 ...time interval differences_20251001131331.r | 280 -----
 ...time interval differences_20251001131423.r | 280 -----
 ...time interval differences_20251001131422.r | 280 -----
 ...time interval differences_20251001132410.r | 280 -----
 ...time interval differences_20251001132459.r | 290 -----
 ...time interval differences_20251001132504.r | 288 -----
 ...time interval differences_20251001132511.r | 292 -----
 ...time interval differences_20251001132516.r | 292 -----
 ...time interval differences_20251001132521.r | 292 -----
 ...time interval differences_20251001132530.r | 292 -----
 ...time interval differences_20251001132540.r | 292 -----
 ...time interval differences_20251001132559.r | 292 -----
 ...time interval differences_20251001132623.r | 292 -----
 ...time interval differences_20251001132907.r | 292 -----
 .../dataP 07 - domain means_20251001152954.r | 0
 .../dataP 07 - domain means_20251001153004.r | 4 -
 .../dataP 07 - domain means_20251001154326.r | 265 ----
 .../dataP 07 - domain means_20251001154444.r | 265 ----
 .../dataP 07 - domain means_20251001155057.r | 265 ----
 .../dataP 07 - domain means_20251001162547.r | 265 ----
 .../dataP 07 - domain means_20251001163148.r | 265 ----
 .../dataP 07 - domain means_20251008193329.r | 265 ----
 .../dataP 07 - domain means_20251008193335.r | 265 ----
 .../dataP 07 - domain means_20251008193341.r | 265 ----
 .../dataP 07 - domain means_20251008193347.r | 265 ----
 .../dataP 07 - domain means_20251008193352.r | 265 ----
 .../dataP 07 - domain means_20251008193557.r | 265 ----
 .../dataP 07 - domain means_20251008193600.r | 265 ----
 .../dataP 08 - DGEN 510 vars_20251006194349.r | 94 --
 .../dataP 08 - DGEN 510 vars_20251006194411.r | 94 --
 .../dataP 08 - DGEN 510 vars_20251006194451.r | 94 --
 .../dataP 08 - DGEN 510 vars_20251006195055.r | 95 --
 .../dataP 08 - DGEN 510 vars_20251006195109.r | 95 --
 .../dataP 08 - DGEN 510 vars_20251006195118.r | 95 --
 .../dataP 08 - DGEN 510 vars_20251006195128.r | 95 --
 .../dataP 08 - DGEN 510 vars_20251006195318.r | 95 --
 ...nterval x direction means_20251008113501.r | 223 ----
 ...nterval x direction means_20251008113518.r | 223 ----
 ...nterval x direction means_20251008113613.r | 223 ----
 ...dataP 10 - DGEN mean vars_20251008121818.r | 115 --
 ...dataP 10 - DGEN mean vars_20251008121822.r | 115 --
 ...dataP 10 - DGEN mean vars_20251008121849.r | 115 --
 ...taP 11 - CORRECT ehi vars_20251008152253.r | 0
 ...taP 11 - CORRECT ehi vars_20251008163033.r | 0
 ...taP 11 - CORRECT ehi vars_20251008163045.r | 5 -
 ...taP 11 - CORRECT ehi vars_20251008163815.r | 235 ----
 ...taP 11 - CORRECT ehi vars_20251008163817.r | 235 ----
 ...2 - CORRECT DGEN ehi vars_20251008164446.r | 0
 ...2 - CORRECT DGEN ehi vars_20251008164447.r | 235 ----
 ...2 - CORRECT DGEN ehi vars_20251008164616.r | 179 ---
 ...2 - CORRECT DGEN ehi vars_20251008164646.r | 159 ---
 ...2 - CORRECT DGEN ehi vars_20251008164712.r | 118 --
 ...2 - CORRECT DGEN ehi vars_20251008164735.r | 118 --
 ...2 - CORRECT DGEN ehi vars_20251008164934.r | 118 --
 ...ehi domain specific means_20251008165421.r | 161 ---
 ...ehi domain specific means_20251008165443.r | 161 ---
 ...ehi domain specific means_20251008165448.r | 161 ---
 ...ehi domain specific means_20251008165551.r | 161 ---
 ...ehi domain specific means_20251008170838.r | 161 ---
 ...14 - all ehi global means_20251008171057.r | 142 ---
 ...14 - all ehi global means_20251008171120.r | 150 ---
 ...14 - all ehi global means_20251008171136.r | 140 ---
 ...14 - all ehi global means_20251008171157.r | 140 ---
 ...14 - all ehi global means_20251008171250.r | 140 ---
 ...ucation recoded ordinal 3_20251027135156.r | 0
 ...ucation recoded ordinal 3_20251027135157.r | 38 -
 ...ucation recoded ordinal 3_20251027141418.r | 38 -
 ...ucation recoded ordinal 3_20251027143845.r | 38 -
 ... - ehi vars standardized _20251029120227.r | 0
 ... - ehi vars standardized _20251029120228.r | 7 -
 ... - ehi vars standardized _20251029120234.r | 7 -
 ... - ehi vars standardized _20251029121728.r | 49 -
 ... - ehi vars standardized _20251029122336.r | 99 --
 ... - ehi vars standardized _20251029124145.r | 100 --
 .../e2 - correlation matrix_20251027143921.r | 0
 .../e2 - correlation matrix_20251027143922.r | 105 --
 .../e2 - correlation matrix_20251027144718.r | 105 --
 .../e2 - correlation matrix_20251027145122.r | 100 --
 .../e2 - correlation matrix_20251027145125.r | 100 --
 .../e2 - correlation matrix_20251027145133.r | 100 --
 .../eohi2/mixed anova - DGEN_20251003144019.r | 0
 .../eohi2/mixed anova - DGEN_20251003144020.r | 21 -
 .../eohi2/mixed anova - DGEN_20251003150009.r | 169 ---
 .../eohi2/mixed anova - DGEN_20251003150038.r | 360 ------
 .../eohi2/mixed anova - DGEN_20251003150047.r | 360 ------
 .../eohi2/mixed anova - DGEN_20251003150106.r | 466 -------
 .../eohi2/mixed anova - DGEN_20251003150128.r | 466 -------
 .../eohi2/mixed anova - DGEN_20251003150130.r | 466 -------
 .../eohi2/mixed anova - DGEN_20251003150137.r | 466 -------
 .../eohi2/mixed anova - DGEN_20251003150143.r | 735 -----------
 .../eohi2/mixed anova - DGEN_20251003150144.r | 735 -----------
 .../eohi2/mixed anova - DGEN_20251003150214.r | 735 -----------
 .../eohi2/mixed anova - DGEN_20251003150251.r | 736 -----------
 .../eohi2/mixed anova - DGEN_20251003150313.r | 747 -----------
 .../eohi2/mixed anova - DGEN_20251003150330.r | 759 ------------
 .../eohi2/mixed anova - DGEN_20251003150337.r | 751 -----------
 .../eohi2/mixed anova - DGEN_20251003150346.r | 751 -----------
 .../eohi2/mixed anova - DGEN_20251003152314.r | 751 -----------
 .../eohi2/mixed anova - DGEN_20251003170643.r | 751 -----------
 .../eohi2/mixed anova - DGEN_20251003170646.r | 751 -----------
 .../eohi2/mixed anova - DGEN_20251003170651.r | 751 -----------
 .../eohi2/mixed anova - DGEN_20251003170811.r | 751 -----------
 .../eohi2/mixed anova - DGEN_20251003171114.r | 751 -----------
 .../eohi2/mixed anova - DGEN_20251003171117.r | 751 -----------
 .../eohi2/mixed anova - DGEN_20251003171121.r | 751 -----------
 .../eohi2/mixed anova - DGEN_20251003171149.r | 751 -----------
 .../eohi2/mixed anova - DGEN_20251006125954.r | 751 -----------
 .../eohi2/mixed anova - DGEN_20251006191142.r | 751 -----------
 .../eohi2/mixed anova - DGEN_20251006192531.r | 748 -----------
 .../eohi2/mixed anova - DGEN_20251006192540.r | 748 -----------
 .../eohi2/mixed anova - DGEN_20251006192548.r | 740 -----------
 .../eohi2/mixed anova - DGEN_20251006192554.r | 739 -----------
 .../eohi2/mixed anova - DGEN_20251006192606.r | 726 -----------
 .../eohi2/mixed anova - DGEN_20251006192619.r | 724 -----------
 .../eohi2/mixed anova - DGEN_20251006192629.r | 723 -----------
 .../eohi2/mixed anova - DGEN_20251006192639.r | 719 -----------
 .../eohi2/mixed anova - DGEN_20251006192701.r | 713 -----------
 .../eohi2/mixed anova - DGEN_20251006192716.r | 698 -----------
 .../eohi2/mixed anova - DGEN_20251006192735.r | 685 -----------
 .../eohi2/mixed anova - DGEN_20251006192745.r | 684 -----------
 .../eohi2/mixed anova - DGEN_20251006192802.r | 680 ----------
 .../eohi2/mixed anova - DGEN_20251006192817.r | 677 ----------
 .../eohi2/mixed anova - DGEN_20251006192843.r | 751 -----------
 .../eohi2/mixed anova - DGEN_20251006192917.r | 751 -----------
 .../eohi2/mixed anova - DGEN_20251006192940.r | 751 -----------
 .../eohi2/mixed anova - DGEN_20251006193221.r | 747 -----------
 .../eohi2/mixed anova - DGEN_20251006193234.r | 747 -----------
 .../eohi2/mixed anova - DGEN_20251006193243.r | 747 -----------
 .../eohi2/mixed anova - DGEN_20251006193252.r | 747 -----------
 .../eohi2/mixed anova - DGEN_20251006193310.r | 747 -----------
 .../eohi2/mixed anova - DGEN_20251006193311.r | 747 -----------
 .../eohi2/mixed anova - DGEN_20251006193509.r | 747 -----------
 .../eohi2/mixed anova - DGEN_20251006193537.r | 771 ------------
 .../eohi2/mixed anova - DGEN_20251006193540.r | 747 -----------
 .../eohi2/mixed anova - DGEN_20251006195204.r | 751 -----------
 .../eohi2/mixed anova - DGEN_20251006195215.r | 751 -----------
 .../eohi2/mixed anova - DGEN_20251006195225.r | 751 -----------
 .../eohi2/mixed anova - DGEN_20251006195236.r | 751 -----------
 .../eohi2/mixed anova - DGEN_20251006195256.r | 751 -----------
 .../eohi2/mixed anova - DGEN_20251006195344.r | 751 -----------
 .../eohi2/mixed anova - DGEN_20251006195408.r | 755 ------------
 .../eohi2/mixed anova - DGEN_20251006195422.r | 755 ------------
 .../eohi2/mixed anova - DGEN_20251006195559.r | 755 ------------
 .../eohi2/mixed anova - DGEN_20251006200505.r | 755 ------------
 .../eohi2/mixed anova - DGEN_20251006200514.r | 755 ------------
 .../eohi2/mixed anova - DGEN_20251006200516.r | 755 ------------
 .../eohi2/mixed anova - DGEN_20251006201016.r | 755 ------------
 .../eohi2/mixed anova - DGEN_20251006201021.r | 755 ------------
 .../eohi2/mixed anova - DGEN_20251006201023.r | 755 ------------
 .../eohi2/mixed anova - DGEN_20251006201057.r | 755 ------------
 .../eohi2/mixed anova - DGEN_20251006230411.r | 755 ------------
 .../eohi2/mixed anova - DGEN_20251006231257.r | 775 ------------
 .../eohi2/mixed anova - DGEN_20251006231308.r | 830 -------------
 .../eohi2/mixed anova - DGEN_20251006231314.r | 830 -------------
 .../eohi2/mixed anova - DGEN_20251006231325.r | 830 -------------
 .../eohi2/mixed anova - DGEN_20251006231533.r | 899 --------------
 .../eohi2/mixed anova - DGEN_20251006231538.r | 899 --------------
 .../eohi2/mixed anova - DGEN_20251006232531.r | 1035 ----------------
 .../eohi2/mixed anova - DGEN_20251006232538.r | 1035 ----------------
 .../eohi2/mixed anova - DGEN_20251006232540.r | 1035 ----------------
 .../eohi2/mixed anova - DGEN_20251007103206.r | 1093 -----------------
 .../eohi2/mixed anova - DGEN_20251007103213.r | 1093 -----------------
 .../eohi2/mixed anova - DGEN_20251007103739.r | 1093 -----------------
 .../eohi2/mixed anova - DGEN_20251007104105.r | 892 --------------
 .../eohi2/mixed anova - DGEN_20251007104111.r | 892 --------------
 .../eohi2/mixed anova - DGEN_20251007104119.r | 892 --------------
 .../eohi2/mixed anova - DGEN_20251007105736.r | 892 --------------
 .../eohi2/mixed anova - DGEN_20251007185029.r | 892 --------------
 .../eohi2/mixed anova - DGEN_20251007185541.r | 892 --------------
 .../eohi2/mixed anova - DGEN_20251007192720.r | 892 --------------
 .../eohi2/mixed anova - DGEN_20251008190007.r | 892 --------------
 .../eohi2/mixed anova - DGEN_20251008190301.r | 896 --------------
 .../eohi2/mixed anova - DGEN_20251008190307.r | 896 --------------
 .../eohi2/mixed anova - DGEN_20251010141129.r | 896 --------------
 .../eohi2/mixed anova - DGEN_20251010160100.r | 896 --------------
 .../eohi2/mixed anova - DGEN_20251010165028.r | 896 --------------
 .../eohi2/mixed anova - DGEN_20251010165032.r | 896 --------------
 ...ixed anova - domain means_20251003143914.r | 0
 ...ixed anova - domain means_20251003143942.r | 21 -
 ...ixed anova - domain means_20251003145806.r | 88 --
 ...ixed anova - domain means_20251003145820.r | 139 ---
 ...ixed anova - domain means_20251003145845.r | 280 -----
 ...ixed anova - domain means_20251003145918.r | 434 -------
 ...ixed anova - domain means_20251003145949.r | 611 ---------
 ...ixed anova - domain means_20251003145955.r | 611 ---------
 ...ixed anova - domain means_20251003150004.r | 611 ---------
 ...ixed anova - domain means_20251003150017.r | 611 ---------
 ...ixed anova - domain means_20251003150021.r | 611 ---------
 ...ixed anova - domain means_20251003150023.r | 611 ---------
 ...ixed anova - domain means_20251003150026.r | 611 ---------
 ...ixed anova - domain means_20251003150029.r | 611 ---------
 ...ixed anova - domain means_20251003150032.r | 611 ---------
 ...ixed anova - domain means_20251003150034.r | 611 ---------
 ...ixed anova - domain means_20251003150037.r | 611 ---------
 ...ixed anova - domain means_20251003150049.r | 611 ---------
 ...ixed anova - domain means_20251003150056.r | 611 ---------
 ...ixed anova - domain means_20251003150107.r | 611 ---------
 ...ixed anova - domain means_20251003150118.r | 611 ---------
 ...ixed anova - domain means_20251003150155.r | 611 ---------
 ...ixed anova - domain means_20251003150222.r | 611 ---------
 ...ixed anova - domain means_20251003150238.r | 611 ---------
 ...ixed anova - domain means_20251003150306.r | 611 ---------
 ...ixed anova - domain means_20251003152345.r | 748 -----------
 ...ixed anova - domain means_20251003152354.r | 748 -----------
 ...ixed anova - domain means_20251003152532.r | 748 -----------
 ...ixed anova - domain means_20251003170438.r | 748 -----------
 ...ixed anova - domain means_20251006191145.r | 748 -----------
 ...ixed anova - domain means_20251006191927.r | 822 -------------
 ...ixed anova - domain means_20251006191941.r | 822 -------------
 ...ixed anova - domain means_20251006191952.r | 822 -------------
 ...ixed anova - domain means_20251006192942.r | 822 -------------
 ...ixed anova - domain means_20251006225442.r | 822 -------------
 ...ixed anova - domain means_20251007105958.r | 878 -------------
 ...ixed anova - domain means_20251007110011.r | 878 -------------
 ...ixed anova - domain means_20251007110320.r | 878 -------------
 ...ixed anova - domain means_20251007110333.r | 876 -------------
 ...ixed anova - domain means_20251007110641.r | 872 -------------
 ...ixed anova - domain means_20251007110654.r | 872 -------------
 ...ixed anova - domain means_20251007111158.r | 872 -------------
 ...ixed anova - domain means_20251007155225.r |
872 ------------- ...ixed anova - domain means_20251007184341.r | 872 ------------- ...ixed anova - domain means_20251007184520.r | 941 -------------- ...ixed anova - domain means_20251007184536.r | 941 -------------- ...ixed anova - domain means_20251007184540.r | 941 -------------- ...ixed anova - domain means_20251007184820.r | 923 -------------- ...ixed anova - domain means_20251007184823.r | 923 -------------- ...ixed anova - domain means_20251007184824.r | 923 -------------- ...ixed anova - domain means_20251007185023.r | 923 -------------- ...ixed anova - domain means_20251007185544.r | 923 -------------- ...ixed anova - domain means_20251008192902.r | 923 -------------- ...ixed anova - domain means_20251008192910.r | 927 -------------- ...ixed anova - domain means_20251008192916.r | 927 -------------- ...ixed anova - domain means_20251008192921.r | 927 -------------- ...ixed anova - domain means_20251008192926.r | 927 -------------- .../recode_likert_items_20251001085552.r | 93 -- .../recode_likert_items_20251001085616.r | 93 -- .../recode_likert_items_20251001085904.r | 95 -- .../recode_likert_items_20251001085909.r | 95 -- .../recode_likert_items_20251001085917.r | 95 -- .../recode_likert_items_20251001090003.r | 95 -- .../recode_likert_items_20251001090613.r | 99 -- .../recode_likert_items_20251001090620.r | 99 -- .../recode_likert_items_20251001090829.r | 164 --- .../recode_likert_items_20251001090840.r | 164 --- .../recode_likert_items_20251001090919.r | 164 --- .../recode_likert_items_20251001091002.r | 191 --- .../recode_likert_items_20251001091011.r | 191 --- .../recode_likert_items_20251001091016.r | 191 --- .../recode_likert_items_20251001091544.r | 191 --- .../recode_likert_items_20251001091551.r | 198 --- .../recode_likert_items_20251001091602.r | 198 --- .../recode_likert_items_20251001091838.r | 199 --- .../recode_likert_items_20251001091844.r | 200 --- .../recode_likert_items_20251001091846.r | 200 --- .../recode_likert_items_20251001091852.r | 200 --- .../recode_likert_items_20251001092353.r | 200 --- .../recode_likert_items_20251001092409.r | 266 ---- .../recode_likert_items_20251001092420.r | 266 ---- .../recode_likert_items_20251001092430.r | 266 ---- .../recode_likert_items_20251001094502.r | 266 ---- .../recode_likert_items_20251001105906.r | 266 ---- .../eohi2/reliability - ehi_20251028173023.r | 0 .../eohi2/reliability - ehi_20251028173024.r | 9 - .../eohi2/reliability - ehi_20251028173102.r | 67 - .../eohi2/reliability - ehi_20251028173106.r | 67 - .../eohi2/reliability - ehi_20251028173223.r | 67 - .../eohi2/reliability - ehi_20251028173226.r | 67 - .../eohi2/reliability - ehi_20251028173245.r | 56 - .../eohi2/reliability - ehi_20251028173247.r | 56 - .../eohi2/reliability - ehi_20251028173249.r | 56 - .../eohi2/reliability - ehi_20251028174139.r | 84 -- .../eohi2/reliability - ehi_20251028174146.r | 84 -- .../eohi2/reliability - ehi_20251028174151.r | 84 -- .../eohi2/reliability - ehi_20251028174152.r | 84 -- .../eohi2/reliability - ehi_20251028174209.r | 84 -- .../eohi2/reliability - ehi_20251028174329.r | 90 -- .../eohi2/reliability - ehi_20251028174333.r | 90 -- .../eohi2/reliability - ehi_20251028174401.r | 80 -- .../eohi2/reliability - ehi_20251028174407.r | 80 -- .../eohi2/reliability - ehi_20251028174412.r | 80 -- .../eohi2/reliability - ehi_20251028174501.r | 87 -- .../eohi2/reliability - ehi_20251028174503.r | 87 -- .../eohi2/reliability - ehi_20251028174701.r | 87 -- .../eohi2/reliability - ehi_20251028174922.r | 88 -- .../eohi2/reliability - 
ehi_20251028174926.r | 88 -- .../eohi2/reliability - ehi_20251028174958.r | 88 -- .../eohi2/reliability - ehi_20251028175001.r | 88 -- .../eohi2/reliability - ehi_20251028175110.r | 88 -- .../eohi2/reliability - ehi_20251028175138.r | 88 -- .../eohi2/reliability - ehi_20251028175140.r | 88 -- .../eohi2/reliability - ehi_20251028175150.r | 87 -- .../eohi2/reliability - ehi_20251028175152.r | 87 -- .../eohi2/reliability - ehi_20251028175211.r | 86 -- .../eohi2/reliability - ehi_20251028175214.r | 86 -- .../eohi2/reliability - ehi_20251028175223.r | 61 - .../eohi2/reliability - ehi_20251028175227.r | 58 - .../eohi2/reliability - ehi_20251028175230.r | 58 - .../eohi2/reliability - ehi_20251028175322.r | 58 - .../eohi2/reliability - ehi_20251028175530.r | 61 - .../eohi2/reliability - ehi_20251028175535.r | 61 - .../eohi2/reliability - ehi_20251028175559.r | 61 - .../eohi2/reliability - ehi_20251028175617.r | 58 - .../eohi2/reliability - ehi_20251028175618.r | 58 - .../eohi2/reliability - ehi_20251028175644.r | 58 - .../eohi2/reliability - ehi_20251028175648.r | 58 - .../eohi2/reliability - ehi_20251028175651.r | 58 - .../eohi2/reliability - ehi_20251028180009.r | 56 - .../eohi2/reliability - ehi_20251028180012.r | 53 - .../eohi2/reliability - ehi_20251028180015.r | 53 - .../eohi2/reliability - ehi_20251028180017.r | 53 - .../eohi2/reliability - ehi_20251028180125.r | 56 - .../eohi2/reliability - ehi_20251028180127.r | 56 - .../eohi2/reliability - ehi_20251028180136.r | 56 - .../reliability analysis_20251027165337.r | 0 .../reliability analysis_20251027165338.r | 7 - .../reliability analysis_20251028141004.r | 90 -- .../reliability analysis_20251028141504.r | 130 -- .../reliability analysis_20251028144222.r | 87 -- .../reliability analysis_20251028151252.r | 121 -- .../reliability analysis_20251028151259.r | 129 -- .../reliability analysis_20251028151305.r | 129 -- .../reliability analysis_20251028151323.r | 129 -- .../reliability analysis_20251028151326.r | 127 -- .../reliability analysis_20251028151330.r | 121 -- .../reliability analysis_20251028151333.r | 121 -- .../reliability analysis_20251028151339.r | 121 -- .../reliability analysis_20251028151358.r | 121 -- .../reliability analysis_20251028151540.r | 123 -- .../reliability analysis_20251028151544.r | 123 -- .../reliability analysis_20251028151808.r | 121 -- .../reliability analysis_20251028151819.r | 121 -- .../reliability analysis_20251028151900.r | 121 -- .../reliability analysis_20251028151917.r | 140 --- .../reliability analysis_20251028151922.r | 140 --- .../reliability analysis_20251028151925.r | 140 --- .../reliability analysis_20251028152017.r | 141 --- .../reliability analysis_20251028152020.r | 141 --- .../reliability analysis_20251028152033.r | 141 --- .../reliability analysis_20251028152127.r | 145 --- .../reliability analysis_20251028152132.r | 148 --- .../reliability analysis_20251028152134.r | 148 --- .../reliability analysis_20251028152156.r | 148 --- .../reliability analysis_20251028161222.r | 140 --- .../reliability analysis_20251028161229.r | 134 -- .../reliability analysis_20251028161236.r | 123 -- .../reliability analysis_20251028161240.r | 94 -- .../reliability analysis_20251028161242.r | 91 -- .../reliability analysis_20251028161247.r | 91 -- .../reliability analysis_20251028161950.r | 91 -- .../reliability analysis_20251028162118.r | 110 -- .../reliability analysis_20251028162136.r | 113 -- .../reliability analysis_20251028162145.r | 113 -- .../reliability analysis_20251028162201.r | 113 -- .../reliability 
analysis_20251028162240.r | 114 -- .../reliability analysis_20251028162243.r | 114 -- .../reliability analysis_20251028162251.r | 114 -- .../reliability analysis_20251028162450.r | 175 --- .../reliability analysis_20251028162455.r | 175 --- .../reliability analysis_20251028162504.r | 175 --- .../reliability analysis_20251028162602.r | 175 --- .../reliability analysis_20251028162606.r | 175 --- .../reliability analysis_20251028162608.r | 175 --- .../reliability analysis_20251028162820.r | 134 -- .../reliability analysis_20251028162852.r | 134 -- .../reliability analysis_20251028162935.r | 142 --- .../reliability analysis_20251028162951.r | 155 --- .../reliability analysis_20251028162955.r | 155 --- .../reliability analysis_20251028163006.r | 155 --- .../reliability analysis_20251028163158.r | 158 --- .../reliability analysis_20251028163211.r | 164 --- .../reliability analysis_20251028163215.r | 164 --- .../reliability analysis_20251028163222.r | 164 --- .../reliability analysis_20251028163310.r | 165 --- .../reliability analysis_20251028163331.r | 186 --- .../reliability analysis_20251028163339.r | 203 --- .../reliability analysis_20251028163348.r | 203 --- .../reliability analysis_20251028163622.r | 203 --- .../reliability analysis_20251028164143.r | 231 ---- .../reliability analysis_20251028164153.r | 231 ---- .../reliability analysis_20251028164158.r | 231 ---- .../reliability analysis_20251028164324.r | 231 ---- .../reliability analysis_20251028164342.r | 235 ---- .../reliability analysis_20251028164345.r | 239 ---- .../reliability analysis_20251028164350.r | 239 ---- .../reliability analysis_20251028164357.r | 239 ---- .../reliability analysis_20251028164422.r | 243 ---- .../reliability analysis_20251028164426.r | 243 ---- .../reliability analysis_20251028164514.r | 243 ---- .../reliability analysis_20251028164546.r | 239 ---- .../reliability analysis_20251028164551.r | 239 ---- .../reliability analysis_20251028164558.r | 239 ---- .../reliability analysis_20251028164641.r | 244 ---- .../reliability analysis_20251028164650.r | 244 ---- .../reliability analysis_20251028164657.r | 244 ---- .../reliability analysis_20251028164813.r | 248 ---- .../reliability analysis_20251028164819.r | 253 ---- .../reliability analysis_20251028164821.r | 253 ---- .../reliability analysis_20251028165006.r | 253 ---- .../reliability analysis_20251028165109.r | 257 ---- .../reliability analysis_20251028165118.r | 262 ---- .../reliability analysis_20251028165120.r | 262 ---- .../reliability analysis_20251028165128.r | 262 ---- .../reliability analysis_20251028165402.r | 262 ---- .../reliability analysis_20251028165410.r | 262 ---- .../reliability analysis_20251028165435.r | 262 ---- .../reliability analysis_20251028170716.r | 265 ---- .../reliability analysis_20251028170719.r | 274 ----- .../reliability analysis_20251028170725.r | 274 ----- .../reliability analysis_20251028170741.r | 274 ----- .../reliability analysis_20251028170822.r | 280 ----- .../reliability analysis_20251028170831.r | 280 ----- .../reliability analysis_20251028170832.r | 280 ----- .../reliability analysis_20251028170921.r | 289 ----- .../reliability analysis_20251028170923.r | 289 ----- .../reliability analysis_20251028170935.r | 289 ----- .../reliability analysis_20251028171635.r | 289 ----- ...liability_summary_table_20251028144409.csv | 16 - ...liability_summary_table_20251028144438.csv | 16 - ...liability_summary_table_20251028173027.csv | 16 - .history/eohi2/verify_means_20251008115109.R | 68 - ...ixed anova - domain 
means_20250912124308.r | 572 --------- ...ixed anova - domain means_20250912124317.r | 572 --------- ...ixed anova - domain means_20250912124407.r | 572 --------- ...ixed anova - domain means_20250912124620.r | 571 --------- ...ixed anova - domain means_20250912125000.r | 567 --------- ...ixed anova - domain means_20250912125003.r | 567 --------- ...ixed anova - domain means_20250912125007.r | 573 --------- ...ixed anova - domain means_20250912125012.r | 580 --------- ...ixed anova - domain means_20250912125017.r | 580 --------- ...ixed anova - domain means_20250912125031.r | 584 --------- ...ixed anova - domain means_20250912125040.r | 584 --------- ...ixed anova - domain means_20250912125046.r | 584 --------- ...ixed anova - domain means_20250912130804.r | 131 -- ...ixed anova - domain means_20250912130809.r | 124 -- ...ixed anova - domain means_20250912130812.r | 124 -- ...ixed anova - domain means_20250912130822.r | 117 -- ...ixed anova - domain means_20250912130828.r | 117 -- ...ixed anova - domain means_20250912130829.r | 117 -- ...ixed anova - domain means_20250912144754.r | 115 -- ...ixed anova - domain means_20250912152948.r | 272 ---- ...ixed anova - domain means_20250912152953.r | 272 ---- ...ixed anova - domain means_20250912153103.r | 272 ---- .../mixed anova - ind item_20250912123133.r | 0 .../mixed anova - ind item_20250912123134.r | 397 ------ .history/mixed anova_20250912110917.r | 283 ----- .history/mixed anova_20250912110922.r | 283 ----- .history/mixed anova_20250912110938.r | 283 ----- .history/mixed anova_20250912111710.r | 281 ----- .history/mixed anova_20250912111725.r | 324 ----- .history/mixed anova_20250912111750.r | 316 ----- .history/mixed anova_20250912111819.r | 341 ----- .history/mixed anova_20250912111827.r | 341 ----- .history/mixed anova_20250912112005.r | 341 ----- .history/mixed anova_20250912112141.r | 341 ----- .history/mixed anova_20250912112319.r | 347 ------ .history/mixed anova_20250912112327.r | 347 ------ .history/mixed anova_20250912112334.r | 347 ------ .history/mixed anova_20250912112620.r | 349 ------ .history/mixed anova_20250912112624.r | 349 ------ .history/mixed anova_20250912112628.r | 349 ------ .history/mixed anova_20250912112631.r | 349 ------ .history/mixed anova_20250912112632.r | 349 ------ .history/mixed anova_20250912112635.r | 349 ------ .history/mixed anova_20250912112638.r | 349 ------ .history/mixed anova_20250912112641.r | 349 ------ .history/mixed anova_20250912112645.r | 349 ------ .history/mixed anova_20250912112651.r | 349 ------ .history/mixed anova_20250912112700.r | 349 ------ .history/mixed anova_20250912113047.r | 359 ------ .history/mixed anova_20250912113055.r | 359 ------ .history/mixed anova_20250912113119.r | 359 ------ .history/mixed anova_20250912113345.r | 374 ------ .history/mixed anova_20250912113353.r | 374 ------ .history/mixed anova_20250912113448.r | 377 ------ .history/mixed anova_20250912113455.r | 377 ------ .history/mixed anova_20250912113502.r | 377 ------ .history/mixed anova_20250912113654.r | 370 ------ .history/mixed anova_20250912113659.r | 370 ------ .history/mixed anova_20250912113707.r | 370 ------ .history/mixed anova_20250912114051.r | 384 ------ .history/mixed anova_20250912114100.r | 384 ------ .history/mixed anova_20250912114105.r | 384 ------ .history/mixed anova_20250912114140.r | 384 ------ .history/mixed anova_20250912114154.r | 384 ------ .history/mixed anova_20250912114156.r | 384 ------ .history/mixed anova_20250912114707.r | 398 ------ .history/mixed anova_20250912114714.r | 
398 ------ .history/mixed anova_20250912124604.r | 398 ------ 1401 files changed, 1 insertion(+), 554716 deletions(-) create mode 100644 .gitignore delete mode 100644 .history/eohi1/BS_means_20250922131352.vb delete mode 100644 .history/eohi1/BS_means_20250922131356.vb delete mode 100644 .history/eohi1/BS_means_20250922131406.vb delete mode 100644 .history/eohi1/DataP 01 - domain mean totals _20251007232436.r delete mode 100644 .history/eohi1/DataP 01 - domain mean totals _20251007232440.r delete mode 100644 .history/eohi1/DataP 01 - domain mean totals _20251007232447.r delete mode 100644 .history/eohi1/DataP 01 - domain mean totals _20251007232757.r delete mode 100644 .history/eohi1/RMD exp1 - mixed anova domain means_20251004193424.rmd delete mode 100644 .history/eohi1/RMD exp1 - mixed anova domain means_20251004193431.rmd delete mode 100644 .history/eohi1/RMD exp1 - mixed anova domain means_20251004193438.rmd delete mode 100644 .history/eohi1/RMD exp1 - mixed anova domain means_20251004193443.rmd delete mode 100644 .history/eohi1/RMD exp1 - mixed anova domain means_20251004193454.rmd delete mode 100644 .history/eohi1/RMD exp1 - mixed anova domain means_20251004193500.rmd delete mode 100644 .history/eohi1/RMD exp1 - mixed anova domain means_20251004193511.rmd delete mode 100644 .history/eohi1/RMD exp1 - mixed anova domain means_20251004193523.rmd delete mode 100644 .history/eohi1/RMD exp1 - mixed anova domain means_20251004193543.rmd delete mode 100644 .history/eohi1/RMD exp1 - mixed anova domain means_20251004193554.rmd delete mode 100644 .history/eohi1/RMD exp1 - mixed anova domain means_20251004193559.rmd delete mode 100644 .history/eohi1/RMD exp1 - mixed anova domain means_20251004193613.rmd delete mode 100644 .history/eohi1/RMD exp1 - mixed anova domain means_20251004193658.rmd delete mode 100644 .history/eohi1/Untitled-1_20251020173338.r delete mode 100644 .history/eohi1/Untitled-1_20251020173353.r delete mode 100644 .history/eohi1/assumption_checks_before_cronbach_20250917154857.r delete mode 100644 .history/eohi1/assumption_checks_before_cronbach_20250917154901.r delete mode 100644 .history/eohi1/assumption_checks_before_cronbach_20250918115459.r delete mode 100644 .history/eohi1/brierVARS_20250922124859.vb delete mode 100644 .history/eohi1/brierVARS_20250922124900.vb delete mode 100644 .history/eohi1/brierVARS_20250922125338.vb delete mode 100644 .history/eohi1/brierVARS_20250922125353.vb delete mode 100644 .history/eohi1/brierVARS_20250922125648.vb delete mode 100644 .history/eohi1/brierVARS_20250922125759.vb delete mode 100644 .history/eohi1/brierVARS_20250922125807.vb delete mode 100644 .history/eohi1/brierVARS_20250922125821.vb delete mode 100644 .history/eohi1/brierVARS_20250922130226.vb delete mode 100644 .history/eohi1/brierVARS_20250922130242.vb delete mode 100644 .history/eohi1/brierVARS_20250922130423.vb delete mode 100644 .history/eohi1/brierVARS_20250922130433.vb delete mode 100644 .history/eohi1/brierVARS_20250922130435.vb delete mode 100644 .history/eohi1/brierVARS_20250922131020.vb delete mode 100644 .history/eohi1/correlation matrix_20251027115053.r delete mode 100644 .history/eohi1/correlation matrix_20251027115054.r delete mode 100644 .history/eohi1/correlation matrix_20251027115900.r delete mode 100644 .history/eohi1/correlation matrix_20251027115902.r delete mode 100644 .history/eohi1/correlation matrix_20251027115905.r delete mode 100644 .history/eohi1/correlation matrix_20251027120022.r delete mode 100644 .history/eohi1/correlation matrix_20251027120056.r 
delete mode 100644 .history/eohi1/correlation matrix_20251027120100.r delete mode 100644 .history/eohi1/correlation matrix_20251027120122.r delete mode 100644 .history/eohi1/correlation matrix_20251027120345.r delete mode 100644 .history/eohi1/correlation matrix_20251027120348.r delete mode 100644 .history/eohi1/correlation matrix_20251027120351.r delete mode 100644 .history/eohi1/correlation matrix_20251027120448.r delete mode 100644 .history/eohi1/correlation matrix_20251027120450.r delete mode 100644 .history/eohi1/correlation matrix_20251027120505.r delete mode 100644 .history/eohi1/correlation matrix_20251027120720.r delete mode 100644 .history/eohi1/correlation matrix_20251027120722.r delete mode 100644 .history/eohi1/correlation matrix_20251027120752.r delete mode 100644 .history/eohi1/correlation matrix_20251027120754.r delete mode 100644 .history/eohi1/correlation matrix_20251027120804.r delete mode 100644 .history/eohi1/correlation matrix_20251027120919.r delete mode 100644 .history/eohi1/correlation matrix_20251027120930.r delete mode 100644 .history/eohi1/correlation matrix_20251027120933.r delete mode 100644 .history/eohi1/correlation matrix_20251027120955.r delete mode 100644 .history/eohi1/correlation matrix_20251027120958.r delete mode 100644 .history/eohi1/correlation matrix_20251027121016.r delete mode 100644 .history/eohi1/correlation matrix_20251027134544.r delete mode 100644 .history/eohi1/correlation matrix_20251029115844.r delete mode 100644 .history/eohi1/correlations - brier score x eohi and cal_20250922132833.r delete mode 100644 .history/eohi1/correlations - brier score x eohi and cal_20250922132837.r delete mode 100644 .history/eohi1/correlations - brier score x eohi and cal_20250922132923.r delete mode 100644 .history/eohi1/correlations - brier score x eohi and cal_20250922132932.r delete mode 100644 .history/eohi1/correlations - brier score x eohi and cal_20250922132938.r delete mode 100644 .history/eohi1/correlations - brier score x eohi and cal_20250922133020.r delete mode 100644 .history/eohi1/correlations - brier score x eohi and cal_20250922133026.r delete mode 100644 .history/eohi1/correlations - brier score x eohi and cal_20250922133028.r delete mode 100644 .history/eohi1/correlations - brier score x eohi and cal_20250922133336.r delete mode 100644 .history/eohi1/correlations - brier score x eohi and cal_20250922133339.r delete mode 100644 .history/eohi1/correlations - brier score x eohi and cal_20250922133341.r delete mode 100644 .history/eohi1/correlations - brier score x eohi and cal_20250922134044.r delete mode 100644 .history/eohi1/correlations - brier score x eohi and cal_20250922135638.r delete mode 100644 .history/eohi1/correlations - brier score x eohi and cal_20251008160336.r delete mode 100644 .history/eohi1/correlations - brier score x eohi and cal_20251008160455.r delete mode 100644 .history/eohi1/correlations - brier score x eohi and cal_20251008160500.r delete mode 100644 .history/eohi1/correlations - brier score x eohi and cal_20251008160505.r delete mode 100644 .history/eohi1/correlations - brier score x eohi and cal_20251008160539.r delete mode 100644 .history/eohi1/correlations - brier score x eohi and cal_20251008160544.r delete mode 100644 .history/eohi1/correlations - brier score x eohi and cal_20251008160550.r delete mode 100644 .history/eohi1/correlations - brier score x eohi and cal_20251008162952.r delete mode 100644 .history/eohi1/correlations - brier score x eohi and cal_20251008185636.r delete mode 100644 
.history/eohi1/correlations - brier score x eohi and cal_20251008185645.r delete mode 100644 .history/eohi1/correlations - brier score x eohi and cal_20251008185658.r delete mode 100644 .history/eohi1/correlations - eohi x calibration_20250915134710.r delete mode 100644 .history/eohi1/correlations - eohi x calibration_20250915134720.r delete mode 100644 .history/eohi1/correlations - eohi x calibration_20250915134733.r delete mode 100644 .history/eohi1/correlations - eohi x calibration_20250915134827.r delete mode 100644 .history/eohi1/correlations - eohi x calibration_20250915134916.r delete mode 100644 .history/eohi1/correlations - eohi x calibration_20250915142201.r delete mode 100644 .history/eohi1/correlations - eohi x calibration_20250915142559.r delete mode 100644 .history/eohi1/correlations - eohi x calibration_20250915142603.r delete mode 100644 .history/eohi1/correlations - eohi x calibration_20250916091406.r delete mode 100644 .history/eohi1/correlations - eohi x calibration_20250916091630.r delete mode 100644 .history/eohi1/correlations - eohi x calibration_20250916092909.r delete mode 100644 .history/eohi1/correlations - eohi x calibration_20250916092913.r delete mode 100644 .history/eohi1/correlations - eohi x calibration_20250916092918.r delete mode 100644 .history/eohi1/correlations - eohi x calibration_20250916092925.r delete mode 100644 .history/eohi1/correlations - eohi x calibration_20250916092959.r delete mode 100644 .history/eohi1/correlations - eohi x calibration_20250916093530.r delete mode 100644 .history/eohi1/correlations - eohi x calibration_20250916095303.r delete mode 100644 .history/eohi1/correlations - eohi x calibration_20250916112614.r delete mode 100644 .history/eohi1/correlations - eohi x calibration_20250916112923.r delete mode 100644 .history/eohi1/correlations - eohi x calibration_20250916113008.r delete mode 100644 .history/eohi1/correlations - eohi x calibration_20250929153154.r delete mode 100644 .history/eohi1/correlations - scales_20251007231341.r delete mode 100644 .history/eohi1/correlations - scales_20251007232519.r delete mode 100644 .history/eohi1/correlations - scales_20251007232812.r delete mode 100644 .history/eohi1/correlations - scales_20251007233020.r delete mode 100644 .history/eohi1/correlations - scales_20251007233026.r delete mode 100644 .history/eohi1/correlations - scales_20251007233050.r delete mode 100644 .history/eohi1/correlations - scales_20251007233059.r delete mode 100644 .history/eohi1/correlations - scales_20251007233106.r delete mode 100644 .history/eohi1/correlations - scales_20251007233151.r delete mode 100644 .history/eohi1/correlations - scales_20251007233159.r delete mode 100644 .history/eohi1/correlations - scales_20251007233210.r delete mode 100644 .history/eohi1/correlations - scales_20251007233219.r delete mode 100644 .history/eohi1/correlations - scales_20251007233224.r delete mode 100644 .history/eohi1/correlations - scales_20251007233307.r delete mode 100644 .history/eohi1/correlations - scales_20251007233311.r delete mode 100644 .history/eohi1/correlations - scales_20251007233319.r delete mode 100644 .history/eohi1/correlations - scales_20251007233424.r delete mode 100644 .history/eohi1/correlations - scales_20251007233541.r delete mode 100644 .history/eohi1/correlations - scales_20251007233548.r delete mode 100644 .history/eohi1/correlations - scales_20251007233652.r delete mode 100644 .history/eohi1/correlations - scales_20251007233731.r delete mode 100644 .history/eohi1/correlations - 
scales_20251007233734.r delete mode 100644 .history/eohi1/correlations - scales_20251007233736.r delete mode 100644 .history/eohi1/correlations - scales_20251007233739.r delete mode 100644 .history/eohi1/correlations - scales_20251007233744.r delete mode 100644 .history/eohi1/correlations - scales_20251007233749.r delete mode 100644 .history/eohi1/correlations - scales_20251007234152.r delete mode 100644 .history/eohi1/correlations - scales_20251007234202.r delete mode 100644 .history/eohi1/correlations - scales_20251007234339.r delete mode 100644 .history/eohi1/correlations - scales_20251007234406.r delete mode 100644 .history/eohi1/correlations - scales_20251008001047.r delete mode 100644 .history/eohi1/correlations - scales_20251008001054.r delete mode 100644 .history/eohi1/correlations - scales_20251008001102.r delete mode 100644 .history/eohi1/correlations - scales_20251008005438.r delete mode 100644 .history/eohi1/correlations - scales_20251008154804.r delete mode 100644 .history/eohi1/correlations - scales_20251008154934.r delete mode 100644 .history/eohi1/correlations - scales_20251008154940.r delete mode 100644 .history/eohi1/correlations - scales_20251008154947.r delete mode 100644 .history/eohi1/correlations - scales_20251008154952.r delete mode 100644 .history/eohi1/correlations - scales_20251008155000.r delete mode 100644 .history/eohi1/correlations - scales_20251008155103.r delete mode 100644 .history/eohi1/correlations - scales_20251008155139.r delete mode 100644 .history/eohi1/correlations - scales_20251008155218.r delete mode 100644 .history/eohi1/correlations - scales_20251008155221.r delete mode 100644 .history/eohi1/correlations - scales_20251008155223.r delete mode 100644 .history/eohi1/correlations - scales_20251008155225.r delete mode 100644 .history/eohi1/correlations - scales_20251008171710.r delete mode 100644 .history/eohi1/dataP 02 - cor means average over time frames_20251008000048.r delete mode 100644 .history/eohi1/dataP 02 - cor means average over time frames_20251008000055.r delete mode 100644 .history/eohi1/dataP 02 - cor means average over time frames_20251008000150.r delete mode 100644 .history/eohi1/dataP 02 - cor means average over time frames_20251008000158.r delete mode 100644 .history/eohi1/dataP 02 - cor means average over time frames_20251008000203.r delete mode 100644 .history/eohi1/dataP 02 - cor means average over time frames_20251008000212.r delete mode 100644 .history/eohi1/dataP 02 - cor means average over time frames_20251008000542.r delete mode 100644 .history/eohi1/dataP 02 - cor means average over time frames_20251008000547.r delete mode 100644 .history/eohi1/dataP 02 - cor means average over time frames_20251008000552.r delete mode 100644 .history/eohi1/dataP 02 - cor means average over time frames_20251008000600.r delete mode 100644 .history/eohi1/dataP 02 - cor means average over time frames_20251008000956.r delete mode 100644 .history/eohi1/datap 03 - CORRECT domain specific EHI vars_20251008152448.r delete mode 100644 .history/eohi1/datap 03 - CORRECT domain specific EHI vars_20251008152542.r delete mode 100644 .history/eohi1/datap 03 - CORRECT domain specific EHI vars_20251008153309.r delete mode 100644 .history/eohi1/datap 03 - CORRECT domain specific EHI vars_20251008153323.r delete mode 100644 .history/eohi1/datap 03 - CORRECT domain specific EHI vars_20251008153333.r delete mode 100644 .history/eohi1/datap 03 - CORRECT domain specific EHI vars_20251008153816.r delete mode 100644 .history/eohi1/datap 03 - CORRECT domain specific 
EHI vars_20251008162958.r delete mode 100644 .history/eohi1/datap 04 - CORRECT ehi var means_20251008153939.txt delete mode 100644 .history/eohi1/datap 04 - CORRECT ehi var means_20251008153946.txt delete mode 100644 .history/eohi1/datap 04 - CORRECT ehi var means_20251008153958.r delete mode 100644 .history/eohi1/datap 04 - CORRECT ehi var means_20251008153959.r delete mode 100644 .history/eohi1/datap 04 - CORRECT ehi var means_20251008154513.r delete mode 100644 .history/eohi1/datap 04 - CORRECT ehi var means_20251008154517.r delete mode 100644 .history/eohi1/datap 04 - CORRECT ehi var means_20251008154531.r delete mode 100644 .history/eohi1/datap 04 - CORRECT ehi var means_20251008162957.r delete mode 100644 .history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113023.r delete mode 100644 .history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113024.r delete mode 100644 .history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113338.r delete mode 100644 .history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113347.r delete mode 100644 .history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113517.r delete mode 100644 .history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113654.r delete mode 100644 .history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113658.r delete mode 100644 .history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113726.r delete mode 100644 .history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113730.r delete mode 100644 .history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113735.r delete mode 100644 .history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113937.r delete mode 100644 .history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113941.r delete mode 100644 .history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113947.r delete mode 100644 .history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027114131.r delete mode 100644 .history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115643.r delete mode 100644 .history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115647.r delete mode 100644 .history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115649.r delete mode 100644 .history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115718.r delete mode 100644 .history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115829.r delete mode 100644 .history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115832.r delete mode 100644 .history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115834.r delete mode 100644 .history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115845.r delete mode 100644 .history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027134607.r delete mode 100644 .history/eohi1/descriptives - gen knowledge questions_20250918115552.r delete mode 100644 .history/eohi1/descriptives - gen knowledge questions_20250918115553.r delete mode 100644 .history/eohi1/descriptives - gen knowledge questions_20250918115703.r delete mode 100644 .history/eohi1/descriptives - gen knowledge questions_20250918120055.r delete mode 100644 .history/eohi1/descriptives - gen knowledge questions_20250918120100.r delete mode 100644 .history/eohi1/descriptives - gen knowledge questions_20250918120102.r delete mode 100644 .history/eohi1/descriptives - gen knowledge 
questions_20250918120515.r delete mode 100644 .history/eohi1/descriptives - gen knowledge questions_20250918120600.r delete mode 100644 .history/eohi1/descriptives - gen knowledge questions_20250918120656.r delete mode 100644 .history/eohi1/descriptives - gen knowledge questions_20250918120727.r delete mode 100644 .history/eohi1/descriptives - gen knowledge questions_20250918122358.r delete mode 100644 .history/eohi1/descriptives - gen knowledge questions_20250918122401.r delete mode 100644 .history/eohi1/descriptives - gen knowledge questions_20250918122413.r delete mode 100644 .history/eohi1/descriptives - gen knowledge questions_20250918122634.r delete mode 100644 .history/eohi1/descriptives - gen knowledge questions_20250918122637.r delete mode 100644 .history/eohi1/descriptives - gen knowledge questions_20250918122638.r delete mode 100644 .history/eohi1/descriptives - gen knowledge questions_20250918123114.r delete mode 100644 .history/eohi1/descriptives - gen knowledge questions_20250918123117.r delete mode 100644 .history/eohi1/descriptives - gen knowledge questions_20250918123133.r delete mode 100644 .history/eohi1/descriptives - gen knowledge questions_20250918124915.r delete mode 100644 .history/eohi1/descriptives - gen knowledge questions_20250918145603.r delete mode 100644 .history/eohi1/descriptives - gen knowledge questions_20250918145606.r delete mode 100644 .history/eohi1/descriptives - gen knowledge questions_20250918145728.r delete mode 100644 .history/eohi1/descriptives - gen knowledge questions_20250918155602.r delete mode 100644 .history/eohi1/descriptives - gen knowledge questions_20250918155604.r delete mode 100644 .history/eohi1/descriptives - gen knowledge questions_20250918155605.r delete mode 100644 .history/eohi1/descriptives - gen knowledge questions_20250918155636.r delete mode 100644 .history/eohi1/e1 - reliability ehi_20251029093310.r delete mode 100644 .history/eohi1/e1 - reliability ehi_20251029093311.r delete mode 100644 .history/eohi1/e1 - reliability ehi_20251029093545.r delete mode 100644 .history/eohi1/e1 - reliability ehi_20251029094220.r delete mode 100644 .history/eohi1/e1 - reliability ehi_20251029094235.r delete mode 100644 .history/eohi1/e1 - reliability ehi_20251029094336.r delete mode 100644 .history/eohi1/e1 - reliability ehi_20251029094344.r delete mode 100644 .history/eohi1/e1 - reliability ehi_20251029094408.r delete mode 100644 .history/eohi1/minimal_test_20251004194428.rmd delete mode 100644 .history/eohi1/minimal_test_20251004194431.rmd delete mode 100644 .history/eohi1/minimal_test_20251004194608.rmd delete mode 100644 .history/eohi1/minimal_test_20251004194638.rmd delete mode 100644 .history/eohi1/mixed anova - DGEN_20251003132154.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251003132235.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251003132528.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251003132534.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251003132751.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006125959.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006150203.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006150325.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006150338.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006150343.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006150351.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006150356.r delete mode 100644 .history/eohi1/mixed anova - 
DGEN_20251006150414.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006150433.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006150451.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006150515.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006151447.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006151454.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006151507.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006151514.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006151518.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006152215.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006152226.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006152312.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006152851.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006152926.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006152934.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006153823.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006153831.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006153845.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006154345.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006154352.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006154403.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006155006.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006155018.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006155049.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006155243.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006155250.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006155253.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006155346.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006155349.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006155353.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006155709.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006172448.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006172451.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006172503.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006172600.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006172604.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006172636.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006181939.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006191125.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006200121.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006200138.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006200320.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006201108.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251006201257.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251007162848.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251007162926.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251007162939.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251007162949.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251007162951.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251007180951.r delete mode 100644 .history/eohi1/mixed anova - DGEN_20251010152730.r delete mode 100644 
.history/eohi1/mixed anova - DGEN_20251010165036.r delete mode 100644 .history/eohi1/mixed anova - domain means SIMPLE_20251003125947.r delete mode 100644 .history/eohi1/mixed anova - domain means SIMPLE_20251003130013.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912153102.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912153232.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912153241.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912153323.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912153326.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912153327.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912153352.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912153354.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912153402.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912154157.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912154200.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912154202.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912154303.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912154304.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912154305.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912154904.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912154909.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912155010.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912155017.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912155100.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912155158.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912155205.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912155217.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912155223.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912155224.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912155246.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912155247.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912155253.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912155329.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912155333.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912155338.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912155352.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912155830.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912155832.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912155837.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912155915.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912155919.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912155922.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912155944.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912155948.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912155953.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912160029.r delete mode 100644 .history/eohi1/mixed anova - domain 
means_20250912160033.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912160053.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912160118.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912160125.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912160131.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912160133.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912160206.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912160213.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912160217.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912160416.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912160420.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912160432.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912160551.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912160604.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912160607.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912160610.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912160644.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912160648.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912160652.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912160655.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912161036.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912161041.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912161046.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912161558.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912161610.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912161617.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912161619.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912161932.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912161937.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912162001.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912162009.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912162037.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912162116.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912162139.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912162145.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912162151.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912162247.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912162250.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912162255.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912162832.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912162837.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912162851.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912162858.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912163014.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250912164147.r delete mode 100644 .history/eohi1/mixed anova - domain means_20250915110342.r delete mode 100644 
.history/eohi1/mixed anova - domain means_20250915110402.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915110435.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915110448.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915110457.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915110504.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915110508.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915110512.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915110518.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915110521.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915110535.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915110539.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915110547.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915111101.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915111110.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915111114.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915111252.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915111308.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915111314.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915112435.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915112521.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915112528.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915112533.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915112607.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915112612.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915112617.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915113543.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915113742.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915113749.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915114518.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915114619.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915114727.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915114729.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915114731.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915114817.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915115032.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915120001.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915120010.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915120029.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915120039.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915120050.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915120848.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915120855.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915120900.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915121033.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915121043.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915121049.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915121051.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915121136.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915121141.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915121152.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915121201.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915121212.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915121317.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915122220.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915122231.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915122243.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915122255.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915122301.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915122305.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915122307.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915122312.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915122315.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915122318.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915122328.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915122355.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915122357.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915122438.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915122442.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915122447.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915122455.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915122501.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915122534.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915122540.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915122551.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915122806.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915122811.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915122819.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915122909.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915122915.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915122919.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915132646.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915133137.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915133143.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915133150.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915133657.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915133739.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250915133916.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250916095306.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250916100623.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250916100629.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250916100739.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250916104746.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250916104757.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250916104830.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20250917120959.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251001163157.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251001171405.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251001171456.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251001171513.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251001171529.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251001171548.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251001171605.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251001171616.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251001171654.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251001171824.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251001171840.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251001172039.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251001174736.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251001174748.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251001174749.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003122510.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003122522.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003122534.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003122601.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003122606.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003122621.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003122630.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003122640.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003123628.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003123637.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003124045.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003124352.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003124400.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003124418.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003124429.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003124453.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003124504.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003124512.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003124527.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003124539.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003124544.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003124545.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003124614.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003124632.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003124639.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003124643.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003124942.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003124948.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003124952.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003125302.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003125335.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003130013.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003130111.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003134651.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003134713.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003134732.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003134803.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003134813.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003134814.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003134857.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003135506.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003135510.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003135514.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003140001.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003140127.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003140135.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003140812.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003140821.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003141902.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003141926.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003142537.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003142630.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003142635.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003142645.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003142752.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003142806.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003142812.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251003143103.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251004194541.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006125951.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006131233.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006131245.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006142529.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006142637.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006142646.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006142658.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006142703.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006145249.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006152026.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006152038.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006152054.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006153456.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006155838.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006155859.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006155915.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006155938.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006162736.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006162743.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006162748.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006162833.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006162839.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006162940.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006181905.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006181908.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006181914.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006181923.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006181934.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006181935.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006183415.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006223538.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006225002.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006225006.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006225025.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006225127.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006225134.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006225138.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006225148.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251006225428.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251007155303.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251007162057.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251007162101.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251007162111.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251007162115.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251007162122.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251007162130.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251007162138.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251007162843.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251007182023.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251007182033.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251007182343.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251007182949.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251007182951.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251007182953.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251007183630.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251007183634.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251007183638.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251007183824.r
delete mode 100644 .history/eohi1/mixed anova - domain means_20251010145938.r
delete mode 100644 .history/eohi1/mixed anova - personality_20250916123628.r
delete mode 100644 .history/eohi1/mixed anova - personality_20250916123639.r
delete mode 100644 .history/eohi1/mixed anova - personality_20250916123640.r
delete mode 100644 .history/eohi1/mixed anova - personality_20250916125522.r
delete mode 100644 .history/eohi1/mixed anova - personality_20250916130413.r
delete mode 100644 .history/eohi1/mixed anova - personality_20250917121011.r
delete mode 100644 .history/eohi1/mixed anova - preferences_20250916113624.r
delete mode 100644 .history/eohi1/mixed anova - preferences_20250916113646.r
delete mode 100644 .history/eohi1/mixed anova - preferences_20250916113702.r
delete mode 100644 .history/eohi1/mixed anova - preferences_20250916113720.r
delete mode 100644 .history/eohi1/mixed anova - preferences_20250916113743.r
delete mode 100644 .history/eohi1/mixed anova - preferences_20250916113752.r
delete mode 100644 .history/eohi1/mixed anova - preferences_20250916113803.r
delete mode 100644 .history/eohi1/mixed anova - preferences_20250916113806.r
delete mode 100644 .history/eohi1/mixed anova - preferences_20250916120325.r
delete mode 100644 .history/eohi1/mixed anova - preferences_20250916120409.r
delete mode 100644 .history/eohi1/mixed anova - preferences_20250916120419.r
delete mode 100644 .history/eohi1/mixed anova - preferences_20250916120427.r
delete mode 100644 .history/eohi1/mixed anova - preferences_20250916120436.r
delete mode 100644 .history/eohi1/mixed anova - preferences_20250916120437.r
delete mode 100644 .history/eohi1/mixed anova - preferences_20250916120515.r
delete mode 100644 .history/eohi1/mixed anova - preferences_20250916120522.r
delete mode 100644 .history/eohi1/mixed anova - preferences_20250916120534.r
delete mode 100644 .history/eohi1/mixed anova - values_20250916125551.r
delete mode 100644 .history/eohi1/mixed anova - values_20250916125552.r
delete mode 100644 .history/eohi1/mixed anova - values_20250916125857.r
delete mode 100644 .history/eohi1/mixed anova - values_20250916125907.r
delete mode 100644 .history/eohi1/mixed anova - values_20250916125940.r
delete mode 100644 .history/eohi1/readme_domain_mixed_anova_20251002121520.txt
delete mode 100644 .history/eohi1/readme_domain_mixed_anova_20251002121755.txt
delete mode 100644 .history/eohi1/regression e1 - edu x ehi_20251020100405.r
delete mode 100644 .history/eohi1/regression e1 - edu x ehi_20251020100412.r
delete mode 100644 .history/eohi1/regression e1 - edu x ehi_20251020100550.r
delete mode 100644 .history/eohi1/regression e1 - edu x ehi_20251020100952.r
delete mode 100644 .history/eohi1/regression e1 - edu x ehi_20251020101841.r
delete mode 100644 .history/eohi1/regression e1 - edu x ehi_20251020103253.r
delete mode 100644 .history/eohi1/regression e1 - edu x ehi_20251020103850.r
delete mode 100644 .history/eohi1/regression e1 - edu x ehi_20251020113946.r
delete mode 100644 .history/eohi1/regression e1 - edu x ehi_20251020133831.r
delete mode 100644 .history/eohi1/regression e1 - edu x ehi_20251020134231.r
delete mode 100644 .history/eohi1/regression e1 - edu x ehi_20251020173226.r
delete mode 100644 .history/eohi1/regression e1 - edu x ehi_20251020173252.r
delete mode 100644 .history/eohi1/regression e1 - edu x ehi_20251023140538.r
delete mode 100644 .history/eohi1/regression e1 - ehi x sex x age_20251020173352.r
delete mode 100644 .history/eohi1/regression e1 - ehi x sex x age_20251020173438.r
delete mode 100644 .history/eohi1/regression e1 - ehi x sex x age_20251020174241.r
delete mode 100644 .history/eohi1/regression e1 - ehi x sex x age_20251020174522.r
delete mode 100644 .history/eohi1/regression e1 - ehi x sex x age_20251020175347.r
delete mode 100644 .history/eohi1/regression e1 - ehi x sex x age_20251020180330.r
delete mode 100644 .history/eohi1/regression e1 - ehi x sex x age_20251021102925.r
delete mode 100644 .history/eohi1/regression e1 - ehi x sex x age_20251021104421.r
delete mode 100644 .history/eohi1/regression e1 - ehi x sex x age_20251021111526.r
delete mode 100644 .history/eohi1/regression e1 - ehi x sex x age_20251021120315.r
delete mode 100644 .history/eohi1/regression e1 - ehi x sex x age_20251023105759.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251015154931.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016134509.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016142437.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016142502.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016142529.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016142552.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016142605.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016142612.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016142849.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016142956.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016143059.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016143109.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016143341.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016143415.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016143441.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016144540.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016144554.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016144652.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016145531.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016145539.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016145845.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016150135.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016150355.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016150436.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016150455.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016150502.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016150507.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016150527.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016150545.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016150558.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016154019.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016154024.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016154032.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016154037.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016154047.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016154106.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016154110.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016154202.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016154210.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016154217.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016154250.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016154501.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016154514.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016154524.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016154558.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016154609.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016154622.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016154628.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016154636.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016154647.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016154910.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016154925.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016154946.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016155002.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016155036.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016155057.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016155747.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016161118.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016161133.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016161154.qmd
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016173803.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016173806.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016173814.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016173817.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174209.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174256.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174304.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174308.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174332.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174341.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174344.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174349.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174357.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174402.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174430.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174433.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174436.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174439.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174444.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174451.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174501.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174512.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174519.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174521.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174524.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174526.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174528.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174532.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174537.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174542.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174545.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174548.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174550.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174557.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016174601.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016175238.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016175247.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016175258.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016175309.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016175319.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016175325.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016175340.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016175550.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016175653.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016175707.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016175808.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016175912.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016175924.r
delete mode 100644 .history/eohi1/regressions e1 - assumptions_20251016175945.r
delete mode 100644 .history/eohi1/reliability_analysis_cronbach_alpha_20250917154720.r
delete mode 100644 .history/eohi1/reliability_analysis_cronbach_alpha_20250917154729.r
delete mode 100644 .history/eohi1/reliability_analysis_cronbach_alpha_20250918120701.r
delete mode 100644 .history/eohi1/test_knit_20251004194422.rmd
delete mode 100644 .history/eohi1/test_knit_20251004194431.rmd
delete mode 100644 .history/eohi1/test_knit_20251004194642.rmd
delete mode 100644 .history/eohi2/README_Variable_Creation_20251001133606.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251001133614.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251001133615.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251001133634.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251001154337.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251001154405.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251001154412.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251001154419.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251001154424.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251001154430.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251001154444.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251001155104.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251001155126.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251008114335.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251008114354.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251008114414.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251008114419.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251008114428.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251008114444.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251008114508.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251008114531.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251008171443.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251008171453.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251008171510.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251008171520.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251008171528.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251008171541.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251008171604.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251008171626.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251008171628.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251029133334.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251029133342.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251029133348.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251029133355.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251029133422.txt
delete mode 100644 .history/eohi2/README_Variable_Creation_20251029133433.txt
delete mode 100644 .history/eohi2/RMD - mixed anova DGEN_20251003190744.rmd
delete mode 100644 .history/eohi2/RMD - mixed anova DGEN_20251006125956.rmd
delete mode 100644 .history/eohi2/correlation matrix 2 - std ehi_20251029124228.r
delete mode 100644 .history/eohi2/correlation matrix 2 - std ehi_20251029124229.r
delete mode 100644 .history/eohi2/correlation matrix 2 - std ehi_20251029124329.r
delete mode 100644 .history/eohi2/correlations - domain general vars_20251008122234.r
delete mode 100644 .history/eohi2/correlations - domain general vars_20251008122239.r
delete mode 100644 .history/eohi2/correlations - domain general vars_20251008122254.r
delete mode 100644 .history/eohi2/correlations - domain general vars_20251008122540.r
delete mode 100644 .history/eohi2/correlations - domain general vars_20251008122543.r
delete mode 100644 .history/eohi2/correlations - domain general vars_20251008122553.r
delete mode 100644 .history/eohi2/correlations - domain general vars_20251008122555.r
delete mode 100644 .history/eohi2/correlations - domain specific vars_20251008115022.r
delete mode 100644 .history/eohi2/correlations - domain specific vars_20251008115035.r
delete mode 100644 .history/eohi2/correlations - domain specific vars_20251008115036.r
delete mode 100644 .history/eohi2/correlations - domain specific vars_20251008115149.r
delete mode 100644 .history/eohi2/correlations - domain specific vars_20251008115152.r
delete mode 100644 .history/eohi2/correlations - domain specific vars_20251008115154.r
delete mode 100644 .history/eohi2/correlations - domain specific vars_20251008121216.r
delete mode 100644 .history/eohi2/correlations CORRECT - ehi + DGEN x scales_20251008171931.r
delete mode 100644 .history/eohi2/correlations CORRECT - ehi + DGEN x scales_20251008171942.r
delete mode 100644 .history/eohi2/correlations CORRECT - ehi + DGEN x scales_20251008171945.r
delete mode 100644 .history/eohi2/correlations CORRECT - ehi + DGEN x scales_20251008172011.r
delete mode 100644 .history/eohi2/correlations CORRECT - ehi + DGEN x scales_20251008172056.r
delete mode 100644 .history/eohi2/correlations CORRECT - ehi + DGEN x scales_20251008185510.r
delete mode 100644 .history/eohi2/dataP - DGEN means_20251001122522.r
delete mode 100644 .history/eohi2/dataP - DGEN means_20251001122534.r
delete mode 100644 .history/eohi2/dataP - DGEN means_20251001122539.r
delete mode 100644 .history/eohi2/dataP - DGEN means_20251001124312.r
delete mode 100644 .history/eohi2/dataP - DGEN means_20251001130521.r
delete mode 100644 .history/eohi2/dataP - recode DGEN vars_20251001100032.r
delete mode 100644 .history/eohi2/dataP - recode DGEN vars_20251001100044.r
delete mode 100644 .history/eohi2/dataP - recode DGEN vars_20251001100055.r
delete mode 100644 .history/eohi2/dataP - recode DGEN vars_20251001100142.r
delete mode 100644 .history/eohi2/dataP - recode DGEN vars_20251001100344.r
delete mode 100644 .history/eohi2/dataP - recode DGEN vars_20251001100345.r
delete mode 100644 .history/eohi2/dataP - recode DGEN vars_20251001100404.r
delete mode 100644 .history/eohi2/dataP - recode DGEN vars_20251001100408.r
delete mode 100644 .history/eohi2/dataP - recode DGEN vars_20251001100532.r
delete mode 100644 .history/eohi2/dataP - recode DGEN vars_20251001100534.r
delete mode 100644 .history/eohi2/dataP - recode DGEN vars_20251001100537.r
delete mode 100644 .history/eohi2/dataP - recode DGEN vars_20251001100544.r
delete mode 100644 .history/eohi2/dataP - recode DGEN vars_20251001100547.r
delete mode 100644 .history/eohi2/dataP - recode DGEN vars_20251001100624.r
delete mode 100644 .history/eohi2/dataP - recode DGEN vars_20251001105736.r
delete mode 100644 .history/eohi2/dataP - recode present VARS_20251001110617.r
delete mode 100644 .history/eohi2/dataP - recode present VARS_20251001110629.r
delete mode 100644 .history/eohi2/dataP - recode present VARS_20251001110731.r
delete mode 100644 .history/eohi2/dataP - recode present VARS_20251001110926.r
delete mode 100644 .history/eohi2/dataP - recode present VARS_20251001110934.r
delete mode 100644 .history/eohi2/dataP - recode present VARS_20251001110936.r
delete mode 100644 .history/eohi2/dataP - recode present VARS_20251001111008.r
delete mode 100644 .history/eohi2/dataP - recode present VARS_20251001111014.r
delete mode 100644 .history/eohi2/dataP - recode present VARS_20251001111106.r
delete mode 100644 .history/eohi2/dataP - recode present VARS_20251001112101.r
delete mode 100644 .history/eohi2/dataP - recode scales VARS_20251001113407.r
delete mode 100644 .history/eohi2/dataP - recode scales VARS_20251001113436.r
delete mode 100644 .history/eohi2/dataP - recode scales VARS_20251001114903.r
delete mode 100644 .history/eohi2/dataP - recode scales VARS_20251001114914.r
delete mode 100644 .history/eohi2/dataP - recode scales VARS_20251001115233.r
delete mode 100644 .history/eohi2/dataP - recode scales VARS_20251001120138.r
delete mode 100644 .history/eohi2/dataP - recode scales VARS_20251001120154.r
delete mode 100644 .history/eohi2/dataP - recode scales VARS_20251001120204.r
delete mode 100644 .history/eohi2/dataP - recode scales VARS_20251001120217.r
delete mode 100644 .history/eohi2/dataP - recode scales VARS_20251001120228.r
delete mode 100644 .history/eohi2/dataP - recode scales VARS_20251001121501.r
delete mode 100644 .history/eohi2/dataP - time interval differences_20251001130451.r
delete mode 100644 .history/eohi2/dataP - time interval differences_20251001130503.r
delete mode 100644 .history/eohi2/dataP - time interval differences_20251001130516.r
delete mode 100644 .history/eohi2/dataP - time interval differences_20251001130613.r
delete mode 100644 .history/eohi2/dataP - time interval differences_20251001130619.r
delete mode 100644 .history/eohi2/dataP - time interval differences_20251001130649.r
delete mode 100644 .history/eohi2/dataP - time interval differences_20251001131102.r
delete mode 100644 .history/eohi2/dataP - time interval differences_20251001131108.r
delete mode 100644 .history/eohi2/dataP - time interval differences_20251001131115.r
delete mode 100644 .history/eohi2/dataP - time interval differences_20251001131121.r
delete mode 100644 .history/eohi2/dataP - time interval differences_20251001131239.r
delete mode 100644 .history/eohi2/dataP - time interval differences_20251001131304.r
delete mode 100644 .history/eohi2/dataP - time interval differences_20251001131331.r
delete mode 100644 .history/eohi2/dataP - time interval differences_20251001131423.r
delete mode 100644 .history/eohi2/dataP 06 - time interval differences_20251001131422.r
delete mode 100644 .history/eohi2/dataP 06 - time interval differences_20251001132410.r
delete mode 100644 .history/eohi2/dataP 06 - time interval differences_20251001132459.r
delete mode 100644 .history/eohi2/dataP 06 - time interval differences_20251001132504.r
delete mode 100644 .history/eohi2/dataP 06 - time interval differences_20251001132511.r
delete mode 100644 .history/eohi2/dataP 06 - time interval differences_20251001132516.r
delete mode 100644 .history/eohi2/dataP 06 - time interval differences_20251001132521.r
delete mode 100644 .history/eohi2/dataP 06 - time interval differences_20251001132530.r
delete mode 100644 .history/eohi2/dataP 06 - time interval differences_20251001132540.r
delete mode 100644 .history/eohi2/dataP 06 - time interval differences_20251001132559.r
delete mode 100644 .history/eohi2/dataP 06 - time interval differences_20251001132623.r
delete mode 100644 .history/eohi2/dataP 06 - time interval differences_20251001132907.r
delete mode 100644 .history/eohi2/dataP 07 - domain means_20251001152954.r
delete mode 100644 .history/eohi2/dataP 07 - domain means_20251001153004.r
delete mode 100644 .history/eohi2/dataP 07 - domain means_20251001154326.r
delete mode 100644 .history/eohi2/dataP 07 - domain means_20251001154444.r
delete mode 100644 .history/eohi2/dataP 07 - domain means_20251001155057.r
delete mode 100644 .history/eohi2/dataP 07 - domain means_20251001162547.r
delete mode 100644 .history/eohi2/dataP 07 - domain means_20251001163148.r
delete mode 100644 .history/eohi2/dataP 07 - domain means_20251008193329.r
delete mode 100644 .history/eohi2/dataP 07 - domain means_20251008193335.r
delete mode 100644 .history/eohi2/dataP 07 - domain means_20251008193341.r
delete mode 100644 .history/eohi2/dataP 07 - domain means_20251008193347.r
delete mode 100644 .history/eohi2/dataP 07 - domain means_20251008193352.r
delete mode 100644 .history/eohi2/dataP 07 - domain means_20251008193557.r
delete mode 100644 .history/eohi2/dataP 07 - domain means_20251008193600.r
delete mode 100644 .history/eohi2/dataP 08 - DGEN 510 vars_20251006194349.r
delete mode 100644 .history/eohi2/dataP 08 - DGEN 510 vars_20251006194411.r
delete mode 100644 .history/eohi2/dataP 08 - DGEN 510 vars_20251006194451.r
delete mode 100644 .history/eohi2/dataP 08 - DGEN 510 vars_20251006195055.r
delete mode 100644 .history/eohi2/dataP 08 - DGEN 510 vars_20251006195109.r
delete mode 100644 .history/eohi2/dataP 08 - DGEN 510 vars_20251006195118.r
delete mode 100644 .history/eohi2/dataP 08 - DGEN 510 vars_20251006195128.r
delete mode 100644 .history/eohi2/dataP 08 - DGEN 510 vars_20251006195318.r
delete mode 100644 .history/eohi2/dataP 09 - interval x direction means_20251008113501.r
delete mode 100644 .history/eohi2/dataP 09 - interval x direction means_20251008113518.r
delete mode 100644 .history/eohi2/dataP 09 - interval x direction means_20251008113613.r
delete mode 100644 .history/eohi2/dataP 10 - DGEN mean vars_20251008121818.r
delete mode 100644 .history/eohi2/dataP 10 - DGEN mean vars_20251008121822.r
delete mode 100644 .history/eohi2/dataP 10 - DGEN mean vars_20251008121849.r
delete mode 100644 .history/eohi2/dataP 11 - CORRECT ehi vars_20251008152253.r
delete mode 100644 .history/eohi2/dataP 11 - CORRECT ehi vars_20251008163033.r
delete mode 100644 .history/eohi2/dataP 11 - CORRECT ehi vars_20251008163045.r
delete mode 100644 .history/eohi2/dataP 11 - CORRECT ehi vars_20251008163815.r
delete mode 100644 .history/eohi2/dataP 11 - CORRECT ehi vars_20251008163817.r
delete mode 100644 .history/eohi2/datap 12 - CORRECT DGEN ehi vars_20251008164446.r
delete mode 100644 .history/eohi2/datap 12 - CORRECT DGEN ehi vars_20251008164447.r
delete mode 100644 .history/eohi2/datap 12 - CORRECT DGEN ehi vars_20251008164616.r
delete mode 100644 .history/eohi2/datap 12 - CORRECT DGEN ehi vars_20251008164646.r
delete mode 100644 .history/eohi2/datap 12 - CORRECT DGEN ehi vars_20251008164712.r
delete mode 100644 .history/eohi2/datap 12 - CORRECT DGEN ehi vars_20251008164735.r
delete mode 100644 .history/eohi2/datap 12 - CORRECT DGEN ehi vars_20251008164934.r
delete mode 100644 .history/eohi2/datap 13 - ehi domain specific means_20251008165421.r
delete mode 100644 .history/eohi2/datap 13 - ehi domain specific means_20251008165443.r
delete mode 100644 .history/eohi2/datap 13 - ehi domain specific means_20251008165448.r
delete mode 100644 .history/eohi2/datap 13 - ehi domain specific means_20251008165551.r
delete mode 100644 .history/eohi2/datap 13 - ehi domain specific means_20251008170838.r
delete mode 100644 .history/eohi2/datap 14 - all ehi global means_20251008171057.r
delete mode 100644 .history/eohi2/datap 14 - all ehi global means_20251008171120.r
delete mode 100644 .history/eohi2/datap 14 - all ehi global means_20251008171136.r
delete mode 100644 .history/eohi2/datap 14 - all ehi global means_20251008171157.r
delete mode 100644 .history/eohi2/datap 14 - all ehi global means_20251008171250.r
delete mode 100644 .history/eohi2/datap 15 - education recoded ordinal 3_20251027135156.r
delete mode 100644 .history/eohi2/datap 15 - education recoded ordinal 3_20251027135157.r
delete mode 100644 .history/eohi2/datap 15 - education recoded ordinal 3_20251027141418.r
delete mode 100644 .history/eohi2/datap 15 - education recoded ordinal 3_20251027143845.r
delete mode 100644 .history/eohi2/datap 16 - ehi vars standardized _20251029120227.r
delete mode 100644 .history/eohi2/datap 16 - ehi vars standardized _20251029120228.r
delete mode 100644 .history/eohi2/datap 16 - ehi vars standardized _20251029120234.r
delete mode 100644 .history/eohi2/datap 16 - ehi vars standardized _20251029121728.r
delete mode 100644 .history/eohi2/datap 16 - ehi vars standardized _20251029122336.r
delete mode 100644 .history/eohi2/datap 16 - ehi vars standardized _20251029124145.r
delete mode 100644 .history/eohi2/e2 - correlation matrix_20251027143921.r
delete mode 100644 .history/eohi2/e2 - correlation matrix_20251027143922.r
delete mode 100644 .history/eohi2/e2 - correlation matrix_20251027144718.r
delete mode 100644 .history/eohi2/e2 - correlation matrix_20251027145122.r
delete mode 100644 .history/eohi2/e2 - correlation matrix_20251027145125.r
delete mode 100644 .history/eohi2/e2 - correlation matrix_20251027145133.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251003144019.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251003144020.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251003150009.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251003150038.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251003150047.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251003150106.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251003150128.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251003150130.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251003150137.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251003150143.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251003150144.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251003150214.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251003150251.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251003150313.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251003150330.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251003150337.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251003150346.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251003152314.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251003170643.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251003170646.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251003170651.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251003170811.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251003171114.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251003171117.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251003171121.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251003171149.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006125954.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006191142.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006192531.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006192540.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006192548.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006192554.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006192606.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006192619.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006192629.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006192639.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006192701.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006192716.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006192735.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006192745.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006192802.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006192817.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006192843.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006192917.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006192940.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006193221.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006193234.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006193243.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006193252.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006193310.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006193311.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006193509.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006193537.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006193540.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006195204.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006195215.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006195225.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006195236.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006195256.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006195344.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006195408.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006195422.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006195559.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006200505.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006200514.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006200516.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006201016.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006201021.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006201023.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006201057.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006230411.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006231257.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006231308.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006231314.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006231325.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006231533.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006231538.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006232531.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006232538.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251006232540.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251007103206.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251007103213.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251007103739.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251007104105.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251007104111.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251007104119.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251007105736.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251007185029.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251007185541.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251007192720.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251008190007.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251008190301.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251008190307.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251010141129.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251010160100.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251010165028.r
delete mode 100644 .history/eohi2/mixed anova - DGEN_20251010165032.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251003143914.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251003143942.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251003145806.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251003145820.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251003145845.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251003145918.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251003145949.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251003145955.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251003150004.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251003150017.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251003150021.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251003150023.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251003150026.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251003150029.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251003150032.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251003150034.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251003150037.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251003150049.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251003150056.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251003150107.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251003150118.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251003150155.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251003150222.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251003150238.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251003150306.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251003152345.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251003152354.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251003152532.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251003170438.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251006191145.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251006191927.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251006191941.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251006191952.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251006192942.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251006225442.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251007105958.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251007110011.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251007110320.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251007110333.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251007110641.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251007110654.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251007111158.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251007155225.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251007184341.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251007184520.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251007184536.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251007184540.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251007184820.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251007184823.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251007184824.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251007185023.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251007185544.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251008192902.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251008192910.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251008192916.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251008192921.r
delete mode 100644 .history/eohi2/mixed anova - domain means_20251008192926.r
delete mode 100644 .history/eohi2/recode_likert_items_20251001085552.r
delete mode 100644 .history/eohi2/recode_likert_items_20251001085616.r
delete mode 100644 .history/eohi2/recode_likert_items_20251001085904.r
delete mode 100644 .history/eohi2/recode_likert_items_20251001085909.r
delete mode 100644 .history/eohi2/recode_likert_items_20251001085917.r
delete mode 100644 .history/eohi2/recode_likert_items_20251001090003.r
delete mode 100644 .history/eohi2/recode_likert_items_20251001090613.r
delete mode 100644 .history/eohi2/recode_likert_items_20251001090620.r
delete mode 100644 .history/eohi2/recode_likert_items_20251001090829.r
delete mode 100644 .history/eohi2/recode_likert_items_20251001090840.r
delete mode 100644 .history/eohi2/recode_likert_items_20251001090919.r
delete mode 100644 .history/eohi2/recode_likert_items_20251001091002.r
delete mode 100644 .history/eohi2/recode_likert_items_20251001091011.r
delete mode 100644 .history/eohi2/recode_likert_items_20251001091016.r
delete mode 100644 .history/eohi2/recode_likert_items_20251001091544.r
delete mode 100644 .history/eohi2/recode_likert_items_20251001091551.r
delete mode 100644 .history/eohi2/recode_likert_items_20251001091602.r
delete mode 100644 .history/eohi2/recode_likert_items_20251001091838.r
delete mode 100644 .history/eohi2/recode_likert_items_20251001091844.r
delete mode 100644 .history/eohi2/recode_likert_items_20251001091846.r
delete mode 100644 .history/eohi2/recode_likert_items_20251001091852.r
delete mode 100644 .history/eohi2/recode_likert_items_20251001092353.r
delete mode 100644 .history/eohi2/recode_likert_items_20251001092409.r
delete mode 100644 .history/eohi2/recode_likert_items_20251001092420.r
delete mode 100644 .history/eohi2/recode_likert_items_20251001092430.r
delete mode 100644 .history/eohi2/recode_likert_items_20251001094502.r
delete mode 100644 .history/eohi2/recode_likert_items_20251001105906.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028173023.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028173024.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028173102.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028173106.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028173223.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028173226.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028173245.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028173247.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028173249.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028174139.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028174146.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028174151.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028174152.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028174209.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028174329.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028174333.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028174401.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028174407.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028174412.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028174501.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028174503.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028174701.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028174922.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028174926.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028174958.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028175001.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028175110.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028175138.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028175140.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028175150.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028175152.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028175211.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028175214.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028175223.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028175227.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028175230.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028175322.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028175530.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028175535.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028175559.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028175617.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028175618.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028175644.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028175648.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028175651.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028180009.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028180012.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028180015.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028180017.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028180125.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028180127.r
delete mode 100644 .history/eohi2/reliability - ehi_20251028180136.r
delete mode 100644 .history/eohi2/reliability analysis_20251027165337.r
delete mode 100644 .history/eohi2/reliability analysis_20251027165338.r
delete mode 100644 .history/eohi2/reliability analysis_20251028141004.r
delete mode 100644 .history/eohi2/reliability analysis_20251028141504.r delete mode 100644 .history/eohi2/reliability analysis_20251028144222.r delete mode 100644 .history/eohi2/reliability analysis_20251028151252.r delete mode 100644 .history/eohi2/reliability analysis_20251028151259.r delete mode 100644 .history/eohi2/reliability analysis_20251028151305.r delete mode 100644 .history/eohi2/reliability analysis_20251028151323.r delete mode 100644 .history/eohi2/reliability analysis_20251028151326.r delete mode 100644 .history/eohi2/reliability analysis_20251028151330.r delete mode 100644 .history/eohi2/reliability analysis_20251028151333.r delete mode 100644 .history/eohi2/reliability analysis_20251028151339.r delete mode 100644 .history/eohi2/reliability analysis_20251028151358.r delete mode 100644 .history/eohi2/reliability analysis_20251028151540.r delete mode 100644 .history/eohi2/reliability analysis_20251028151544.r delete mode 100644 .history/eohi2/reliability analysis_20251028151808.r delete mode 100644 .history/eohi2/reliability analysis_20251028151819.r delete mode 100644 .history/eohi2/reliability analysis_20251028151900.r delete mode 100644 .history/eohi2/reliability analysis_20251028151917.r delete mode 100644 .history/eohi2/reliability analysis_20251028151922.r delete mode 100644 .history/eohi2/reliability analysis_20251028151925.r delete mode 100644 .history/eohi2/reliability analysis_20251028152017.r delete mode 100644 .history/eohi2/reliability analysis_20251028152020.r delete mode 100644 .history/eohi2/reliability analysis_20251028152033.r delete mode 100644 .history/eohi2/reliability analysis_20251028152127.r delete mode 100644 .history/eohi2/reliability analysis_20251028152132.r delete mode 100644 .history/eohi2/reliability analysis_20251028152134.r delete mode 100644 .history/eohi2/reliability analysis_20251028152156.r delete mode 100644 .history/eohi2/reliability analysis_20251028161222.r delete mode 100644 .history/eohi2/reliability analysis_20251028161229.r delete mode 100644 .history/eohi2/reliability analysis_20251028161236.r delete mode 100644 .history/eohi2/reliability analysis_20251028161240.r delete mode 100644 .history/eohi2/reliability analysis_20251028161242.r delete mode 100644 .history/eohi2/reliability analysis_20251028161247.r delete mode 100644 .history/eohi2/reliability analysis_20251028161950.r delete mode 100644 .history/eohi2/reliability analysis_20251028162118.r delete mode 100644 .history/eohi2/reliability analysis_20251028162136.r delete mode 100644 .history/eohi2/reliability analysis_20251028162145.r delete mode 100644 .history/eohi2/reliability analysis_20251028162201.r delete mode 100644 .history/eohi2/reliability analysis_20251028162240.r delete mode 100644 .history/eohi2/reliability analysis_20251028162243.r delete mode 100644 .history/eohi2/reliability analysis_20251028162251.r delete mode 100644 .history/eohi2/reliability analysis_20251028162450.r delete mode 100644 .history/eohi2/reliability analysis_20251028162455.r delete mode 100644 .history/eohi2/reliability analysis_20251028162504.r delete mode 100644 .history/eohi2/reliability analysis_20251028162602.r delete mode 100644 .history/eohi2/reliability analysis_20251028162606.r delete mode 100644 .history/eohi2/reliability analysis_20251028162608.r delete mode 100644 .history/eohi2/reliability analysis_20251028162820.r delete mode 100644 .history/eohi2/reliability analysis_20251028162852.r delete mode 100644 .history/eohi2/reliability analysis_20251028162935.r delete mode 100644 
 delete mode 100644 .history/eohi2/reliability analysis_20251028162951.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028162955.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028163006.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028163158.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028163211.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028163215.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028163222.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028163310.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028163331.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028163339.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028163348.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028163622.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028164143.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028164153.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028164158.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028164324.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028164342.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028164345.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028164350.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028164357.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028164422.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028164426.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028164514.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028164546.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028164551.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028164558.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028164641.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028164650.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028164657.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028164813.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028164819.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028164821.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028165006.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028165109.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028165118.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028165120.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028165128.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028165402.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028165410.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028165435.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028170716.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028170719.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028170725.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028170741.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028170822.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028170831.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028170832.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028170921.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028170923.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028170935.r
 delete mode 100644 .history/eohi2/reliability analysis_20251028171635.r
 delete mode 100644 .history/eohi2/reliability_summary_table_20251028144409.csv
 delete mode 100644 .history/eohi2/reliability_summary_table_20251028144438.csv
 delete mode 100644 .history/eohi2/reliability_summary_table_20251028173027.csv
 delete mode 100644 .history/eohi2/verify_means_20251008115109.R
 delete mode 100644 .history/mixed anova - domain means_20250912124308.r
 delete mode 100644 .history/mixed anova - domain means_20250912124317.r
 delete mode 100644 .history/mixed anova - domain means_20250912124407.r
 delete mode 100644 .history/mixed anova - domain means_20250912124620.r
 delete mode 100644 .history/mixed anova - domain means_20250912125000.r
 delete mode 100644 .history/mixed anova - domain means_20250912125003.r
 delete mode 100644 .history/mixed anova - domain means_20250912125007.r
 delete mode 100644 .history/mixed anova - domain means_20250912125012.r
 delete mode 100644 .history/mixed anova - domain means_20250912125017.r
 delete mode 100644 .history/mixed anova - domain means_20250912125031.r
 delete mode 100644 .history/mixed anova - domain means_20250912125040.r
 delete mode 100644 .history/mixed anova - domain means_20250912125046.r
 delete mode 100644 .history/mixed anova - domain means_20250912130804.r
 delete mode 100644 .history/mixed anova - domain means_20250912130809.r
 delete mode 100644 .history/mixed anova - domain means_20250912130812.r
 delete mode 100644 .history/mixed anova - domain means_20250912130822.r
 delete mode 100644 .history/mixed anova - domain means_20250912130828.r
 delete mode 100644 .history/mixed anova - domain means_20250912130829.r
 delete mode 100644 .history/mixed anova - domain means_20250912144754.r
 delete mode 100644 .history/mixed anova - domain means_20250912152948.r
 delete mode 100644 .history/mixed anova - domain means_20250912152953.r
 delete mode 100644 .history/mixed anova - domain means_20250912153103.r
 delete mode 100644 .history/mixed anova - ind item_20250912123133.r
 delete mode 100644 .history/mixed anova - ind item_20250912123134.r
 delete mode 100644 .history/mixed anova_20250912110917.r
 delete mode 100644 .history/mixed anova_20250912110922.r
 delete mode 100644 .history/mixed anova_20250912110938.r
 delete mode 100644 .history/mixed anova_20250912111710.r
 delete mode 100644 .history/mixed anova_20250912111725.r
 delete mode 100644 .history/mixed anova_20250912111750.r
 delete mode 100644 .history/mixed anova_20250912111819.r
 delete mode 100644 .history/mixed anova_20250912111827.r
 delete mode 100644 .history/mixed anova_20250912112005.r
 delete mode 100644 .history/mixed anova_20250912112141.r
 delete mode 100644 .history/mixed anova_20250912112319.r
 delete mode 100644 .history/mixed anova_20250912112327.r
 delete mode 100644 .history/mixed anova_20250912112334.r
 delete mode 100644 .history/mixed anova_20250912112620.r
 delete mode 100644 .history/mixed anova_20250912112624.r
 delete mode 100644 .history/mixed anova_20250912112628.r
 delete mode 100644 .history/mixed anova_20250912112631.r
 delete mode 100644 .history/mixed anova_20250912112632.r
 delete mode 100644 .history/mixed anova_20250912112635.r
 delete mode 100644 .history/mixed anova_20250912112638.r
 delete mode 100644 .history/mixed anova_20250912112641.r
 delete mode 100644 .history/mixed anova_20250912112645.r
 delete mode 100644 .history/mixed anova_20250912112651.r
 delete mode 100644 .history/mixed anova_20250912112700.r
 delete mode 100644 .history/mixed anova_20250912113047.r
 delete mode 100644 .history/mixed anova_20250912113055.r
 delete mode 100644 .history/mixed anova_20250912113119.r
 delete mode 100644 .history/mixed anova_20250912113345.r
 delete mode 100644 .history/mixed anova_20250912113353.r
 delete mode 100644 .history/mixed anova_20250912113448.r
 delete mode 100644 .history/mixed anova_20250912113455.r
 delete mode 100644 .history/mixed anova_20250912113502.r
 delete mode 100644 .history/mixed anova_20250912113654.r
 delete mode 100644 .history/mixed anova_20250912113659.r
 delete mode 100644 .history/mixed anova_20250912113707.r
 delete mode 100644 .history/mixed anova_20250912114051.r
 delete mode 100644 .history/mixed anova_20250912114100.r
 delete mode 100644 .history/mixed anova_20250912114105.r
 delete mode 100644 .history/mixed anova_20250912114140.r
 delete mode 100644 .history/mixed anova_20250912114154.r
 delete mode 100644 .history/mixed anova_20250912114156.r
 delete mode 100644 .history/mixed anova_20250912114707.r
 delete mode 100644 .history/mixed anova_20250912114714.r
 delete mode 100644 .history/mixed anova_20250912124604.r

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c138f46
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.history/
\ No newline at end of file
diff --git a/.history/eohi1/BS_means_20250922131352.vb b/.history/eohi1/BS_means_20250922131352.vb
deleted file mode 100644
index 3bd8303..0000000
--- a/.history/eohi1/BS_means_20250922131352.vb
+++ /dev/null
@@ -1,118 +0,0 @@
-Option Explicit
-
-Private Function GetColIndex(ByVal headerName As String, ByVal ws As Worksheet) As Long
-    Dim lastCol As Long
-    lastCol = ws.Cells(1, ws.Columns.Count).End(xlToLeft).Column
-    Dim m As Variant
-    m = Application.Match(headerName, ws.Range(ws.Cells(1, 1), ws.Cells(1, lastCol)), 0)
-    If IsError(m) Then
-        GetColIndex = 0
-    Else
-        GetColIndex = CLng(m)
-    End If
-End Function
-
-Private Function BuildPresentColArray(ByVal headers As Variant, ByVal ws As Worksheet) As Variant
-    Dim tmp() As Long
-    ReDim tmp(0 To UBound(headers))
-    Dim i As Long, c As Long
-    c = 0
-    For i = LBound(headers) To UBound(headers)
-        Dim colIdx As Long
-        colIdx = GetColIndex(CStr(headers(i)), ws)
-        If colIdx > 0 Then
-            tmp(c) = colIdx
-            c = c + 1
-        End If
-    Next i
-    If c = 0 Then
-        BuildPresentColArray = Array()
-    Else
-        Dim outArr() As Long
-        ReDim outArr(0 To c - 1)
-        For i = 0 To c - 1
-            outArr(i) = tmp(i)
-        Next i
-        BuildPresentColArray = outArr
-    End If
-End Function
-
-Private Function MeanOfRow(ByVal ws As Worksheet, ByVal rowIndex As Long, ByVal colIndexes As Variant) As Variant
-    Dim i As Long
-    Dim sumVals As Double
-    Dim countVals As Long
-    sumVals = 0
-    countVals = 0
-    If IsArray(colIndexes) Then
-        For i = LBound(colIndexes) To UBound(colIndexes)
-            Dim v As Variant
-            v = ws.Cells(rowIndex, CLng(colIndexes(i))).Value
-            If Not IsError(v) Then
-                If IsNumeric(v) Then
-                    sumVals = sumVals + CDbl(v)
-                    countVals = countVals + 1
-                End If
-            End If
-        Next i
-    End If
-    If countVals = 0 Then
-        MeanOfRow = CVErr(xlErrNA)
-    Else
-        MeanOfRow = sumVals / countVals
-    End If
-End Function
-
-Private Function EnsureOutputColumn(ByVal ws As Worksheet, ByVal headerName As String) As Long
-    Dim c As Long
-    c = GetColIndex(headerName, ws)
-    If c = 0 Then
-        Dim lastCol As Long
-        lastCol = ws.Cells(1, ws.Columns.Count).End(xlToLeft).Column
-        c = lastCol + 1
-        ws.Cells(1, c).Value = headerName
-    End If
-    EnsureOutputColumn = c
-End Function
-
-Sub BS_Means()
-    Dim ws As Worksheet
-    Set ws = ThisWorkbook.Sheets(1)
-
-    Dim all28 As Variant
-    all28 = Array( _
-        "lock_T_easy",
"hume_F_easy", "papy_T_easy", "sham_T_easy", "list_F_easy", "cons_T_easy", "tsun_T_easy", "pana_T_easy", "kabu_T_easy", "gulf_F_easy", "oedi_T_easy", "vaud_T_easy", "mont_F_easy", "demo_F_easy", _ - "spee_F_hard", "dwar_F_hard", "carb_T_hard", "bohr_T_hard", "gang_F_hard", "vitc_F_hard", "hert_F_hard", "pucc_F_hard", "troy_T_hard", "moza_F_hard", "croc_F_hard", "gees_F_hard", "lute_F_hard", "memo_F_hard" _ - ) - - Dim easy14 As Variant - easy14 = Array( _ - "lock_T_easy", "hume_F_easy", "papy_T_easy", "sham_T_easy", "list_F_easy", "cons_T_easy", "tsun_T_easy", "pana_T_easy", "kabu_T_easy", "gulf_F_easy", "oedi_T_easy", "vaud_T_easy", "mont_F_easy", "demo_F_easy" _ - ) - - Dim hard14 As Variant - hard14 = Array( _ - "spee_F_hard", "dwar_F_hard", "carb_T_hard", "bohr_T_hard", "gang_F_hard", "vitc_F_hard", "hert_F_hard", "pucc_F_hard", "troy_T_hard", "moza_F_hard", "croc_F_hard", "gees_F_hard", "lute_F_hard", "memo_F_hard" _ - ) - - Dim colsAll As Variant, colsEasy As Variant, colsHard As Variant - colsAll = BuildPresentColArray(all28, ws) - colsEasy = BuildPresentColArray(easy14, ws) - colsHard = BuildPresentColArray(hard14, ws) - - Dim colBS28 As Long, colBSEasy As Long, colBSHard As Long - colBS28 = EnsureOutputColumn(ws, "bs_28") - colBSEasy = EnsureOutputColumn(ws, "bs_easy") - colBSHard = EnsureOutputColumn(ws, "bs_hard") - - Dim lastRow As Long - lastRow = ws.Cells(ws.Rows.Count, "A").End(xlUp).Row - - Dim r As Long - For r = 2 To lastRow - ws.Cells(r, colBS28).Value = MeanOfRow(ws, r, colsAll) - ws.Cells(r, colBSEasy).Value = MeanOfRow(ws, r, colsEasy) - ws.Cells(r, colBSHard).Value = MeanOfRow(ws, r, colsHard) - Next r -End Sub - - diff --git a/.history/eohi1/BS_means_20250922131356.vb b/.history/eohi1/BS_means_20250922131356.vb deleted file mode 100644 index 3bd8303..0000000 --- a/.history/eohi1/BS_means_20250922131356.vb +++ /dev/null @@ -1,118 +0,0 @@ -Option Explicit - -Private Function GetColIndex(ByVal headerName As String, ByVal ws As Worksheet) As Long - Dim lastCol As Long - lastCol = ws.Cells(1, ws.Columns.Count).End(xlToLeft).Column - Dim m As Variant - m = Application.Match(headerName, ws.Range(ws.Cells(1, 1), ws.Cells(1, lastCol)), 0) - If IsError(m) Then - GetColIndex = 0 - Else - GetColIndex = CLng(m) - End If -End Function - -Private Function BuildPresentColArray(ByVal headers As Variant, ByVal ws As Worksheet) As Variant - Dim tmp() As Long - ReDim tmp(0 To UBound(headers)) - Dim i As Long, c As Long - c = 0 - For i = LBound(headers) To UBound(headers) - Dim colIdx As Long - colIdx = GetColIndex(CStr(headers(i)), ws) - If colIdx > 0 Then - tmp(c) = colIdx - c = c + 1 - End If - Next i - If c = 0 Then - BuildPresentColArray = Array() - Else - Dim outArr() As Long - ReDim outArr(0 To c - 1) - For i = 0 To c - 1 - outArr(i) = tmp(i) - Next i - BuildPresentColArray = outArr - End If -End Function - -Private Function MeanOfRow(ByVal ws As Worksheet, ByVal rowIndex As Long, ByVal colIndexes As Variant) As Variant - Dim i As Long - Dim sumVals As Double - Dim countVals As Long - sumVals = 0 - countVals = 0 - If IsArray(colIndexes) Then - For i = LBound(colIndexes) To UBound(colIndexes) - Dim v As Variant - v = ws.Cells(rowIndex, CLng(colIndexes(i))).Value - If Not IsError(v) Then - If IsNumeric(v) Then - sumVals = sumVals + CDbl(v) - countVals = countVals + 1 - End If - End If - Next i - End If - If countVals = 0 Then - MeanOfRow = CVErr(xlErrNA) - Else - MeanOfRow = sumVals / countVals - End If -End Function - -Private Function EnsureOutputColumn(ByVal ws As Worksheet, 
ByVal headerName As String) As Long - Dim c As Long - c = GetColIndex(headerName, ws) - If c = 0 Then - Dim lastCol As Long - lastCol = ws.Cells(1, ws.Columns.Count).End(xlToLeft).Column - c = lastCol + 1 - ws.Cells(1, c).Value = headerName - End If - EnsureOutputColumn = c -End Function - -Sub BS_Means() - Dim ws As Worksheet - Set ws = ThisWorkbook.Sheets(1) - - Dim all28 As Variant - all28 = Array( _ - "lock_T_easy", "hume_F_easy", "papy_T_easy", "sham_T_easy", "list_F_easy", "cons_T_easy", "tsun_T_easy", "pana_T_easy", "kabu_T_easy", "gulf_F_easy", "oedi_T_easy", "vaud_T_easy", "mont_F_easy", "demo_F_easy", _ - "spee_F_hard", "dwar_F_hard", "carb_T_hard", "bohr_T_hard", "gang_F_hard", "vitc_F_hard", "hert_F_hard", "pucc_F_hard", "troy_T_hard", "moza_F_hard", "croc_F_hard", "gees_F_hard", "lute_F_hard", "memo_F_hard" _ - ) - - Dim easy14 As Variant - easy14 = Array( _ - "lock_T_easy", "hume_F_easy", "papy_T_easy", "sham_T_easy", "list_F_easy", "cons_T_easy", "tsun_T_easy", "pana_T_easy", "kabu_T_easy", "gulf_F_easy", "oedi_T_easy", "vaud_T_easy", "mont_F_easy", "demo_F_easy" _ - ) - - Dim hard14 As Variant - hard14 = Array( _ - "spee_F_hard", "dwar_F_hard", "carb_T_hard", "bohr_T_hard", "gang_F_hard", "vitc_F_hard", "hert_F_hard", "pucc_F_hard", "troy_T_hard", "moza_F_hard", "croc_F_hard", "gees_F_hard", "lute_F_hard", "memo_F_hard" _ - ) - - Dim colsAll As Variant, colsEasy As Variant, colsHard As Variant - colsAll = BuildPresentColArray(all28, ws) - colsEasy = BuildPresentColArray(easy14, ws) - colsHard = BuildPresentColArray(hard14, ws) - - Dim colBS28 As Long, colBSEasy As Long, colBSHard As Long - colBS28 = EnsureOutputColumn(ws, "bs_28") - colBSEasy = EnsureOutputColumn(ws, "bs_easy") - colBSHard = EnsureOutputColumn(ws, "bs_hard") - - Dim lastRow As Long - lastRow = ws.Cells(ws.Rows.Count, "A").End(xlUp).Row - - Dim r As Long - For r = 2 To lastRow - ws.Cells(r, colBS28).Value = MeanOfRow(ws, r, colsAll) - ws.Cells(r, colBSEasy).Value = MeanOfRow(ws, r, colsEasy) - ws.Cells(r, colBSHard).Value = MeanOfRow(ws, r, colsHard) - Next r -End Sub - - diff --git a/.history/eohi1/BS_means_20250922131406.vb b/.history/eohi1/BS_means_20250922131406.vb deleted file mode 100644 index 3bd8303..0000000 --- a/.history/eohi1/BS_means_20250922131406.vb +++ /dev/null @@ -1,118 +0,0 @@ -Option Explicit - -Private Function GetColIndex(ByVal headerName As String, ByVal ws As Worksheet) As Long - Dim lastCol As Long - lastCol = ws.Cells(1, ws.Columns.Count).End(xlToLeft).Column - Dim m As Variant - m = Application.Match(headerName, ws.Range(ws.Cells(1, 1), ws.Cells(1, lastCol)), 0) - If IsError(m) Then - GetColIndex = 0 - Else - GetColIndex = CLng(m) - End If -End Function - -Private Function BuildPresentColArray(ByVal headers As Variant, ByVal ws As Worksheet) As Variant - Dim tmp() As Long - ReDim tmp(0 To UBound(headers)) - Dim i As Long, c As Long - c = 0 - For i = LBound(headers) To UBound(headers) - Dim colIdx As Long - colIdx = GetColIndex(CStr(headers(i)), ws) - If colIdx > 0 Then - tmp(c) = colIdx - c = c + 1 - End If - Next i - If c = 0 Then - BuildPresentColArray = Array() - Else - Dim outArr() As Long - ReDim outArr(0 To c - 1) - For i = 0 To c - 1 - outArr(i) = tmp(i) - Next i - BuildPresentColArray = outArr - End If -End Function - -Private Function MeanOfRow(ByVal ws As Worksheet, ByVal rowIndex As Long, ByVal colIndexes As Variant) As Variant - Dim i As Long - Dim sumVals As Double - Dim countVals As Long - sumVals = 0 - countVals = 0 - If IsArray(colIndexes) Then - For i = 
LBound(colIndexes) To UBound(colIndexes) - Dim v As Variant - v = ws.Cells(rowIndex, CLng(colIndexes(i))).Value - If Not IsError(v) Then - If IsNumeric(v) Then - sumVals = sumVals + CDbl(v) - countVals = countVals + 1 - End If - End If - Next i - End If - If countVals = 0 Then - MeanOfRow = CVErr(xlErrNA) - Else - MeanOfRow = sumVals / countVals - End If -End Function - -Private Function EnsureOutputColumn(ByVal ws As Worksheet, ByVal headerName As String) As Long - Dim c As Long - c = GetColIndex(headerName, ws) - If c = 0 Then - Dim lastCol As Long - lastCol = ws.Cells(1, ws.Columns.Count).End(xlToLeft).Column - c = lastCol + 1 - ws.Cells(1, c).Value = headerName - End If - EnsureOutputColumn = c -End Function - -Sub BS_Means() - Dim ws As Worksheet - Set ws = ThisWorkbook.Sheets(1) - - Dim all28 As Variant - all28 = Array( _ - "lock_T_easy", "hume_F_easy", "papy_T_easy", "sham_T_easy", "list_F_easy", "cons_T_easy", "tsun_T_easy", "pana_T_easy", "kabu_T_easy", "gulf_F_easy", "oedi_T_easy", "vaud_T_easy", "mont_F_easy", "demo_F_easy", _ - "spee_F_hard", "dwar_F_hard", "carb_T_hard", "bohr_T_hard", "gang_F_hard", "vitc_F_hard", "hert_F_hard", "pucc_F_hard", "troy_T_hard", "moza_F_hard", "croc_F_hard", "gees_F_hard", "lute_F_hard", "memo_F_hard" _ - ) - - Dim easy14 As Variant - easy14 = Array( _ - "lock_T_easy", "hume_F_easy", "papy_T_easy", "sham_T_easy", "list_F_easy", "cons_T_easy", "tsun_T_easy", "pana_T_easy", "kabu_T_easy", "gulf_F_easy", "oedi_T_easy", "vaud_T_easy", "mont_F_easy", "demo_F_easy" _ - ) - - Dim hard14 As Variant - hard14 = Array( _ - "spee_F_hard", "dwar_F_hard", "carb_T_hard", "bohr_T_hard", "gang_F_hard", "vitc_F_hard", "hert_F_hard", "pucc_F_hard", "troy_T_hard", "moza_F_hard", "croc_F_hard", "gees_F_hard", "lute_F_hard", "memo_F_hard" _ - ) - - Dim colsAll As Variant, colsEasy As Variant, colsHard As Variant - colsAll = BuildPresentColArray(all28, ws) - colsEasy = BuildPresentColArray(easy14, ws) - colsHard = BuildPresentColArray(hard14, ws) - - Dim colBS28 As Long, colBSEasy As Long, colBSHard As Long - colBS28 = EnsureOutputColumn(ws, "bs_28") - colBSEasy = EnsureOutputColumn(ws, "bs_easy") - colBSHard = EnsureOutputColumn(ws, "bs_hard") - - Dim lastRow As Long - lastRow = ws.Cells(ws.Rows.Count, "A").End(xlUp).Row - - Dim r As Long - For r = 2 To lastRow - ws.Cells(r, colBS28).Value = MeanOfRow(ws, r, colsAll) - ws.Cells(r, colBSEasy).Value = MeanOfRow(ws, r, colsEasy) - ws.Cells(r, colBSHard).Value = MeanOfRow(ws, r, colsHard) - Next r -End Sub - - diff --git a/.history/eohi1/DataP 01 - domain mean totals _20251007232436.r b/.history/eohi1/DataP 01 - domain mean totals _20251007232436.r deleted file mode 100644 index f4da6af..0000000 --- a/.history/eohi1/DataP 01 - domain mean totals _20251007232436.r +++ /dev/null @@ -1,25 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Calculate NPast_mean_total as average of NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -exp1_data$NPast_mean_total <- rowMeans(exp1_data[, c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life")], na.rm = TRUE) - -# Calculate NFut_mean_total as average of NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life -exp1_data$NFut_mean_total <- rowMeans(exp1_data[, c("NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")], na.rm = TRUE) - -# Save the updated data -write.csv(exp1_data, "exp1.csv", row.names = FALSE) - -# Display summary of the 
calculated totals
-cat("NPast_mean_total summary:\n")
-summary(exp1_data$NPast_mean_total)
-cat("\nNFut_mean_total summary:\n")
-summary(exp1_data$NFut_mean_total)
-
-# Show first few rows to verify calculations
-cat("\nFirst 5 rows of calculated totals:\n")
-print(exp1_data[1:5, c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", "NPast_mean_total",
-                       "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life", "NFut_mean_total")])
diff --git a/.history/eohi1/DataP 01 - domain mean totals _20251007232440.r b/.history/eohi1/DataP 01 - domain mean totals _20251007232440.r
deleted file mode 100644
index f4da6af..0000000
--- a/.history/eohi1/DataP 01 - domain mean totals _20251007232440.r
+++ /dev/null
@@ -1,25 +0,0 @@
-options(scipen = 999)
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Load the data
-exp1_data <- read.csv("exp1.csv")
-
-# Calculate NPast_mean_total as average of NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-exp1_data$NPast_mean_total <- rowMeans(exp1_data[, c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life")], na.rm = TRUE)
-
-# Calculate NFut_mean_total as average of NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-exp1_data$NFut_mean_total <- rowMeans(exp1_data[, c("NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")], na.rm = TRUE)
-
-# Save the updated data
-write.csv(exp1_data, "exp1.csv", row.names = FALSE)
-
-# Display summary of the calculated totals
-cat("NPast_mean_total summary:\n")
-summary(exp1_data$NPast_mean_total)
-cat("\nNFut_mean_total summary:\n")
-summary(exp1_data$NFut_mean_total)
-
-# Show first few rows to verify calculations
-cat("\nFirst 5 rows of calculated totals:\n")
-print(exp1_data[1:5, c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", "NPast_mean_total",
-                       "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life", "NFut_mean_total")])
diff --git a/.history/eohi1/DataP 01 - domain mean totals _20251007232447.r b/.history/eohi1/DataP 01 - domain mean totals _20251007232447.r
deleted file mode 100644
index f4da6af..0000000
--- a/.history/eohi1/DataP 01 - domain mean totals _20251007232447.r
+++ /dev/null
@@ -1,25 +0,0 @@
-options(scipen = 999)
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Load the data
-exp1_data <- read.csv("exp1.csv")
-
-# Calculate NPast_mean_total as average of NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-exp1_data$NPast_mean_total <- rowMeans(exp1_data[, c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life")], na.rm = TRUE)
-
-# Calculate NFut_mean_total as average of NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-exp1_data$NFut_mean_total <- rowMeans(exp1_data[, c("NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")], na.rm = TRUE)
-
-# Save the updated data
-write.csv(exp1_data, "exp1.csv", row.names = FALSE)
-
-# Display summary of the calculated totals
-cat("NPast_mean_total summary:\n")
-summary(exp1_data$NPast_mean_total)
-cat("\nNFut_mean_total summary:\n")
-summary(exp1_data$NFut_mean_total)
-
-# Show first few rows to verify calculations
-cat("\nFirst 5 rows of calculated totals:\n")
-print(exp1_data[1:5, c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", "NPast_mean_total",
-                       "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life", "NFut_mean_total")])
diff --git a/.history/eohi1/DataP 01 - domain mean totals _20251007232757.r b/.history/eohi1/DataP 01 - domain mean totals _20251007232757.r
deleted file mode 100644
index f4da6af..0000000
--- a/.history/eohi1/DataP 01 - domain mean totals _20251007232757.r
+++ /dev/null
@@ -1,25 +0,0 @@
-options(scipen = 999)
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Load the data
-exp1_data <- read.csv("exp1.csv")
-
-# Calculate NPast_mean_total as average of NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-exp1_data$NPast_mean_total <- rowMeans(exp1_data[, c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life")], na.rm = TRUE)
-
-# Calculate NFut_mean_total as average of NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-exp1_data$NFut_mean_total <- rowMeans(exp1_data[, c("NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")], na.rm = TRUE)
-
-# Save the updated data
-write.csv(exp1_data, "exp1.csv", row.names = FALSE)
-
-# Display summary of the calculated totals
-cat("NPast_mean_total summary:\n")
-summary(exp1_data$NPast_mean_total)
-cat("\nNFut_mean_total summary:\n")
-summary(exp1_data$NFut_mean_total)
-
-# Show first few rows to verify calculations
-cat("\nFirst 5 rows of calculated totals:\n")
-print(exp1_data[1:5, c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", "NPast_mean_total",
-                       "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life", "NFut_mean_total")])
diff --git a/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193424.rmd b/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193424.rmd
deleted file mode 100644
index 5bb88d4..0000000
--- a/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193424.rmd
+++ /dev/null
@@ -1,82 +0,0 @@
----
-title: "Mixed ANOVA Analysis for Domain Means"
-author: "Irina"
-date: "`r Sys.Date()`"
-output:
-  html_document:
-    toc: true
-    toc_float: true
-    code_folding: hide
-    theme: flatly
-    highlight: tango
-    fig_width: 10
-    fig_height: 6
----
-
-```{r setup, include=FALSE}
-knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE)
-```
-
-# Introduction
-
-This analysis examines domain-level differences in mean scores across time periods using a mixed ANOVA design. The analysis focuses on four domains (Preferences, Personality, Values, Life) across two time periods (Past, Future) with a between-subjects factor (TEMPORAL_DO).
- -# Data Preparation and Setup - -```{r libraries} -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(ggplot2) # For plotting - -options(scipen = 999) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -``` - -```{r data-loading} -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) -``` - -```{r data-reshaping} -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Create clean dataset for analysis (fixing the reference issue) -long_data_clean <- long_data -``` - diff --git a/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193431.rmd b/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193431.rmd deleted file mode 100644 index 648a864..0000000 --- a/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193431.rmd +++ /dev/null @@ -1,121 +0,0 @@ ---- -title: "Mixed ANOVA Analysis for Domain Means" -author: "Irina" -date: "`r Sys.Date()`" -output: - html_document: - toc: true - toc_float: true - code_folding: hide - theme: flatly - highlight: tango - fig_width: 10 - fig_height: 6 ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE) -``` - -# Introduction - -This analysis examines domain-level differences in mean scores across time periods using a mixed ANOVA design. The analysis focuses on four domains (Preferences, Personality, Values, Life) across two time periods (Past, Future) with a between-subjects factor (TEMPORAL_DO). 
- -# Data Preparation and Setup - -```{r libraries} -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(ggplot2) # For plotting - -options(scipen = 999) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -``` - -```{r data-loading} -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) -``` - -```{r data-reshaping} -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Create clean dataset for analysis (fixing the reference issue) -long_data_clean <- long_data -``` - -# Descriptive Statistics - -## Overall Descriptive Statistics by TIME and DOMAIN - -```{r descriptive-stats} -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) -``` - -## Descriptive Statistics by Between-Subjects Factors - -```{r descriptive-stats-temporal} -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) -``` - diff --git a/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193438.rmd b/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193438.rmd deleted file mode 100644 index a30a9ee..0000000 --- a/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193438.rmd +++ /dev/null @@ -1,175 +0,0 @@ ---- -title: "Mixed ANOVA Analysis for Domain Means" -author: "Irina" -date: "`r Sys.Date()`" -output: - html_document: - toc: true - toc_float: true - code_folding: hide - theme: flatly - 
highlight: tango - fig_width: 10 - fig_height: 6 ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE) -``` - -# Introduction - -This analysis examines domain-level differences in mean scores across time periods using a mixed ANOVA design. The analysis focuses on four domains (Preferences, Personality, Values, Life) across two time periods (Past, Future) with a between-subjects factor (TEMPORAL_DO). - -# Data Preparation and Setup - -```{r libraries} -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(ggplot2) # For plotting - -options(scipen = 999) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -``` - -```{r data-loading} -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) -``` - -```{r data-reshaping} -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Create clean dataset for analysis (fixing the reference issue) -long_data_clean <- long_data -``` - -# Descriptive Statistics - -## Overall Descriptive Statistics by TIME and DOMAIN - -```{r descriptive-stats} -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) -``` - -## Descriptive Statistics by Between-Subjects Factors - -```{r descriptive-stats-temporal} -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) -``` - -# Assumption Testing - -## Missing Values 
Check - -```{r missing-values} -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) -``` - -## Outlier Detection - -```{r outlier-detection} -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) -``` - -## Anderson-Darling Normality Test - -```{r normality-test} -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) -``` - diff --git a/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193443.rmd b/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193443.rmd deleted file mode 100644 index 8e6fd70..0000000 --- a/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193443.rmd +++ /dev/null @@ -1,205 +0,0 @@ ---- -title: "Mixed ANOVA Analysis for Domain Means" -author: "Irina" -date: "`r Sys.Date()`" -output: - html_document: - toc: true - toc_float: true - code_folding: hide - theme: flatly - highlight: tango - fig_width: 10 - fig_height: 6 ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE) -``` - -# Introduction - -This analysis examines domain-level differences in mean scores across time periods using a mixed ANOVA design. The analysis focuses on four domains (Preferences, Personality, Values, Life) across two time periods (Past, Future) with a between-subjects factor (TEMPORAL_DO). 
- -# Data Preparation and Setup - -```{r libraries} -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(ggplot2) # For plotting - -options(scipen = 999) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -``` - -```{r data-loading} -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) -``` - -```{r data-reshaping} -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Create clean dataset for analysis (fixing the reference issue) -long_data_clean <- long_data -``` - -# Descriptive Statistics - -## Overall Descriptive Statistics by TIME and DOMAIN - -```{r descriptive-stats} -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) -``` - -## Descriptive Statistics by Between-Subjects Factors - -```{r descriptive-stats-temporal} -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) -``` - -# Assumption Testing - -## Missing Values Check - -```{r missing-values} -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) -``` - -## Outlier Detection - -```{r outlier-detection} -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = 
sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) -``` - -## Anderson-Darling Normality Test - -```{r normality-test} -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) -``` - -## Homogeneity of Variance (Levene's Test) - -### Test homogeneity across TIME within each DOMAIN - -```{r homogeneity-time} -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) -``` - -### Test homogeneity across DOMAIN within each TIME - -```{r homogeneity-domain} -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) -``` - diff --git a/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193454.rmd b/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193454.rmd deleted file mode 100644 index c1a144f..0000000 --- a/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193454.rmd +++ /dev/null @@ -1,310 +0,0 @@ ---- -title: "Mixed ANOVA Analysis for Domain Means" -author: "Irina" -date: "`r Sys.Date()`" -output: - html_document: - toc: true - toc_float: true - code_folding: hide - theme: flatly - highlight: tango - fig_width: 10 - fig_height: 6 ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE) -``` - -# Introduction - -This analysis examines domain-level differences in mean scores across time periods using a mixed ANOVA design. The analysis focuses on four domains (Preferences, Personality, Values, Life) across two time periods (Past, Future) with a between-subjects factor (TEMPORAL_DO). 
- -# Data Preparation and Setup - -```{r libraries} -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(ggplot2) # For plotting - -options(scipen = 999) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -``` - -```{r data-loading} -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) -``` - -```{r data-reshaping} -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Create clean dataset for analysis (fixing the reference issue) -long_data_clean <- long_data -``` - -# Descriptive Statistics - -## Overall Descriptive Statistics by TIME and DOMAIN - -```{r descriptive-stats} -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) -``` - -## Descriptive Statistics by Between-Subjects Factors - -```{r descriptive-stats-temporal} -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) -``` - -# Assumption Testing - -## Missing Values Check - -```{r missing-values} -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) -``` - -## Outlier Detection - -```{r outlier-detection} -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = 
sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) -``` - -## Anderson-Darling Normality Test - -```{r normality-test} -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) -``` - -## Homogeneity of Variance (Levene's Test) - -### Test homogeneity across TIME within each DOMAIN - -```{r homogeneity-time} -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) -``` - -### Test homogeneity across DOMAIN within each TIME - -```{r homogeneity-domain} -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) -``` - -## Hartley's F-Max Test with Bootstrap Critical Values - -```{r hartley-function} -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} -``` - -```{r hartley-results} -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS 
factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) -``` - diff --git a/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193500.rmd b/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193500.rmd deleted file mode 100644 index e94b186..0000000 --- a/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193500.rmd +++ /dev/null @@ -1,348 +0,0 @@ ---- -title: "Mixed ANOVA Analysis for Domain Means" -author: "Irina" -date: "`r Sys.Date()`" -output: - html_document: - toc: true - toc_float: true - code_folding: hide - theme: flatly - highlight: tango - fig_width: 10 - fig_height: 6 ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE) -``` - -# Introduction - -This analysis examines domain-level differences in mean scores across time periods using a mixed ANOVA design. The analysis focuses on four domains (Preferences, Personality, Values, Life) across two time periods (Past, Future) with a between-subjects factor (TEMPORAL_DO). 
- -# Data Preparation and Setup - -```{r libraries} -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(ggplot2) # For plotting - -options(scipen = 999) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -``` - -```{r data-loading} -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) -``` - -```{r data-reshaping} -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Create clean dataset for analysis (fixing the reference issue) -long_data_clean <- long_data -``` - -# Descriptive Statistics - -## Overall Descriptive Statistics by TIME and DOMAIN - -```{r descriptive-stats} -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) -``` - -## Descriptive Statistics by Between-Subjects Factors - -```{r descriptive-stats-temporal} -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) -``` - -# Assumption Testing - -## Missing Values Check - -```{r missing-values} -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) -``` - -## Outlier Detection - -```{r outlier-detection} -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = 
sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) -``` - -## Anderson-Darling Normality Test - -```{r normality-test} -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) -``` - -## Homogeneity of Variance (Levene's Test) - -### Test homogeneity across TIME within each DOMAIN - -```{r homogeneity-time} -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) -``` - -### Test homogeneity across DOMAIN within each TIME - -```{r homogeneity-domain} -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) -``` - -## Hartley's F-Max Test with Bootstrap Critical Values - -```{r hartley-function} -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} -``` - -```{r hartley-results} -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS 
factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) -``` - -# Mixed ANOVA Analysis - -## Design Balance Check - -```{r design-check} -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} -``` - -## Mixed ANOVA with Sphericity Corrections - -```{r mixed-anova} -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) -``` - diff --git a/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193511.rmd b/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193511.rmd deleted file mode 100644 index 713ff88..0000000 --- a/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193511.rmd +++ /dev/null @@ -1,430 +0,0 @@ ---- -title: "Mixed ANOVA Analysis for Domain Means" -author: "Irina" -date: "`r Sys.Date()`" -output: - html_document: - toc: true - toc_float: true - code_folding: hide - theme: flatly - highlight: tango - fig_width: 10 - fig_height: 6 ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE) -``` - -# Introduction - -This analysis examines domain-level differences in mean scores across time periods using a mixed ANOVA design. The analysis focuses on four domains (Preferences, Personality, Values, Life) across two time periods (Past, Future) with a between-subjects factor (TEMPORAL_DO). 
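The mixed model below is fit with ezANOVA. As an optional cross-check (not part of the original script), the same design could be refit with the afex package, which takes the same id/between/within specification and reports Greenhouse-Geisser-corrected tests by default. A sketch, assuming the long_data_clean object constructed in the next section:

```r
# Optional cross-check of the ezANOVA fit using afex (assumes afex is installed)
library(afex)

afex_model <- aov_ez(
  id      = "pID",
  dv      = "MEAN_DIFFERENCE",
  data    = long_data_clean,
  between = "TEMPORAL_DO",
  within  = c("TIME", "DOMAIN"),
  type    = 3              # Type III sums of squares, matching the ezANOVA call
)

nice(afex_model)           # compact ANOVA table with GG-corrected df and p-values
```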
- -# Data Preparation and Setup - -```{r libraries} -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(ggplot2) # For plotting - -options(scipen = 999) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -``` - -```{r data-loading} -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) -``` - -```{r data-reshaping} -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Create clean dataset for analysis (fixing the reference issue) -long_data_clean <- long_data -``` - -# Descriptive Statistics - -## Overall Descriptive Statistics by TIME and DOMAIN - -```{r descriptive-stats} -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) -``` - -## Descriptive Statistics by Between-Subjects Factors - -```{r descriptive-stats-temporal} -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) -``` - -# Assumption Testing - -## Missing Values Check - -```{r missing-values} -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) -``` - -## Outlier Detection - -```{r outlier-detection} -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = 
sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) -``` - -## Anderson-Darling Normality Test - -```{r normality-test} -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) -``` - -## Homogeneity of Variance (Levene's Test) - -### Test homogeneity across TIME within each DOMAIN - -```{r homogeneity-time} -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) -``` - -### Test homogeneity across DOMAIN within each TIME - -```{r homogeneity-domain} -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) -``` - -## Hartley's F-Max Test with Bootstrap Critical Values - -```{r hartley-function} -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} -``` - -```{r hartley-results} -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS 
factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) -``` - -# Mixed ANOVA Analysis - -## Design Balance Check - -```{r design-check} -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} -``` - -## Mixed ANOVA with Sphericity Corrections - -```{r mixed-anova} -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) -``` - -## Mauchly's Test for Sphericity - -```{r mauchly-test} -print(mixed_anova_model$Mauchly) -``` - -## Sphericity-Corrected Results - -```{r sphericity-corrections} -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, 
anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} -``` - diff --git a/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193523.rmd b/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193523.rmd deleted file mode 100644 index a4886c1..0000000 --- a/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193523.rmd +++ /dev/null @@ -1,497 +0,0 @@ ---- -title: "Mixed ANOVA Analysis for Domain Means" -author: "Irina" -date: "`r Sys.Date()`" -output: - html_document: - toc: true - toc_float: true - code_folding: hide - theme: flatly - highlight: tango - fig_width: 10 - fig_height: 6 ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE) -``` - -# Introduction - -This analysis examines domain-level differences in mean scores across time periods using a mixed ANOVA design. The analysis focuses on four domains (Preferences, Personality, Values, Life) across two time periods (Past, Future) with a between-subjects factor (TEMPORAL_DO). 
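The sphericity-corrected results reported below follow a simple recipe: the Greenhouse-Geisser (or Huynh-Feldt) epsilon multiplies both degrees of freedom while the F statistic is unchanged, and the p-value is recomputed from the F distribution. A toy computation with made-up numbers:

```r
# Toy Greenhouse-Geisser correction (made-up numbers, for illustration only)
f_value <- 4.20    # uncorrected F statistic
dfn     <- 3       # numerator df (e.g., a 4-level within-subjects factor)
dfd     <- 297     # denominator df
gg_eps  <- 0.85    # estimated Greenhouse-Geisser epsilon

# Both df shrink by the factor epsilon; F itself stays the same
p_gg <- pf(f_value, dfn * gg_eps, dfd * gg_eps, lower.tail = FALSE)
p_gg
```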
- -# Data Preparation and Setup - -```{r libraries} -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(ggplot2) # For plotting - -options(scipen = 999) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -``` - -```{r data-loading} -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) -``` - -```{r data-reshaping} -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Create clean dataset for analysis (fixing the reference issue) -long_data_clean <- long_data -``` - -# Descriptive Statistics - -## Overall Descriptive Statistics by TIME and DOMAIN - -```{r descriptive-stats} -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) -``` - -## Descriptive Statistics by Between-Subjects Factors - -```{r descriptive-stats-temporal} -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) -``` - -# Assumption Testing - -## Missing Values Check - -```{r missing-values} -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) -``` - -## Outlier Detection - -```{r outlier-detection} -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = 
sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) -``` - -## Anderson-Darling Normality Test - -```{r normality-test} -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) -``` - -## Homogeneity of Variance (Levene's Test) - -### Test homogeneity across TIME within each DOMAIN - -```{r homogeneity-time} -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) -``` - -### Test homogeneity across DOMAIN within each TIME - -```{r homogeneity-domain} -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) -``` - -## Hartley's F-Max Test with Bootstrap Critical Values - -```{r hartley-function} -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} -``` - -```{r hartley-results} -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS 
factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) -``` - -# Mixed ANOVA Analysis - -## Design Balance Check - -```{r design-check} -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} -``` - -## Mixed ANOVA with Sphericity Corrections - -```{r mixed-anova} -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) -``` - -## Mauchly's Test for Sphericity - -```{r mauchly-test} -print(mixed_anova_model$Mauchly) -``` - -## Sphericity-Corrected Results - -```{r sphericity-corrections} -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, 
anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} -``` - -# Effect Sizes (Cohen's d) - -## Main Effects - -```{r cohens-d-main} -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} 
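# Added sketch (not in the original script): cohen.d() above compares Past and
# Future as if they were independent samples, but TIME is within-subjects.
# A paired effect size (Cohen's d_z) can be computed from participant-level
# Past - Future differences, averaging over DOMAIN first:
time_wide <- long_data_clean %>%
  group_by(pID, TIME) %>%
  summarise(m = mean(MEAN_DIFFERENCE, na.rm = TRUE), .groups = "drop") %>%
  pivot_wider(names_from = TIME, values_from = m)

d_z <- with(time_wide, mean(Past - Future, na.rm = TRUE) / sd(Past - Future, na.rm = TRUE))
cat(sprintf("Paired Cohen's d_z (Past vs Future): %.5f\n", d_z))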
-``` - -```{r cohens-d-domain} -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} -``` - diff --git a/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193543.rmd b/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193543.rmd deleted file mode 100644 index 167e90e..0000000 --- a/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193543.rmd +++ /dev/null @@ -1,573 +0,0 @@ ---- -title: "Mixed ANOVA Analysis for Domain Means" -author: "Irina" -date: "`r Sys.Date()`" -output: - html_document: - toc: true - toc_float: true - code_folding: hide - theme: flatly - highlight: tango - fig_width: 10 - fig_height: 6 ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE) -``` - -# Introduction - -This analysis examines domain-level differences in mean scores across time periods using a mixed ANOVA design. The analysis focuses on four domains (Preferences, Personality, Values, Life) across two time periods (Past, Future) with a between-subjects factor (TEMPORAL_DO). 
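As intuition for the Hartley's F-max screening used in the assumption checks below: the statistic is simply the largest cell variance divided by the smallest, so perfectly equal variances give 1 and values well above 1 flag heterogeneity. A toy computation on simulated groups:

```r
# Toy F-max ratio on made-up group data (illustrative only)
set.seed(42)
g1 <- rnorm(50, sd = 1.0)   # the two groups differ in spread by construction
g2 <- rnorm(50, sd = 1.5)

vars  <- c(var(g1), var(g2))
f_max <- max(vars) / min(vars)
f_max   # expected to be near (1.5 / 1.0)^2 = 2.25
```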
- -# Data Preparation and Setup - -```{r libraries} -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(ggplot2) # For plotting - -options(scipen = 999) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -``` - -```{r data-loading} -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) -``` - -```{r data-reshaping} -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Create clean dataset for analysis (fixing the reference issue) -long_data_clean <- long_data -``` - -# Descriptive Statistics - -## Overall Descriptive Statistics by TIME and DOMAIN - -```{r descriptive-stats} -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) -``` - -## Descriptive Statistics by Between-Subjects Factors - -```{r descriptive-stats-temporal} -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) -``` - -# Assumption Testing - -## Missing Values Check - -```{r missing-values} -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) -``` - -## Outlier Detection - -```{r outlier-detection} -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = 
sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) -``` - -## Anderson-Darling Normality Test - -```{r normality-test} -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) -``` - -## Homogeneity of Variance (Levene's Test) - -### Test homogeneity across TIME within each DOMAIN - -```{r homogeneity-time} -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) -``` - -### Test homogeneity across DOMAIN within each TIME - -```{r homogeneity-domain} -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) -``` - -## Hartley's F-Max Test with Bootstrap Critical Values - -```{r hartley-function} -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} -``` - -```{r hartley-results} -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS 
factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) -``` - -# Mixed ANOVA Analysis - -## Design Balance Check - -```{r design-check} -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} -``` - -## Mixed ANOVA with Sphericity Corrections - -```{r mixed-anova} -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) -``` - -## Mauchly's Test for Sphericity - -```{r mauchly-test} -print(mixed_anova_model$Mauchly) -``` - -## Sphericity-Corrected Results - -```{r sphericity-corrections} -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, 
anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} -``` - -# Effect Sizes (Cohen's d) - -## Main Effects - -```{r cohens-d-main} -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} 
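# Added sketch (not in the original script): partial eta-squared from the
# detailed ezANOVA table, as a variance-explained complement to Cohen's d.
# Assumes mixed_anova_model was fit earlier with detailed = TRUE, so the
# SSn (effect) and SSd (error) sums of squares are available.
anova_es <- mixed_anova_model$ANOVA
anova_es$partial_eta_sq <- anova_es$SSn / (anova_es$SSn + anova_es$SSd)
print(anova_es[, c("Effect", "SSn", "SSd", "partial_eta_sq")])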
-``` - -```{r cohens-d-domain} -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} -``` - -## Two-Way Interactions - -```{r cohens-d-function} -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} -``` - -```{r interaction-effects} -# Note: These sections would need the actual simple effects results from your analysis -# The original script references undefined variables: temporal_time_simple and time_domain_simple -# These would 
need to be calculated using emmeans for simple effects - -# 1. TEMPORAL_DO × TIME INTERACTION -# temporal_time_simple <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -# temporal_time_simple_df <- as.data.frame(pairs(temporal_time_simple, adjust = "bonferroni")) -# calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. TIME × DOMAIN INTERACTION -# time_domain_simple <- emmeans(aov_model, ~ DOMAIN | TIME) -# time_domain_simple_df <- as.data.frame(pairs(time_domain_simple, adjust = "bonferroni")) -# calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") -``` - diff --git a/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193554.rmd b/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193554.rmd deleted file mode 100644 index b5c49d3..0000000 --- a/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193554.rmd +++ /dev/null @@ -1,660 +0,0 @@ ---- -title: "Mixed ANOVA Analysis for Domain Means" -author: "Irina" -date: "`r Sys.Date()`" -output: - html_document: - toc: true - toc_float: true - code_folding: hide - theme: flatly - highlight: tango - fig_width: 10 - fig_height: 6 ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE) -``` - -# Introduction - -This analysis examines domain-level differences in mean scores across time periods using a mixed ANOVA design. The analysis focuses on four domains (Preferences, Personality, Values, Life) across two time periods (Past, Future) with a between-subjects factor (TEMPORAL_DO). - -# Data Preparation and Setup - -```{r libraries} -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(ggplot2) # For plotting - -options(scipen = 999) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -``` - -```{r data-loading} -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) -``` - -```{r data-reshaping} -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Create clean dataset for analysis (fixing the reference issue) -long_data_clean <- long_data -``` - -# 
-
-# Descriptive Statistics
-
-## Overall Descriptive Statistics by TIME and DOMAIN
-
-```{r descriptive-stats}
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5),
-    min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print(desc_stats)
-```
-
-## Descriptive Statistics by the Between-Subjects Factor
-
-```{r descriptive-stats-temporal}
-desc_stats_by_temporal <- long_data %>%
-  group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print(desc_stats_by_temporal)
-```
-
-# Assumption Testing
-
-## Missing Values Check
-
-```{r missing-values}
-# long_data already excludes missing MEAN_DIFFERENCE values, so every count
-# here should be zero; this is retained as a sanity check.
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(MEAN_DIFFERENCE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print(missing_summary)
-```
-
-## Outlier Detection
-
-```{r outlier-detection}
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    median = median(MEAN_DIFFERENCE),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print(outlier_summary)
-```
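-
-The summary above only counts outliers per cell. A small follow-up sketch (same 1.5 × IQR rule, not part of the original pipeline) lists the flagged observations themselves:
-
-```{r outlier-rows}
-# List the individual observations outside the per-cell 1.5 * IQR fences
-outlier_rows <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  mutate(
-    lower_bound = quantile(MEAN_DIFFERENCE, 0.25) - 1.5 * IQR(MEAN_DIFFERENCE),
-    upper_bound = quantile(MEAN_DIFFERENCE, 0.75) + 1.5 * IQR(MEAN_DIFFERENCE)
-  ) %>%
-  ungroup() %>%
-  filter(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound)
-
-print(outlier_rows)
-```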
-
-## Anderson-Darling Normality Test
-
-```{r normality-test}
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic,
-    ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value,
-    .groups = 'drop'
-  )
-
-print(normality_results)
-```
-
-## Homogeneity of Variance (Levene's Test)
-
-### Test homogeneity across TIME within each DOMAIN
-
-```{r homogeneity-time}
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print(homogeneity_time)
-```
-
-### Test homogeneity across DOMAIN within each TIME
-
-```{r homogeneity-domain}
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print(homogeneity_domain)
-```
-
-## Hartley's F-Max Test with Bootstrap Critical Values
-
-```{r hartley-function}
-# Hartley's F-max ratio: largest group variance over smallest
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# Bootstrap critical values for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get the unique groups
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!rlang::sym(group_var)) %>%
-    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Guard against zero or missing variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Extract each group's response values once, as plain vectors
-  # (double-bracket indexing so this also works when 'data' is a tibble,
-  # as it is when called via pick() below)
-  group_data_list <- map(groups, ~ {
-    group_data <- data[[response_var]][data[[group_var]] == .x]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap: resample within each group independently
-  for(i in seq_len(n_iter)) {
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Drop non-finite ratios (e.g., from zero resampled variances)
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Critical value: 95th percentile of the bootstrap distribution
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only the essential information
-  list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  )
-}
-```
-
-```{r hartley-results}
-# For the mixed ANOVA: test homogeneity across the between-subjects factor (TEMPORAL_DO)
-# within each combination of the within-subjects factors (TIME × DOMAIN)
-
-print(unique(long_data_clean$TEMPORAL_DO))
-print(table(long_data_clean$TEMPORAL_DO))
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Variances for each TEMPORAL_DO level within this TIME × DOMAIN cell
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# Bootstrap Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN cell
-set.seed(123)  # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-```
-
-# Mixed ANOVA Analysis
-
-## Design Balance Check
-
-```{r design-check}
-# Number of complete rows in the long data
-complete_cases <- sum(complete.cases(long_data_clean))
-print(complete_cases)
-
-# Check for duplicated observations; note this verifies at most one row per
-# participant per TIME × DOMAIN cell, not that every participant has all cells
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance %in% c(0, 1))) {
-  print("No duplicated observations: each participant has at most one row per TIME × DOMAIN cell")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-```
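-
-Type-3 ezANOVA additionally requires every participant to contribute all eight TIME × DOMAIN observations. If the check above reports an unbalanced design, one option (a sketch, not part of the original pipeline) is to drop incomplete participants before fitting:
-
-```{r enforce-balance, eval=FALSE}
-# Keep only participants with all 8 TIME × DOMAIN observations
-complete_ids <- long_data_clean %>%
-  count(pID) %>%
-  filter(n == 8) %>%
-  pull(pID)
-
-long_data_clean <- long_data_clean %>%
-  filter(pID %in% complete_ids) %>%
-  droplevels()
-```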
-
-## Mixed ANOVA with Sphericity Corrections
-
-```{r mixed-anova}
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = MEAN_DIFFERENCE,
-                             wid = pID,
-                             between = TEMPORAL_DO,
-                             within = .(TIME, DOMAIN),
-                             type = 3,
-                             detailed = TRUE)
-
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL  # Reset row numbers to be sequential
-print(anova_output)
-```
-
-## Mauchly's Test for Sphericity
-
-```{r mauchly-test}
-print(mixed_anova_model$Mauchly)
-```
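-
-As a reading aid, effects whose Mauchly p-value falls below .05 are the ones whose F-tests should be taken from the corrected rows in the next section. A minimal filter (assuming the standard ezANOVA Mauchly table with columns Effect, W, and p):
-
-```{r mauchly-flag}
-mauchly_df <- mixed_anova_model$Mauchly
-if(!is.null(mauchly_df)) {
-  # Effects violating sphericity (p < .05) require GG/HF-corrected tests
-  print(mauchly_df[mauchly_df$p < 0.05, ])
-}
-```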
-
-## Sphericity-Corrected Results
-
-```{r sphericity-corrections}
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  effect_rows <- match(sphericity_corr$Effect, anova_table$Effect)
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[effect_rows],
-    Original_DFd = anova_table$DFd[effect_rows],
-    GG_DFn = anova_table$DFn[effect_rows] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[effect_rows] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[effect_rows] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[effect_rows] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  # Between-subjects effects (no sphericity corrections needed)
-  between_effects <- c("TEMPORAL_DO")
-  for(effect in between_effects) {
-    if(effect %in% anova_table$Effect) {
-      f_value <- anova_table$F[anova_table$Effect == effect]
-      dfn <- anova_table$DFn[anova_table$Effect == effect]
-      dfd <- anova_table$DFd[anova_table$Effect == effect]
-      p_value <- anova_table$p[anova_table$Effect == effect]
-
-      cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value))
-    }
-  }
-
-  # Within-subjects effects (sphericity corrections where applicable)
-
-  # TIME main effect (2 levels, sphericity automatically satisfied)
-  if("TIME" %in% anova_table$Effect) {
-    f_value <- anova_table$F[anova_table$Effect == "TIME"]
-    dfn <- anova_table$DFn[anova_table$Effect == "TIME"]
-    dfd <- anova_table$DFd[anova_table$Effect == "TIME"]
-    p_value <- anova_table$p[anova_table$Effect == "TIME"]
-    cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value))
-  }
-
-  # DOMAIN main effect (4 levels, so corrections apply; uncorrected test shown here,
-  # corrected versions appear in the loop below)
-  if("DOMAIN" %in% anova_table$Effect) {
-    f_value <- anova_table$F[anova_table$Effect == "DOMAIN"]
-    dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"]
-    dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"]
-    p_value <- anova_table$p[anova_table$Effect == "DOMAIN"]
-    cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value))
-  }
-
-  # All effects in the corrections table, with GG- and HF-adjusted dfs and p-values
-  for(i in seq_len(nrow(corrected_df))) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original:     F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  cat("\nNote: Sphericity corrections not needed (sphericity assumption met)\n")
-}
-```
-
-# Effect Sizes (Cohen's d)
-
-## Main Effects
-
-```{r cohens-d-main}
-# Fit an aov model with the same design so emmeans can compute marginal means
-aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                 data = long_data_clean)
-
-# Main effect of TIME
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print(time_emmeans)
-
-time_main_contrast <- pairs(time_emmeans, adjust = "bonferroni")
-time_main_df <- as.data.frame(time_main_contrast)
-print(time_main_df)
-
-# Calculate Cohen's d for the TIME main effect
-if(nrow(time_main_df) > 0) {
-  cat("\nCohen's d for TIME main effect:\n")
-  time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"]
-  time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"]
-
-  time_cohens_d <- cohen.d(time_past_data, time_future_data)
-  cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data)))
-  cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate))
-  cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude))
-  cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1]))
-}
-```
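-
-Note that `cohen.d()` on the pooled vectors treats the two TIME levels as independent samples, although TIME is a within-subjects factor. A paired variant (a sketch, assuming every participant contributes both TIME levels) would first average within participant:
-
-```{r cohens-d-time-paired, eval=FALSE}
-# Average within participant, then compute a paired Cohen's d
-paired_wide <- long_data_clean %>%
-  group_by(pID, TIME) %>%
-  summarise(m = mean(MEAN_DIFFERENCE), .groups = "drop") %>%
-  pivot_wider(names_from = TIME, values_from = m)
-
-cohen.d(paired_wide$Past, paired_wide$Future, paired = TRUE)
-```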
-
-```{r cohens-d-domain}
-# Main effect of DOMAIN (significant: p < 0.001)
-domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
-print(domain_emmeans)
-domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni")
-domain_main_df <- as.data.frame(domain_main_contrast)
-print(domain_main_df)
-
-# Calculate Cohen's d for significant DOMAIN contrasts
-significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ]
-if(nrow(significant_domain) > 0) {
-  cat("\nCohen's d for significant DOMAIN contrasts:\n")
-  for(i in seq_len(nrow(significant_domain))) {
-    contrast_name <- as.character(significant_domain$contrast[i])
-    contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-    if(length(contrast_parts) == 2) {
-      level1 <- trimws(contrast_parts[1])
-      level2 <- trimws(contrast_parts[2])
-
-      data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1]
-      data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2]
-
-      if(length(data1) > 0 && length(data2) > 0) {
-        domain_cohens_d <- cohen.d(data1, data2)
-        cat(sprintf("Comparison: %s\n", contrast_name))
-        cat(sprintf("  n1 = %d, n2 = %d\n", length(data1), length(data2)))
-        cat(sprintf("  Cohen's d: %.5f\n", domain_cohens_d$estimate))
-        cat(sprintf("  Effect size interpretation: %s\n", domain_cohens_d$magnitude))
-        cat(sprintf("  p-value: %.5f\n", significant_domain$p.value[i]))
-        cat("\n")
-      }
-    }
-  }
-}
-```
-
-## Two-Way Interactions
-
-```{r cohens-d-function}
-# Function to calculate Cohen's d for significant pairwise comparisons
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if(nrow(significant_pairs) > 0) {
-    cat("Significant pairwise comparisons (p < 0.05):\n")
-    print(significant_pairs)
-
-    cat("\nCohen's d calculated from raw data:\n")
-
-    for(i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- as.character(comparison$contrast)
-
-      # Parse the contrast into its two levels
-      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-      if(length(contrast_parts) == 2) {
-        level1 <- trimws(contrast_parts[1])
-        level2 <- trimws(contrast_parts[2])
-
-        # Get raw data for both conditions, conditioning on the second factor if present
-        if(group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][
-            data[[group1_var]] == level1 &
-            data[[group2_var]] == group2_level]
-
-          data2 <- data[[response_var]][
-            data[[group1_var]] == level2 &
-            data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if(length(data1) > 0 && length(data2) > 0) {
-          # Calculate Cohen's d using the effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          cat(sprintf("Comparison: %s", contrast_name))
-          if(group2_var %in% colnames(comparison)) {
-            cat(sprintf(" | %s", group2_level))
-          }
-          cat(sprintf("\n  n1 = %d, n2 = %d\n", length(data1), length(data2)))
-          cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-          cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-          cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-          cat("\n")
-        }
-      }
-    }
-  } else {
-    cat("No significant pairwise comparisons found.\n")
-  }
-}
-```
-
-```{r interaction-effects}
-# Simple effects for the two-way interactions, computed with emmeans
-# (earlier drafts referenced temporal_time_simple and time_domain_simple without defining them)
-
-# 1. TEMPORAL_DO × TIME interaction
-temporal_time_simple <- emmeans(aov_model, ~ TIME | TEMPORAL_DO)
-temporal_time_simple_df <- as.data.frame(pairs(temporal_time_simple, adjust = "bonferroni"))
-calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
-# 2. TIME × DOMAIN interaction
-time_domain_simple <- emmeans(aov_model, ~ DOMAIN | TIME)
-time_domain_simple_df <- as.data.frame(pairs(time_domain_simple, adjust = "bonferroni"))
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE")
-```
-
-# Interaction Plot
-
-```{r interaction-plot}
-# Colour palette for DOMAIN (4 levels used; the fifth colour is spare)
-cbp1 <- c("#648FFF", "#DC267F", "#FFB000", "#FE6100", "#785EF0")
-
-# TIME level order (Past, Future)
-time_levels <- c("Past", "Future")
-
-# Estimated marginal means for DOMAIN × TIME
-emm_full <- emmeans(aov_model, ~ DOMAIN * TIME)
-
-# Prepare the emmeans data frame
-emmeans_data2 <- emm_full %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")),
-    TIME = factor(TIME, levels = time_levels)
-  )
-
-iPlot <- long_data_clean %>%
-  dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>%
-  mutate(
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")),
-    TIME = factor(TIME, levels = time_levels)
-  )
-
-# Plot without a TEMPORAL_DO facet: raw points, CI boxes, and marginal means
-interaction_plot2 <- ggplot() +
-  geom_point(
-    data = iPlot,
-    aes(x = TIME, y = MEAN_DIFFERENCE, color = DOMAIN),
-    position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.2),
-    alpha = 0.3, shape = 16
-  ) +
-  geom_rect(
-    data = emmeans_data2,
-    aes(
-      xmin = as.numeric(TIME) - 0.08 + (as.numeric(DOMAIN) - 2.5) * 0.15,
-      xmax = as.numeric(TIME) + 0.08 + (as.numeric(DOMAIN) - 2.5) * 0.15,
-      ymin = ci_lower, ymax = ci_upper,
-      fill = DOMAIN
-    ),
-    color = "black", alpha = 0.5
-  ) +
-  geom_segment(
-    data = emmeans_data2,
-    aes(
-      x = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.15,
-      xend = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.15,
-      y = ci_lower, yend = ci_upper
-    ),
-    color = "black"
-  ) +
-  geom_point(
-    data = emmeans_data2,
-    aes(
-      x = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.15,
-      y = plot_mean,
-      color = DOMAIN,
-      shape = DOMAIN
-    ),
-    size = 2.5, stroke = 0.8, fill = "black"
-  ) +
-  labs(
-    x = "TIME", y = "Mean Difference",
-    title = "DOMAIN × TIME Interaction"
-  ) +
-  scale_color_manual(name = "DOMAIN", values = cbp1) +
-  scale_fill_manual(name = "DOMAIN", values = cbp1) +
-  scale_shape_manual(name = "DOMAIN", values = c(21, 22, 23, 24)) +
-  theme_minimal() +
-  theme(
-    axis.text.x = element_text(angle = 0, hjust = 0.5),
-    plot.title = element_text(size = 14, hjust = 0.5)
-  )
-
-print(interaction_plot2)
-```
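-
-To export the figure alongside the rendered HTML (a usage sketch; the file name is illustrative):
-
-```{r save-plot, eval=FALSE}
-ggsave("interaction_plot_domain_time.png", interaction_plot2,
-       width = 10, height = 6, dpi = 300)
-```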
-
diff --git a/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193559.rmd b/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193559.rmd
deleted file mode 100644
index b5c49d3..0000000
--- a/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193559.rmd
+++ /dev/null
diff --git a/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193613.rmd b/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193613.rmd
deleted file mode 100644
index b5c49d3..0000000
--- a/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193613.rmd
+++ /dev/null
means_20251004193613.rmd +++ /dev/null @@ -1,660 +0,0 @@ ---- -title: "Mixed ANOVA Analysis for Domain Means" -author: "Irina" -date: "`r Sys.Date()`" -output: - html_document: - toc: true - toc_float: true - code_folding: hide - theme: flatly - highlight: tango - fig_width: 10 - fig_height: 6 ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE) -``` - -# Introduction - -This analysis examines domain-level differences in mean scores across time periods using a mixed ANOVA design. The analysis focuses on four domains (Preferences, Personality, Values, Life) across two time periods (Past, Future) with a between-subjects factor (TEMPORAL_DO). - -# Data Preparation and Setup - -```{r libraries} -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(ggplot2) # For plotting - -options(scipen = 999) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -``` - -```{r data-loading} -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) -``` - -```{r data-reshaping} -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Create clean dataset for analysis (fixing the reference issue) -long_data_clean <- long_data -``` - -# Descriptive Statistics - -## Overall Descriptive Statistics by TIME and DOMAIN - -```{r descriptive-stats} -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) -``` - -## Descriptive Statistics by Between-Subjects Factors - -```{r descriptive-stats-temporal} -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = 
round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) -``` - -# Assumption Testing - -## Missing Values Check - -```{r missing-values} -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) -``` - -## Outlier Detection - -```{r outlier-detection} -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) -``` - -## Anderson-Darling Normality Test - -```{r normality-test} -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) -``` - -## Homogeneity of Variance (Levene's Test) - -### Test homogeneity across TIME within each DOMAIN - -```{r homogeneity-time} -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) -``` - -### Test homogeneity across DOMAIN within each TIME - -```{r homogeneity-domain} -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) -``` - -## Hartley's F-Max Test with Bootstrap Critical Values - -```{r hartley-function} -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, 
size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} -``` - -```{r hartley-results} -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) -``` - -# Mixed ANOVA Analysis - -## Design Balance Check - -```{r design-check} -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} -``` - -## Mixed ANOVA with Sphericity Corrections - -```{r mixed-anova} -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) -``` - -## Mauchly's Test for Sphericity - -```{r mauchly-test} -print(mixed_anova_model$Mauchly) -``` - -## Sphericity-Corrected Results - -```{r sphericity-corrections} -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - 
print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} -``` - -# Effect Sizes (Cohen's d) - -## Main Effects - -```{r cohens-d-main} -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") -time_main_df <- 
as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} -``` - -```{r cohens-d-domain} -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} -``` - -## Two-Way Interactions - -```{r cohens-d-function} -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: 
%s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} -``` - -```{r interaction-effects} -# Note: These sections would need the actual simple effects results from your analysis -# The original script references undefined variables: temporal_time_simple and time_domain_simple -# These would need to be calculated using emmeans for simple effects - -# 1. TEMPORAL_DO × TIME INTERACTION -# temporal_time_simple <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -# temporal_time_simple_df <- as.data.frame(pairs(temporal_time_simple, adjust = "bonferroni")) -# calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. TIME × DOMAIN INTERACTION -# time_domain_simple <- emmeans(aov_model, ~ DOMAIN | TIME) -# time_domain_simple_df <- as.data.frame(pairs(time_domain_simple, adjust = "bonferroni")) -# calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") -``` - -# Interaction Plot - -```{r interaction-plot} -# Define color palette for DOMAIN (4 levels) -cbp1 <- c("#648FFF", "#DC267F", "#FFB000", "#FE6100", "#785EF0") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - -# Create estimated marginal means for DOMAIN x TIME -emm_full <- emmeans(aov_model, ~ DOMAIN * TIME) - -# Prepare emmeans data frame -emmeans_data2 <- emm_full %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -iPlot <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Plot without TEMPORAL_DO facet -interaction_plot2 <- ggplot() + - geom_point( - data = iPlot, - aes(x = TIME, y = MEAN_DIFFERENCE, color = DOMAIN), - position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.2), - alpha = 0.3, shape = 16 - ) + - geom_rect( - data = emmeans_data2, - aes( - xmin = as.numeric(TIME) - 0.08 + (as.numeric(DOMAIN) - 2.5) * 0.15, - xmax = as.numeric(TIME) + 0.08 + (as.numeric(DOMAIN) - 2.5) * 0.15, - ymin = ci_lower, ymax = ci_upper, - fill = DOMAIN - ), - color = "black", alpha = 0.5 - ) + - geom_segment( - data = emmeans_data2, - aes( - x = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.15, - xend = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.15, - y = ci_lower, yend = ci_upper - ), - color = "black" - ) + - geom_point( - data = emmeans_data2, - aes( - x = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.15, - y = plot_mean, - color = DOMAIN, - shape = DOMAIN - ), - size = 2.5, stroke = 0.8, fill = "black" - ) + - labs( - x = "TIME", y = "Mean Difference", - title = "DOMAIN × TIME Interaction", subtitle = "" - ) + - scale_color_manual(name = "DOMAIN", values = cbp1) + - scale_fill_manual(name = "DOMAIN", values = cbp1) + - 
scale_shape_manual(name = "DOMAIN", values = c(21, 22, 23, 24)) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 0, hjust = 0.5), - plot.title = element_text(size = 14, hjust = 0.5), - plot.subtitle = element_text(size = 12, hjust = 0.5) - ) - -print(interaction_plot2) -``` - diff --git a/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193658.rmd b/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193658.rmd deleted file mode 100644 index b5c49d3..0000000 --- a/.history/eohi1/RMD exp1 - mixed anova domain means_20251004193658.rmd +++ /dev/null @@ -1,660 +0,0 @@ ---- -title: "Mixed ANOVA Analysis for Domain Means" -author: "Irina" -date: "`r Sys.Date()`" -output: - html_document: - toc: true - toc_float: true - code_folding: hide - theme: flatly - highlight: tango - fig_width: 10 - fig_height: 6 ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE) -``` - -# Introduction - -This analysis examines domain-level differences in mean scores across time periods using a mixed ANOVA design. The analysis focuses on four domains (Preferences, Personality, Values, Life) across two time periods (Past, Future) with a between-subjects factor (TEMPORAL_DO). - -# Data Preparation and Setup - -```{r libraries} -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(ggplot2) # For plotting - -options(scipen = 999) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -``` - -```{r data-loading} -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) -``` - -```{r data-reshaping} -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Create clean dataset for analysis (fixing the reference issue) -long_data_clean <- long_data -``` - -# Descriptive Statistics - -## Overall Descriptive Statistics by TIME and DOMAIN - -```{r descriptive-stats} -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = 
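-    # (the remaining location and spread summaries are rounded to 5 decimal places, matching mean/variance/sd above)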
round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) -``` - -## Descriptive Statistics by Between-Subjects Factors - -```{r descriptive-stats-temporal} -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) -``` - -# Assumption Testing - -## Missing Values Check - -```{r missing-values} -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) -``` - -## Outlier Detection - -```{r outlier-detection} -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) -``` - -## Anderson-Darling Normality Test - -```{r normality-test} -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) -``` - -## Homogeneity of Variance (Levene's Test) - -### Test homogeneity across TIME within each DOMAIN - -```{r homogeneity-time} -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) -``` - -### Test homogeneity across DOMAIN within each TIME - -```{r homogeneity-domain} -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) -``` - -## Hartley's F-Max Test with Bootstrap Critical Values - -```{r hartley-function} -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } 
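-  # Flooring non-positive or missing variances at 1e-10 keeps the max/min ratio finite instead of erroring or returning Inf.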
- - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} -``` - -```{r hartley-results} -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) -``` - -# Mixed ANOVA Analysis - -## Design Balance Check - -```{r design-check} -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} -``` - -## Mixed ANOVA with Sphericity Corrections - -```{r mixed-anova} -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = 
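-                             # Type-3 sums of squares are only meaningful with sum-to-zero contrasts, which the setup chunk sets via options(contrasts = c("contr.sum", "contr.poly")).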
MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) -``` - -## Mauchly's Test for Sphericity - -```{r mauchly-test} -print(mixed_anova_model$Mauchly) -``` - -## Sphericity-Corrected Results - -```{r sphericity-corrections} -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], 
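-                # Note: the F statistic itself is unchanged by the epsilon corrections; only the degrees of freedom and p-values are adjusted.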
corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} -``` - -# Effect Sizes (Cohen's d) - -## Main Effects - -```{r cohens-d-main} -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} -``` - -```{r cohens-d-domain} -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} -``` - -## Two-Way Interactions - -```{r cohens-d-function} -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - 
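-        # When the contrast table carries a conditioning column (simple effects via emmeans' "|" syntax), subset the raw data on both factors; otherwise fall back to the single grouping factor in the else branch.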
group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} -``` - -```{r interaction-effects} -# Note: These sections would need the actual simple effects results from your analysis -# The original script references undefined variables: temporal_time_simple and time_domain_simple -# These would need to be calculated using emmeans for simple effects - -# 1. TEMPORAL_DO × TIME INTERACTION -# temporal_time_simple <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -# temporal_time_simple_df <- as.data.frame(pairs(temporal_time_simple, adjust = "bonferroni")) -# calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. TIME × DOMAIN INTERACTION -# time_domain_simple <- emmeans(aov_model, ~ DOMAIN | TIME) -# time_domain_simple_df <- as.data.frame(pairs(time_domain_simple, adjust = "bonferroni")) -# calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") -``` - -# Interaction Plot - -```{r interaction-plot} -# Define color palette for DOMAIN (4 levels) -cbp1 <- c("#648FFF", "#DC267F", "#FFB000", "#FE6100", "#785EF0") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - -# Create estimated marginal means for DOMAIN x TIME -emm_full <- emmeans(aov_model, ~ DOMAIN * TIME) - -# Prepare emmeans data frame -emmeans_data2 <- emm_full %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -iPlot <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Plot without TEMPORAL_DO facet -interaction_plot2 <- ggplot() + - geom_point( - data = iPlot, - aes(x = TIME, y = MEAN_DIFFERENCE, color = DOMAIN), - position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.2), - alpha = 0.3, shape = 16 - ) + - geom_rect( - data = emmeans_data2, - aes( - xmin = as.numeric(TIME) - 0.08 + (as.numeric(DOMAIN) - 2.5) * 0.15, - xmax = as.numeric(TIME) + 0.08 + (as.numeric(DOMAIN) - 2.5) * 0.15, - ymin = ci_lower, ymax = ci_upper, - fill = DOMAIN - ), - color = "black", alpha = 0.5 - ) + - geom_segment( - data = emmeans_data2, - aes( - x = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 
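-      # The (as.numeric(DOMAIN) - 2.5) * 0.15 term fans the four DOMAIN levels out around each TIME tick; 2.5 is the mean of levels 1:4, which centres the dodge.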
0.15, - xend = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.15, - y = ci_lower, yend = ci_upper - ), - color = "black" - ) + - geom_point( - data = emmeans_data2, - aes( - x = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.15, - y = plot_mean, - color = DOMAIN, - shape = DOMAIN - ), - size = 2.5, stroke = 0.8, fill = "black" - ) + - labs( - x = "TIME", y = "Mean Difference", - title = "DOMAIN × TIME Interaction", subtitle = "" - ) + - scale_color_manual(name = "DOMAIN", values = cbp1) + - scale_fill_manual(name = "DOMAIN", values = cbp1) + - scale_shape_manual(name = "DOMAIN", values = c(21, 22, 23, 24)) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 0, hjust = 0.5), - plot.title = element_text(size = 14, hjust = 0.5), - plot.subtitle = element_text(size = 12, hjust = 0.5) - ) - -print(interaction_plot2) -``` - diff --git a/.history/eohi1/Untitled-1_20251020173338.r b/.history/eohi1/Untitled-1_20251020173338.r deleted file mode 100644 index e69de29..0000000 diff --git a/.history/eohi1/Untitled-1_20251020173353.r b/.history/eohi1/Untitled-1_20251020173353.r deleted file mode 100644 index 8b82aee..0000000 --- a/.history/eohi1/Untitled-1_20251020173353.r +++ /dev/null @@ -1,15 +0,0 @@ -options(scipen = 999) - -library(dplyr) -library(car) -library(lmtest) -library(stargazer) -library(sandwich) -library(lmtest) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -df <- read.csv("ehi1.csv") - -data <- df %>% - select(eohiDGEN_mean, ehi_global_mean, demo_edu) \ No newline at end of file diff --git a/.history/eohi1/assumption_checks_before_cronbach_20250917154857.r b/.history/eohi1/assumption_checks_before_cronbach_20250917154857.r deleted file mode 100644 index 5dde602..0000000 --- a/.history/eohi1/assumption_checks_before_cronbach_20250917154857.r +++ /dev/null @@ -1,162 +0,0 @@ -# Assumption Checks Before Cronbach's Alpha Analysis -# Run this BEFORE the main reliability analysis - -library(psych) -library(corrplot) -library(ggplot2) - -# Read the data -data <- read.csv("exp1.csv") - -# Define scale variables -past_pref_vars <- c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", - "NPastDiff_pref_nap", "NPastDiff_pref_travel") - -past_pers_vars <- c("NPastDiff_pers_extravert", "NPastDiff_pers_critical", "NPastDiff_pers_dependable", - "NPastDiff_pers_anxious", "NPastDiff_pers_complex") - -past_val_vars <- c("NPastDiff_val_obey", "NPastDiff_val_trad", "NPastDiff_val_opinion", - "NPastDiff_val_performance", "NPastDiff_val_justice") - -past_life_vars <- c("NPastDiff_life_ideal", "NPastDiff_life_excellent", "NPastDiff_life_satisfied", - "NPastDiff_life_important", "NPastDiff_life_change") - -# Function to check assumptions for a scale -check_assumptions <- function(data, var_names, scale_name) { - cat("\n", "="*60, "\n") - cat("ASSUMPTION CHECKS FOR:", scale_name, "\n") - cat("="*60, "\n") - - # Get scale data - scale_data <- data[, var_names] - - # 1. Sample size check - complete_cases <- sum(complete.cases(scale_data)) - cat("1. SAMPLE SIZE CHECK:\n") - cat(" Total participants:", nrow(data), "\n") - cat(" Complete cases:", complete_cases, "\n") - cat(" Adequate (≥30)?", ifelse(complete_cases >= 30, "✓ YES", "✗ NO"), "\n") - - if(complete_cases < 30) { - cat(" WARNING: Sample size too small for reliable alpha estimates\n") - return(FALSE) - } - - # 2. Missing data check - cat("\n2. 
MISSING DATA CHECK:\n") - missing_counts <- colSums(is.na(scale_data)) - missing_pct <- round(missing_counts / nrow(data) * 100, 2) - cat(" Missing data by item:\n") - for(i in 1:length(var_names)) { - cat(" ", var_names[i], ":", missing_counts[i], "(", missing_pct[i], "%)\n") - } - - max_missing <- max(missing_pct) - cat(" Maximum missing:", max_missing, "%\n") - cat(" Acceptable (<20%)?", ifelse(max_missing < 20, "✓ YES", "✗ NO"), "\n") - - # 3. Use only complete cases for remaining checks - complete_data <- scale_data[complete.cases(scale_data), ] - - # 4. Normality check (Shapiro-Wilk test on first item as example) - cat("\n3. NORMALITY CHECK (Shapiro-Wilk test on first item):\n") - if(nrow(complete_data) <= 5000) { # Shapiro-Wilk has sample size limit - shapiro_result <- shapiro.test(complete_data[, 1]) - cat(" p-value:", round(shapiro_result$p.value, 4), "\n") - cat(" Normal?", ifelse(shapiro_result$p.value > 0.05, "✓ YES", "✗ NO (but alpha is robust)"), "\n") - } else { - cat(" Sample too large for Shapiro-Wilk test (alpha is robust to non-normality)\n") - } - - # 5. Inter-item correlations check - cat("\n4. INTER-ITEM CORRELATIONS CHECK:\n") - cor_matrix <- cor(complete_data) - - # Get off-diagonal correlations - cor_matrix[lower.tri(cor_matrix)] <- NA - diag(cor_matrix) <- NA - cors <- as.vector(cor_matrix) - cors <- cors[!is.na(cors)] - - positive_cors <- sum(cors > 0) - strong_cors <- sum(cors > 0.30) - negative_cors <- sum(cors < 0) - - cat(" Total correlations:", length(cors), "\n") - cat(" Positive correlations:", positive_cors, "\n") - cat(" Strong correlations (>0.30):", strong_cors, "\n") - cat(" Negative correlations:", negative_cors, "\n") - cat(" Mean correlation:", round(mean(cors), 4), "\n") - cat(" Range:", round(min(cors), 4), "to", round(max(cors), 4), "\n") - - if(negative_cors > 0) { - cat(" ⚠️ WARNING: Negative correlations suggest potential issues\n") - } - if(strong_cors / length(cors) < 0.5) { - cat(" ⚠️ WARNING: Many weak correlations may indicate poor scale coherence\n") - } - - # 6. Item variance check - cat("\n5. ITEM VARIANCE CHECK:\n") - item_vars <- apply(complete_data, 2, var) - var_ratio <- max(item_vars) / min(item_vars) - cat(" Item variances:", round(item_vars, 4), "\n") - cat(" Variance ratio (max/min):", round(var_ratio, 4), "\n") - cat(" Acceptable (<4:1)?", ifelse(var_ratio < 4, "✓ YES", "✗ NO"), "\n") - - # 7. Outlier check - cat("\n6. OUTLIER CHECK:\n") - # Check for multivariate outliers using Mahalanobis distance - if(nrow(complete_data) > ncol(complete_data)) { - mahal_dist <- mahalanobis(complete_data, colMeans(complete_data), cov(complete_data)) - outlier_threshold <- qchisq(0.999, df = ncol(complete_data)) - outliers <- sum(mahal_dist > outlier_threshold) - cat(" Multivariate outliers (p<0.001):", outliers, "\n") - cat(" Acceptable (<5%)?", ifelse(outliers/nrow(complete_data) < 0.05, "✓ YES", "✗ NO"), "\n") - } - - # 8. Summary recommendation - cat("\n7. 
OVERALL RECOMMENDATION:\n") - issues <- 0 - if(complete_cases < 30) issues <- issues + 1 - if(max_missing >= 20) issues <- issues + 1 - if(negative_cors > 0) issues <- issues + 1 - if(var_ratio >= 4) issues <- issues + 1 - - if(issues == 0) { - cat(" ✓ PROCEED with Cronbach's alpha analysis\n") - } else if(issues <= 2) { - cat(" ⚠️ PROCEED with CAUTION - some assumptions violated\n") - } else { - cat(" ✗ CONSIDER alternatives or data cleaning before proceeding\n") - } - - return(TRUE) -} - -# Check assumptions for all past scales -cat("CRONBACH'S ALPHA ASSUMPTION CHECKS") -cat("\nData: exp1.csv") -cat("\nTotal sample size:", nrow(data)) - -check_assumptions(data, past_pref_vars, "Past Preferences") -check_assumptions(data, past_pers_vars, "Past Personality") -check_assumptions(data, past_val_vars, "Past Values") -check_assumptions(data, past_life_vars, "Past Life Satisfaction") - -# Quick check of future scales (you can expand this) -fut_pref_vars <- c("NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", - "NFutDiff_pref_nap", "NFutDiff_pref_travel") - -check_assumptions(data, fut_pref_vars, "Future Preferences") - -cat("\n", "="*60, "\n") -cat("GENERAL GUIDELINES:\n") -cat("="*60, "\n") -cat("✓ If most assumptions are met, Cronbach's alpha is appropriate\n") -cat("⚠️ If some assumptions are violated, interpret with caution\n") -cat("✗ If many assumptions are violated, consider alternative approaches:\n") -cat(" - Omega coefficient (more robust to violations)\n") -cat(" - Split-half reliability\n") -cat(" - Test-retest reliability\n") -cat(" - Factor analysis to check dimensionality\n") diff --git a/.history/eohi1/assumption_checks_before_cronbach_20250917154901.r b/.history/eohi1/assumption_checks_before_cronbach_20250917154901.r deleted file mode 100644 index 5dde602..0000000 --- a/.history/eohi1/assumption_checks_before_cronbach_20250917154901.r +++ /dev/null @@ -1,162 +0,0 @@ -# Assumption Checks Before Cronbach's Alpha Analysis -# Run this BEFORE the main reliability analysis - -library(psych) -library(corrplot) -library(ggplot2) - -# Read the data -data <- read.csv("exp1.csv") - -# Define scale variables -past_pref_vars <- c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", - "NPastDiff_pref_nap", "NPastDiff_pref_travel") - -past_pers_vars <- c("NPastDiff_pers_extravert", "NPastDiff_pers_critical", "NPastDiff_pers_dependable", - "NPastDiff_pers_anxious", "NPastDiff_pers_complex") - -past_val_vars <- c("NPastDiff_val_obey", "NPastDiff_val_trad", "NPastDiff_val_opinion", - "NPastDiff_val_performance", "NPastDiff_val_justice") - -past_life_vars <- c("NPastDiff_life_ideal", "NPastDiff_life_excellent", "NPastDiff_life_satisfied", - "NPastDiff_life_important", "NPastDiff_life_change") - -# Function to check assumptions for a scale -check_assumptions <- function(data, var_names, scale_name) { - cat("\n", "="*60, "\n") - cat("ASSUMPTION CHECKS FOR:", scale_name, "\n") - cat("="*60, "\n") - - # Get scale data - scale_data <- data[, var_names] - - # 1. Sample size check - complete_cases <- sum(complete.cases(scale_data)) - cat("1. SAMPLE SIZE CHECK:\n") - cat(" Total participants:", nrow(data), "\n") - cat(" Complete cases:", complete_cases, "\n") - cat(" Adequate (≥30)?", ifelse(complete_cases >= 30, "✓ YES", "✗ NO"), "\n") - - if(complete_cases < 30) { - cat(" WARNING: Sample size too small for reliable alpha estimates\n") - return(FALSE) - } - - # 2. Missing data check - cat("\n2. 
MISSING DATA CHECK:\n") - missing_counts <- colSums(is.na(scale_data)) - missing_pct <- round(missing_counts / nrow(data) * 100, 2) - cat(" Missing data by item:\n") - for(i in 1:length(var_names)) { - cat(" ", var_names[i], ":", missing_counts[i], "(", missing_pct[i], "%)\n") - } - - max_missing <- max(missing_pct) - cat(" Maximum missing:", max_missing, "%\n") - cat(" Acceptable (<20%)?", ifelse(max_missing < 20, "✓ YES", "✗ NO"), "\n") - - # 3. Use only complete cases for remaining checks - complete_data <- scale_data[complete.cases(scale_data), ] - - # 4. Normality check (Shapiro-Wilk test on first item as example) - cat("\n3. NORMALITY CHECK (Shapiro-Wilk test on first item):\n") - if(nrow(complete_data) <= 5000) { # Shapiro-Wilk has sample size limit - shapiro_result <- shapiro.test(complete_data[, 1]) - cat(" p-value:", round(shapiro_result$p.value, 4), "\n") - cat(" Normal?", ifelse(shapiro_result$p.value > 0.05, "✓ YES", "✗ NO (but alpha is robust)"), "\n") - } else { - cat(" Sample too large for Shapiro-Wilk test (alpha is robust to non-normality)\n") - } - - # 5. Inter-item correlations check - cat("\n4. INTER-ITEM CORRELATIONS CHECK:\n") - cor_matrix <- cor(complete_data) - - # Get off-diagonal correlations - cor_matrix[lower.tri(cor_matrix)] <- NA - diag(cor_matrix) <- NA - cors <- as.vector(cor_matrix) - cors <- cors[!is.na(cors)] - - positive_cors <- sum(cors > 0) - strong_cors <- sum(cors > 0.30) - negative_cors <- sum(cors < 0) - - cat(" Total correlations:", length(cors), "\n") - cat(" Positive correlations:", positive_cors, "\n") - cat(" Strong correlations (>0.30):", strong_cors, "\n") - cat(" Negative correlations:", negative_cors, "\n") - cat(" Mean correlation:", round(mean(cors), 4), "\n") - cat(" Range:", round(min(cors), 4), "to", round(max(cors), 4), "\n") - - if(negative_cors > 0) { - cat(" ⚠️ WARNING: Negative correlations suggest potential issues\n") - } - if(strong_cors / length(cors) < 0.5) { - cat(" ⚠️ WARNING: Many weak correlations may indicate poor scale coherence\n") - } - - # 6. Item variance check - cat("\n5. ITEM VARIANCE CHECK:\n") - item_vars <- apply(complete_data, 2, var) - var_ratio <- max(item_vars) / min(item_vars) - cat(" Item variances:", round(item_vars, 4), "\n") - cat(" Variance ratio (max/min):", round(var_ratio, 4), "\n") - cat(" Acceptable (<4:1)?", ifelse(var_ratio < 4, "✓ YES", "✗ NO"), "\n") - - # 7. Outlier check - cat("\n6. OUTLIER CHECK:\n") - # Check for multivariate outliers using Mahalanobis distance - if(nrow(complete_data) > ncol(complete_data)) { - mahal_dist <- mahalanobis(complete_data, colMeans(complete_data), cov(complete_data)) - outlier_threshold <- qchisq(0.999, df = ncol(complete_data)) - outliers <- sum(mahal_dist > outlier_threshold) - cat(" Multivariate outliers (p<0.001):", outliers, "\n") - cat(" Acceptable (<5%)?", ifelse(outliers/nrow(complete_data) < 0.05, "✓ YES", "✗ NO"), "\n") - } - - # 8. Summary recommendation - cat("\n7. 
OVERALL RECOMMENDATION:\n") - issues <- 0 - if(complete_cases < 30) issues <- issues + 1 - if(max_missing >= 20) issues <- issues + 1 - if(negative_cors > 0) issues <- issues + 1 - if(var_ratio >= 4) issues <- issues + 1 - - if(issues == 0) { - cat(" ✓ PROCEED with Cronbach's alpha analysis\n") - } else if(issues <= 2) { - cat(" ⚠️ PROCEED with CAUTION - some assumptions violated\n") - } else { - cat(" ✗ CONSIDER alternatives or data cleaning before proceeding\n") - } - - return(TRUE) -} - -# Check assumptions for all past scales -cat("CRONBACH'S ALPHA ASSUMPTION CHECKS") -cat("\nData: exp1.csv") -cat("\nTotal sample size:", nrow(data)) - -check_assumptions(data, past_pref_vars, "Past Preferences") -check_assumptions(data, past_pers_vars, "Past Personality") -check_assumptions(data, past_val_vars, "Past Values") -check_assumptions(data, past_life_vars, "Past Life Satisfaction") - -# Quick check of future scales (you can expand this) -fut_pref_vars <- c("NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", - "NFutDiff_pref_nap", "NFutDiff_pref_travel") - -check_assumptions(data, fut_pref_vars, "Future Preferences") - -cat("\n", "="*60, "\n") -cat("GENERAL GUIDELINES:\n") -cat("="*60, "\n") -cat("✓ If most assumptions are met, Cronbach's alpha is appropriate\n") -cat("⚠️ If some assumptions are violated, interpret with caution\n") -cat("✗ If many assumptions are violated, consider alternative approaches:\n") -cat(" - Omega coefficient (more robust to violations)\n") -cat(" - Split-half reliability\n") -cat(" - Test-retest reliability\n") -cat(" - Factor analysis to check dimensionality\n") diff --git a/.history/eohi1/assumption_checks_before_cronbach_20250918115459.r b/.history/eohi1/assumption_checks_before_cronbach_20250918115459.r deleted file mode 100644 index 3de74a7..0000000 --- a/.history/eohi1/assumption_checks_before_cronbach_20250918115459.r +++ /dev/null @@ -1,164 +0,0 @@ -# Assumption Checks Before Cronbach's Alpha Analysis -# Run this BEFORE the main reliability analysis - -library(psych) -library(corrplot) -library(ggplot2) - -# Read the data -data <- read.csv("exp1.csv") - -# Define scale variables -past_pref_vars <- c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", - "NPastDiff_pref_nap", "NPastDiff_pref_travel") - -past_pers_vars <- c("NPastDiff_pers_extravert", "NPastDiff_pers_critical", "NPastDiff_pers_dependable", - "NPastDiff_pers_anxious", "NPastDiff_pers_complex") - -past_val_vars <- c("NPastDiff_val_obey", "NPastDiff_val_trad", "NPastDiff_val_opinion", - "NPastDiff_val_performance", "NPastDiff_val_justice") - -past_life_vars <- c("NPastDiff_life_ideal", "NPastDiff_life_excellent", "NPastDiff_life_satisfied", - "NPastDiff_life_important", "NPastDiff_life_change") - -# Function to check assumptions for a scale -check_assumptions <- function(data, var_names, scale_name) { - cat("\n", "="*60, "\n") - cat("ASSUMPTION CHECKS FOR:", scale_name, "\n") - cat("="*60, "\n") - - # Get scale data - scale_data <- data[, var_names] - - # 1. Sample size check - complete_cases <- sum(complete.cases(scale_data)) - cat("1. SAMPLE SIZE CHECK:\n") - cat(" Total participants:", nrow(data), "\n") - cat(" Complete cases:", complete_cases, "\n") - cat(" Adequate (≥30)?", ifelse(complete_cases >= 30, "✓ YES", "✗ NO"), "\n") - - if(complete_cases < 30) { - cat(" WARNING: Sample size too small for reliable alpha estimates\n") - return(FALSE) - } - - # 2. Missing data check - cat("\n2. 
MISSING DATA CHECK:\n") - missing_counts <- colSums(is.na(scale_data)) - missing_pct <- round(missing_counts / nrow(data) * 100, 2) - cat(" Missing data by item:\n") - for(i in 1:length(var_names)) { - cat(" ", var_names[i], ":", missing_counts[i], "(", missing_pct[i], "%)\n") - } - - max_missing <- max(missing_pct) - cat(" Maximum missing:", max_missing, "%\n") - cat(" Acceptable (<20%)?", ifelse(max_missing < 20, "✓ YES", "✗ NO"), "\n") - - # 3. Use only complete cases for remaining checks - complete_data <- scale_data[complete.cases(scale_data), ] - - # 4. Normality check (Shapiro-Wilk test on first item as example) - cat("\n3. NORMALITY CHECK (Shapiro-Wilk test on first item):\n") - if(nrow(complete_data) <= 5000) { # Shapiro-Wilk has sample size limit - shapiro_result <- shapiro.test(complete_data[, 1]) - cat(" p-value:", round(shapiro_result$p.value, 4), "\n") - cat(" Normal?", ifelse(shapiro_result$p.value > 0.05, "✓ YES", "✗ NO (but alpha is robust)"), "\n") - } else { - cat(" Sample too large for Shapiro-Wilk test (alpha is robust to non-normality)\n") - } - - # 5. Inter-item correlations check - cat("\n4. INTER-ITEM CORRELATIONS CHECK:\n") - cor_matrix <- cor(complete_data) - - # Get off-diagonal correlations - cor_matrix[lower.tri(cor_matrix)] <- NA - diag(cor_matrix) <- NA - cors <- as.vector(cor_matrix) - cors <- cors[!is.na(cors)] - - positive_cors <- sum(cors > 0) - strong_cors <- sum(cors > 0.30) - negative_cors <- sum(cors < 0) - - cat(" Total correlations:", length(cors), "\n") - cat(" Positive correlations:", positive_cors, "\n") - cat(" Strong correlations (>0.30):", strong_cors, "\n") - cat(" Negative correlations:", negative_cors, "\n") - cat(" Mean correlation:", round(mean(cors), 4), "\n") - cat(" Range:", round(min(cors), 4), "to", round(max(cors), 4), "\n") - - if(negative_cors > 0) { - cat(" ⚠️ WARNING: Negative correlations suggest potential issues\n") - } - if(strong_cors / length(cors) < 0.5) { - cat(" ⚠️ WARNING: Many weak correlations may indicate poor scale coherence\n") - } - - # 6. Item variance check - cat("\n5. ITEM VARIANCE CHECK:\n") - item_vars <- apply(complete_data, 2, var) - var_ratio <- max(item_vars) / min(item_vars) - cat(" Item variances:", round(item_vars, 4), "\n") - cat(" Variance ratio (max/min):", round(var_ratio, 4), "\n") - cat(" Acceptable (<4:1)?", ifelse(var_ratio < 4, "✓ YES", "✗ NO"), "\n") - - # 7. Outlier check - cat("\n6. OUTLIER CHECK:\n") - # Check for multivariate outliers using Mahalanobis distance - if(nrow(complete_data) > ncol(complete_data)) { - mahal_dist <- mahalanobis(complete_data, colMeans(complete_data), cov(complete_data)) - outlier_threshold <- qchisq(0.999, df = ncol(complete_data)) - outliers <- sum(mahal_dist > outlier_threshold) - cat(" Multivariate outliers (p<0.001):", outliers, "\n") - cat(" Acceptable (<5%)?", ifelse(outliers/nrow(complete_data) < 0.05, "✓ YES", "✗ NO"), "\n") - } - - # 8. Summary recommendation - cat("\n7. 
OVERALL RECOMMENDATION:\n") - issues <- 0 - if(complete_cases < 30) issues <- issues + 1 - if(max_missing >= 20) issues <- issues + 1 - if(negative_cors > 0) issues <- issues + 1 - if(var_ratio >= 4) issues <- issues + 1 - - if(issues == 0) { - cat(" ✓ PROCEED with Cronbach's alpha analysis\n") - } else if(issues <= 2) { - cat(" ⚠️ PROCEED with CAUTION - some assumptions violated\n") - } else { - cat(" ✗ CONSIDER alternatives or data cleaning before proceeding\n") - } - - return(TRUE) -} - -# Check assumptions for all past scales -cat("CRONBACH'S ALPHA ASSUMPTION CHECKS") -cat("\nData: exp1.csv") -cat("\nTotal sample size:", nrow(data)) - -check_assumptions(data, past_pref_vars, "Past Preferences") -check_assumptions(data, past_pers_vars, "Past Personality") -check_assumptions(data, past_val_vars, "Past Values") -check_assumptions(data, past_life_vars, "Past Life Satisfaction") - -# Quick check of future scales (you can expand this) -fut_pref_vars <- c("NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", - "NFutDiff_pref_nap", "NFutDiff_pref_travel") - -check_assumptions(data, fut_pref_vars, "Future Preferences") - -cat("\n", "="*60, "\n") -cat("GENERAL GUIDELINES:\n") -cat("="*60, "\n") -cat("✓ If most assumptions are met, Cronbach's alpha is appropriate\n") -cat("⚠️ If some assumptions are violated, interpret with caution\n") -cat("✗ If many assumptions are violated, consider alternative approaches:\n") -cat(" - Omega coefficient (more robust to violations)\n") -cat(" - Split-half reliability\n") -cat(" - Test-retest reliability\n") -cat(" - Factor analysis to check dimensionality\n") - - diff --git a/.history/eohi1/brierVARS_20250922124859.vb b/.history/eohi1/brierVARS_20250922124859.vb deleted file mode 100644 index e69de29..0000000 diff --git a/.history/eohi1/brierVARS_20250922124900.vb b/.history/eohi1/brierVARS_20250922124900.vb deleted file mode 100644 index 6f430be..0000000 --- a/.history/eohi1/brierVARS_20250922124900.vb +++ /dev/null @@ -1,82 +0,0 @@ -Function GetCol(headerName As String, ws As Worksheet) As Long - Dim lastCol As Long - lastCol = ws.Cells(1, ws.Columns.Count).End(xlToLeft).Column - On Error Resume Next - GetCol = Application.Match(headerName, ws.Range(ws.Cells(1, 1), ws.Cells(1, lastCol)), 0) - If IsError(GetCol) Then GetCol = 0 - On Error GoTo 0 -End Function - -Sub brierVARS() - Dim false_vars As Variant - Dim true_vars As Variant - false_vars = Array("moza_55_F_1", "moza_55_CON", "demo_15_F_1", "demo_15_CON", "hume_35_F_1", "hume_35_CON", "gulf_15_F_1", "gulf_15_CON", "memo_75_F_1", "memo_75_CON", "vitc_55_F_1", "vitc_55_CON", "hert_35_F_1", "hert_35_CON", "gees_55_F_1", "gees_55_CON", "gang_15_F_1", "gang_15_CON", "list_75_F_1", "list_75_CON", "mont_35_F_1", "mont_35_CON", "dwar_55_F_1", "dwar_55_CON", "pucc_15_F_1", "pucc_15_CON", "spee_75_F_1", "spee_75_CON", "lute_35_F_1", "lute_35_CON") - true_vars = Array("vaud_15_T_1", "vaud_15_CON", "oedi_35_T_1", "oedi_35_CON", "mons_55_T_1", "mons_55_CON", "gest_75_T_1", "gest_75_CON", "kabu_15_T_1", "kabu_15_CON", "sham_55_T_1", "sham_55_CON", "pana_35_T_1", "pana_35_CON", "bohr_15_T_1", "bohr_15_CON", "chur_75_T_1", "chur_75_CON", "carb_35_T_1", "carb_35_CON", "cons_55_T_1", "cons_55_CON", "papy_75_T_1", "papy_75_CON", "dors_55_T_1", "dors_55_CON", "tsun_75_T_1", "tsun_75_CON", "troy_15_T_1", "troy_15_CON", "lock_35_T_1", "lock_35_CON") - Dim target_headers As Variant - target_headers = Array("gest_T_ex", "dors_T_ex", "chur_T_ex", "mons_T_ex", "lock_T_easy", "hume_F_easy", "papy_T_easy", "sham_T_easy", 
"list_F_easy", "cons_T_easy", "tsun_T_easy", "pana_T_easy", "kabu_T_easy", "gulf_F_easy", "oedi_T_easy", "vaud_T_easy", "mont_F_easy", "demo_F_easy", "spee_F_hard", "dwar_F_hard", "carb_T_hard", "bohr_T_hard", "gang_F_hard", "vitc_F_hard", "hert_F_hard", "pucc_F_hard", "troy_T_hard", "moza_F_hard", "croc_F_hard", "gees_F_hard", "lute_F_hard", "memo_F_hard") - Dim ws As Worksheet - Set ws = ThisWorkbook.Sheets(1) - Dim i As Long, rowCount As Long, colSource1 As Long, colSourceCON As Long, colTarget As Long - Dim srcVar As String, matchPrefix As String, checkVal As String, val As Double - Dim result As Variant - Dim r As Long, j As Long - rowCount = ws.Cells(ws.Rows.Count, "A").End(xlUp).Row - For i = 0 To UBound(false_vars) Step 2 - srcVar = false_vars(i) - matchPrefix = Left(srcVar, InStrRev(srcVar, "_") - 1) - colSource1 = GetCol(srcVar, ws) - colSourceCON = GetCol(false_vars(i + 1), ws) - If colSource1 > 0 And colSourceCON > 0 Then - For r = 2 To rowCount - checkVal = ws.Cells(r, colSource1).Value - val = ws.Cells(r, colSourceCON).Value - colTarget = 0 - For j = 0 To UBound(target_headers) - If Left(target_headers(j), InStr(target_headers(j), "_") - 1) = Left(matchPrefix, InStr(matchPrefix, "_") - 1) Then - colTarget = GetCol(target_headers(j), ws) - Exit For - End If - Next j - If colTarget > 0 Then - If checkVal = "TRUE" Then - result = ((val / 100) - 0) ^ 2 - ElseIf checkVal = "FALSE" Then - result = ((1 - (val / 100)) - 0) ^ 2 - Else - result = CVErr(xlErrNA) - End If - ws.Cells(r, colTarget).Value = result - End If - Next r - End If - Next i - For i = 0 To UBound(true_vars) Step 2 - srcVar = true_vars(i) - matchPrefix = Left(srcVar, InStrRev(srcVar, "_") - 1) - colSource1 = GetCol(srcVar, ws) - colSourceCON = GetCol(true_vars(i + 1), ws) - If colSource1 > 0 And colSourceCON > 0 Then - For r = 2 To rowCount - checkVal = ws.Cells(r, colSource1).Value - val = ws.Cells(r, colSourceCON).Value - colTarget = 0 - For j = 0 To UBound(target_headers) - If Left(target_headers(j), InStr(target_headers(j), "_") - 1) = Left(matchPrefix, InStr(matchPrefix, "_") - 1) Then - colTarget = GetCol(target_headers(j), ws) - Exit For - End If - Next j - If colTarget > 0 Then - If checkVal = "TRUE" Then - result = ((val / 100) - 1) ^ 2 - ElseIf checkVal = "FALSE" Then - result = ((1 - (val / 100)) - 1) ^ 2 - Else - result = CVErr(xlErrNA) - End If - ws.Cells(r, colTarget).Value = result - End If - Next r - End If - Next i -End Sub diff --git a/.history/eohi1/brierVARS_20250922125338.vb b/.history/eohi1/brierVARS_20250922125338.vb deleted file mode 100644 index 1ae6638..0000000 --- a/.history/eohi1/brierVARS_20250922125338.vb +++ /dev/null @@ -1,87 +0,0 @@ -Function GetCol(headerName As String, ws As Worksheet) As Long - Dim lastCol As Long - lastCol = ws.Cells(1, ws.Columns.Count).End(xlToLeft).Column - On Error Resume Next - Dim m As Variant - m = Application.Match(headerName, ws.Range(ws.Cells(1, 1), ws.Cells(1, lastCol)), 0) - On Error GoTo 0 - If IsError(m) Then - GetCol = 0 - Else - GetCol = CLng(m) - End If -End Function - -Sub brierVARS() - Dim false_vars As Variant - Dim true_vars As Variant - false_vars = Array("moza_55_F_1", "moza_55_CON", "demo_15_F_1", "demo_15_CON", "hume_35_F_1", "hume_35_CON", "gulf_15_F_1", "gulf_15_CON", "memo_75_F_1", "memo_75_CON", "vitc_55_F_1", "vitc_55_CON", "hert_35_F_1", "hert_35_CON", "gees_55_F_1", "gees_55_CON", "gang_15_F_1", "gang_15_CON", "list_75_F_1", "list_75_CON", "mont_35_F_1", "mont_35_CON", "dwar_55_F_1", "dwar_55_CON", "pucc_15_F_1", "pucc_15_CON", 
"spee_75_F_1", "spee_75_CON", "lute_35_F_1", "lute_35_CON") - true_vars = Array("vaud_15_T_1", "vaud_15_CON", "oedi_35_T_1", "oedi_35_CON", "mons_55_T_1", "mons_55_CON", "gest_75_T_1", "gest_75_CON", "kabu_15_T_1", "kabu_15_CON", "sham_55_T_1", "sham_55_CON", "pana_35_T_1", "pana_35_CON", "bohr_15_T_1", "bohr_15_CON", "chur_75_T_1", "chur_75_CON", "carb_35_T_1", "carb_35_CON", "cons_55_T_1", "cons_55_CON", "papy_75_T_1", "papy_75_CON", "dors_55_T_1", "dors_55_CON", "tsun_75_T_1", "tsun_75_CON", "troy_15_T_1", "troy_15_CON", "lock_35_T_1", "lock_35_CON") - Dim target_headers As Variant - target_headers = Array("gest_T_ex", "dors_T_ex", "chur_T_ex", "mons_T_ex", "lock_T_easy", "hume_F_easy", "papy_T_easy", "sham_T_easy", "list_F_easy", "cons_T_easy", "tsun_T_easy", "pana_T_easy", "kabu_T_easy", "gulf_F_easy", "oedi_T_easy", "vaud_T_easy", "mont_F_easy", "demo_F_easy", "spee_F_hard", "dwar_F_hard", "carb_T_hard", "bohr_T_hard", "gang_F_hard", "vitc_F_hard", "hert_F_hard", "pucc_F_hard", "troy_T_hard", "moza_F_hard", "croc_F_hard", "gees_F_hard", "lute_F_hard", "memo_F_hard") - Dim ws As Worksheet - Set ws = ThisWorkbook.Sheets(1) - Dim i As Long, rowCount As Long, colSource1 As Long, colSourceCON As Long, colTarget As Long - Dim srcVar As String, matchPrefix As String, checkVal As String, val As Double - Dim result As Variant - Dim r As Long, j As Long - rowCount = ws.Cells(ws.Rows.Count, "A").End(xlUp).Row - For i = 0 To UBound(false_vars) Step 2 - srcVar = false_vars(i) - matchPrefix = Left(srcVar, InStrRev(srcVar, "_") - 1) - colSource1 = GetCol(srcVar, ws) - colSourceCON = GetCol(false_vars(i + 1), ws) - If colSource1 > 0 And colSourceCON > 0 Then - For r = 2 To rowCount - checkVal = ws.Cells(r, colSource1).Value - val = ws.Cells(r, colSourceCON).Value - colTarget = 0 - For j = 0 To UBound(target_headers) - If Left(target_headers(j), InStr(target_headers(j), "_") - 1) = Left(matchPrefix, InStr(matchPrefix, "_") - 1) Then - colTarget = GetCol(target_headers(j), ws) - Exit For - End If - Next j - If colTarget > 0 Then - If checkVal = "TRUE" Then - result = ((val / 100) - 0) ^ 2 - ElseIf checkVal = "FALSE" Then - result = ((1 - (val / 100)) - 0) ^ 2 - Else - result = CVErr(xlErrNA) - End If - ws.Cells(r, colTarget).Value = result - End If - Next r - End If - Next i - For i = 0 To UBound(true_vars) Step 2 - srcVar = true_vars(i) - matchPrefix = Left(srcVar, InStrRev(srcVar, "_") - 1) - colSource1 = GetCol(srcVar, ws) - colSourceCON = GetCol(true_vars(i + 1), ws) - If colSource1 > 0 And colSourceCON > 0 Then - For r = 2 To rowCount - checkVal = ws.Cells(r, colSource1).Value - val = ws.Cells(r, colSourceCON).Value - colTarget = 0 - For j = 0 To UBound(target_headers) - If Left(target_headers(j), InStr(target_headers(j), "_") - 1) = Left(matchPrefix, InStr(matchPrefix, "_") - 1) Then - colTarget = GetCol(target_headers(j), ws) - Exit For - End If - Next j - If colTarget > 0 Then - If checkVal = "TRUE" Then - result = ((val / 100) - 1) ^ 2 - ElseIf checkVal = "FALSE" Then - result = ((1 - (val / 100)) - 1) ^ 2 - Else - result = CVErr(xlErrNA) - End If - ws.Cells(r, colTarget).Value = result - End If - Next r - End If - Next i -End Sub diff --git a/.history/eohi1/brierVARS_20250922125353.vb b/.history/eohi1/brierVARS_20250922125353.vb deleted file mode 100644 index 1ae6638..0000000 --- a/.history/eohi1/brierVARS_20250922125353.vb +++ /dev/null @@ -1,87 +0,0 @@ -Function GetCol(headerName As String, ws As Worksheet) As Long - Dim lastCol As Long - lastCol = ws.Cells(1, 
ws.Columns.Count).End(xlToLeft).Column - On Error Resume Next - Dim m As Variant - m = Application.Match(headerName, ws.Range(ws.Cells(1, 1), ws.Cells(1, lastCol)), 0) - On Error GoTo 0 - If IsError(m) Then - GetCol = 0 - Else - GetCol = CLng(m) - End If -End Function - -Sub brierVARS() - Dim false_vars As Variant - Dim true_vars As Variant - false_vars = Array("moza_55_F_1", "moza_55_CON", "demo_15_F_1", "demo_15_CON", "hume_35_F_1", "hume_35_CON", "gulf_15_F_1", "gulf_15_CON", "memo_75_F_1", "memo_75_CON", "vitc_55_F_1", "vitc_55_CON", "hert_35_F_1", "hert_35_CON", "gees_55_F_1", "gees_55_CON", "gang_15_F_1", "gang_15_CON", "list_75_F_1", "list_75_CON", "mont_35_F_1", "mont_35_CON", "dwar_55_F_1", "dwar_55_CON", "pucc_15_F_1", "pucc_15_CON", "spee_75_F_1", "spee_75_CON", "lute_35_F_1", "lute_35_CON") - true_vars = Array("vaud_15_T_1", "vaud_15_CON", "oedi_35_T_1", "oedi_35_CON", "mons_55_T_1", "mons_55_CON", "gest_75_T_1", "gest_75_CON", "kabu_15_T_1", "kabu_15_CON", "sham_55_T_1", "sham_55_CON", "pana_35_T_1", "pana_35_CON", "bohr_15_T_1", "bohr_15_CON", "chur_75_T_1", "chur_75_CON", "carb_35_T_1", "carb_35_CON", "cons_55_T_1", "cons_55_CON", "papy_75_T_1", "papy_75_CON", "dors_55_T_1", "dors_55_CON", "tsun_75_T_1", "tsun_75_CON", "troy_15_T_1", "troy_15_CON", "lock_35_T_1", "lock_35_CON") - Dim target_headers As Variant - target_headers = Array("gest_T_ex", "dors_T_ex", "chur_T_ex", "mons_T_ex", "lock_T_easy", "hume_F_easy", "papy_T_easy", "sham_T_easy", "list_F_easy", "cons_T_easy", "tsun_T_easy", "pana_T_easy", "kabu_T_easy", "gulf_F_easy", "oedi_T_easy", "vaud_T_easy", "mont_F_easy", "demo_F_easy", "spee_F_hard", "dwar_F_hard", "carb_T_hard", "bohr_T_hard", "gang_F_hard", "vitc_F_hard", "hert_F_hard", "pucc_F_hard", "troy_T_hard", "moza_F_hard", "croc_F_hard", "gees_F_hard", "lute_F_hard", "memo_F_hard") - Dim ws As Worksheet - Set ws = ThisWorkbook.Sheets(1) - Dim i As Long, rowCount As Long, colSource1 As Long, colSourceCON As Long, colTarget As Long - Dim srcVar As String, matchPrefix As String, checkVal As String, val As Double - Dim result As Variant - Dim r As Long, j As Long - rowCount = ws.Cells(ws.Rows.Count, "A").End(xlUp).Row - For i = 0 To UBound(false_vars) Step 2 - srcVar = false_vars(i) - matchPrefix = Left(srcVar, InStrRev(srcVar, "_") - 1) - colSource1 = GetCol(srcVar, ws) - colSourceCON = GetCol(false_vars(i + 1), ws) - If colSource1 > 0 And colSourceCON > 0 Then - For r = 2 To rowCount - checkVal = ws.Cells(r, colSource1).Value - val = ws.Cells(r, colSourceCON).Value - colTarget = 0 - For j = 0 To UBound(target_headers) - If Left(target_headers(j), InStr(target_headers(j), "_") - 1) = Left(matchPrefix, InStr(matchPrefix, "_") - 1) Then - colTarget = GetCol(target_headers(j), ws) - Exit For - End If - Next j - If colTarget > 0 Then - If checkVal = "TRUE" Then - result = ((val / 100) - 0) ^ 2 - ElseIf checkVal = "FALSE" Then - result = ((1 - (val / 100)) - 0) ^ 2 - Else - result = CVErr(xlErrNA) - End If - ws.Cells(r, colTarget).Value = result - End If - Next r - End If - Next i - For i = 0 To UBound(true_vars) Step 2 - srcVar = true_vars(i) - matchPrefix = Left(srcVar, InStrRev(srcVar, "_") - 1) - colSource1 = GetCol(srcVar, ws) - colSourceCON = GetCol(true_vars(i + 1), ws) - If colSource1 > 0 And colSourceCON > 0 Then - For r = 2 To rowCount - checkVal = ws.Cells(r, colSource1).Value - val = ws.Cells(r, colSourceCON).Value - colTarget = 0 - For j = 0 To UBound(target_headers) - If Left(target_headers(j), InStr(target_headers(j), "_") - 1) = Left(matchPrefix, 
InStr(matchPrefix, "_") - 1) Then - colTarget = GetCol(target_headers(j), ws) - Exit For - End If - Next j - If colTarget > 0 Then - If checkVal = "TRUE" Then - result = ((val / 100) - 1) ^ 2 - ElseIf checkVal = "FALSE" Then - result = ((1 - (val / 100)) - 1) ^ 2 - Else - result = CVErr(xlErrNA) - End If - ws.Cells(r, colTarget).Value = result - End If - Next r - End If - Next i -End Sub diff --git a/.history/eohi1/brierVARS_20250922125648.vb b/.history/eohi1/brierVARS_20250922125648.vb deleted file mode 100644 index 1ae6638..0000000 --- a/.history/eohi1/brierVARS_20250922125648.vb +++ /dev/null @@ -1,87 +0,0 @@ -Function GetCol(headerName As String, ws As Worksheet) As Long - Dim lastCol As Long - lastCol = ws.Cells(1, ws.Columns.Count).End(xlToLeft).Column - On Error Resume Next - Dim m As Variant - m = Application.Match(headerName, ws.Range(ws.Cells(1, 1), ws.Cells(1, lastCol)), 0) - On Error GoTo 0 - If IsError(m) Then - GetCol = 0 - Else - GetCol = CLng(m) - End If -End Function - -Sub brierVARS() - Dim false_vars As Variant - Dim true_vars As Variant - false_vars = Array("moza_55_F_1", "moza_55_CON", "demo_15_F_1", "demo_15_CON", "hume_35_F_1", "hume_35_CON", "gulf_15_F_1", "gulf_15_CON", "memo_75_F_1", "memo_75_CON", "vitc_55_F_1", "vitc_55_CON", "hert_35_F_1", "hert_35_CON", "gees_55_F_1", "gees_55_CON", "gang_15_F_1", "gang_15_CON", "list_75_F_1", "list_75_CON", "mont_35_F_1", "mont_35_CON", "dwar_55_F_1", "dwar_55_CON", "pucc_15_F_1", "pucc_15_CON", "spee_75_F_1", "spee_75_CON", "lute_35_F_1", "lute_35_CON") - true_vars = Array("vaud_15_T_1", "vaud_15_CON", "oedi_35_T_1", "oedi_35_CON", "mons_55_T_1", "mons_55_CON", "gest_75_T_1", "gest_75_CON", "kabu_15_T_1", "kabu_15_CON", "sham_55_T_1", "sham_55_CON", "pana_35_T_1", "pana_35_CON", "bohr_15_T_1", "bohr_15_CON", "chur_75_T_1", "chur_75_CON", "carb_35_T_1", "carb_35_CON", "cons_55_T_1", "cons_55_CON", "papy_75_T_1", "papy_75_CON", "dors_55_T_1", "dors_55_CON", "tsun_75_T_1", "tsun_75_CON", "troy_15_T_1", "troy_15_CON", "lock_35_T_1", "lock_35_CON") - Dim target_headers As Variant - target_headers = Array("gest_T_ex", "dors_T_ex", "chur_T_ex", "mons_T_ex", "lock_T_easy", "hume_F_easy", "papy_T_easy", "sham_T_easy", "list_F_easy", "cons_T_easy", "tsun_T_easy", "pana_T_easy", "kabu_T_easy", "gulf_F_easy", "oedi_T_easy", "vaud_T_easy", "mont_F_easy", "demo_F_easy", "spee_F_hard", "dwar_F_hard", "carb_T_hard", "bohr_T_hard", "gang_F_hard", "vitc_F_hard", "hert_F_hard", "pucc_F_hard", "troy_T_hard", "moza_F_hard", "croc_F_hard", "gees_F_hard", "lute_F_hard", "memo_F_hard") - Dim ws As Worksheet - Set ws = ThisWorkbook.Sheets(1) - Dim i As Long, rowCount As Long, colSource1 As Long, colSourceCON As Long, colTarget As Long - Dim srcVar As String, matchPrefix As String, checkVal As String, val As Double - Dim result As Variant - Dim r As Long, j As Long - rowCount = ws.Cells(ws.Rows.Count, "A").End(xlUp).Row - For i = 0 To UBound(false_vars) Step 2 - srcVar = false_vars(i) - matchPrefix = Left(srcVar, InStrRev(srcVar, "_") - 1) - colSource1 = GetCol(srcVar, ws) - colSourceCON = GetCol(false_vars(i + 1), ws) - If colSource1 > 0 And colSourceCON > 0 Then - For r = 2 To rowCount - checkVal = ws.Cells(r, colSource1).Value - val = ws.Cells(r, colSourceCON).Value - colTarget = 0 - For j = 0 To UBound(target_headers) - If Left(target_headers(j), InStr(target_headers(j), "_") - 1) = Left(matchPrefix, InStr(matchPrefix, "_") - 1) Then - colTarget = GetCol(target_headers(j), ws) - Exit For - End If - Next j - If colTarget > 0 Then - If checkVal = 
"TRUE" Then - result = ((val / 100) - 0) ^ 2 - ElseIf checkVal = "FALSE" Then - result = ((1 - (val / 100)) - 0) ^ 2 - Else - result = CVErr(xlErrNA) - End If - ws.Cells(r, colTarget).Value = result - End If - Next r - End If - Next i - For i = 0 To UBound(true_vars) Step 2 - srcVar = true_vars(i) - matchPrefix = Left(srcVar, InStrRev(srcVar, "_") - 1) - colSource1 = GetCol(srcVar, ws) - colSourceCON = GetCol(true_vars(i + 1), ws) - If colSource1 > 0 And colSourceCON > 0 Then - For r = 2 To rowCount - checkVal = ws.Cells(r, colSource1).Value - val = ws.Cells(r, colSourceCON).Value - colTarget = 0 - For j = 0 To UBound(target_headers) - If Left(target_headers(j), InStr(target_headers(j), "_") - 1) = Left(matchPrefix, InStr(matchPrefix, "_") - 1) Then - colTarget = GetCol(target_headers(j), ws) - Exit For - End If - Next j - If colTarget > 0 Then - If checkVal = "TRUE" Then - result = ((val / 100) - 1) ^ 2 - ElseIf checkVal = "FALSE" Then - result = ((1 - (val / 100)) - 1) ^ 2 - Else - result = CVErr(xlErrNA) - End If - ws.Cells(r, colTarget).Value = result - End If - Next r - End If - Next i -End Sub diff --git a/.history/eohi1/brierVARS_20250922125759.vb b/.history/eohi1/brierVARS_20250922125759.vb deleted file mode 100644 index e39d631..0000000 --- a/.history/eohi1/brierVARS_20250922125759.vb +++ /dev/null @@ -1,87 +0,0 @@ -Function GetCol(ByVal headerName As String, ByVal ws As Worksheet) As Long - Dim lastCol As Long - lastCol = ws.Cells(1, ws.Columns.Count).End(xlToLeft).Column - On Error Resume Next - Dim m As Variant - m = Application.Match(headerName, ws.Range(ws.Cells(1, 1), ws.Cells(1, lastCol)), 0) - On Error GoTo 0 - If IsError(m) Then - GetCol = 0 - Else - GetCol = CLng(m) - End If -End Function - -Sub brierVARS() - Dim false_vars As Variant - Dim true_vars As Variant - false_vars = Array("moza_55_F_1", "moza_55_CON", "demo_15_F_1", "demo_15_CON", "hume_35_F_1", "hume_35_CON", "gulf_15_F_1", "gulf_15_CON", "memo_75_F_1", "memo_75_CON", "vitc_55_F_1", "vitc_55_CON", "hert_35_F_1", "hert_35_CON", "gees_55_F_1", "gees_55_CON", "gang_15_F_1", "gang_15_CON", "list_75_F_1", "list_75_CON", "mont_35_F_1", "mont_35_CON", "dwar_55_F_1", "dwar_55_CON", "pucc_15_F_1", "pucc_15_CON", "spee_75_F_1", "spee_75_CON", "lute_35_F_1", "lute_35_CON") - true_vars = Array("vaud_15_T_1", "vaud_15_CON", "oedi_35_T_1", "oedi_35_CON", "mons_55_T_1", "mons_55_CON", "gest_75_T_1", "gest_75_CON", "kabu_15_T_1", "kabu_15_CON", "sham_55_T_1", "sham_55_CON", "pana_35_T_1", "pana_35_CON", "bohr_15_T_1", "bohr_15_CON", "chur_75_T_1", "chur_75_CON", "carb_35_T_1", "carb_35_CON", "cons_55_T_1", "cons_55_CON", "papy_75_T_1", "papy_75_CON", "dors_55_T_1", "dors_55_CON", "tsun_75_T_1", "tsun_75_CON", "troy_15_T_1", "troy_15_CON", "lock_35_T_1", "lock_35_CON") - Dim target_headers As Variant - target_headers = Array("gest_T_ex", "dors_T_ex", "chur_T_ex", "mons_T_ex", "lock_T_easy", "hume_F_easy", "papy_T_easy", "sham_T_easy", "list_F_easy", "cons_T_easy", "tsun_T_easy", "pana_T_easy", "kabu_T_easy", "gulf_F_easy", "oedi_T_easy", "vaud_T_easy", "mont_F_easy", "demo_F_easy", "spee_F_hard", "dwar_F_hard", "carb_T_hard", "bohr_T_hard", "gang_F_hard", "vitc_F_hard", "hert_F_hard", "pucc_F_hard", "troy_T_hard", "moza_F_hard", "croc_F_hard", "gees_F_hard", "lute_F_hard", "memo_F_hard") - Dim ws As Worksheet - Set ws = ThisWorkbook.Sheets(1) - Dim i As Long, rowCount As Long, colSource1 As Long, colSourceCON As Long, colTarget As Long - Dim srcVar As String, matchPrefix As String, checkVal As String, val As Double - Dim 
result As Variant - Dim r As Long, j As Long - rowCount = ws.Cells(ws.Rows.Count, "A").End(xlUp).Row - For i = 0 To UBound(false_vars) Step 2 - srcVar = false_vars(i) - matchPrefix = Left(srcVar, InStrRev(srcVar, "_") - 1) - colSource1 = GetCol(srcVar, ws) - colSourceCON = GetCol(false_vars(i + 1), ws) - If colSource1 > 0 And colSourceCON > 0 Then - For r = 2 To rowCount - checkVal = ws.Cells(r, colSource1).Value - val = ws.Cells(r, colSourceCON).Value - colTarget = 0 - For j = 0 To UBound(target_headers) - If Left(target_headers(j), InStr(target_headers(j), "_") - 1) = Left(matchPrefix, InStr(matchPrefix, "_") - 1) Then - colTarget = GetCol(target_headers(j), ws) - Exit For - End If - Next j - If colTarget > 0 Then - If checkVal = "TRUE" Then - result = ((val / 100) - 0) ^ 2 - ElseIf checkVal = "FALSE" Then - result = ((1 - (val / 100)) - 0) ^ 2 - Else - result = CVErr(xlErrNA) - End If - ws.Cells(r, colTarget).Value = result - End If - Next r - End If - Next i - For i = 0 To UBound(true_vars) Step 2 - srcVar = true_vars(i) - matchPrefix = Left(srcVar, InStrRev(srcVar, "_") - 1) - colSource1 = GetCol(srcVar, ws) - colSourceCON = GetCol(true_vars(i + 1), ws) - If colSource1 > 0 And colSourceCON > 0 Then - For r = 2 To rowCount - checkVal = ws.Cells(r, colSource1).Value - val = ws.Cells(r, colSourceCON).Value - colTarget = 0 - For j = 0 To UBound(target_headers) - If Left(target_headers(j), InStr(target_headers(j), "_") - 1) = Left(matchPrefix, InStr(matchPrefix, "_") - 1) Then - colTarget = GetCol(target_headers(j), ws) - Exit For - End If - Next j - If colTarget > 0 Then - If checkVal = "TRUE" Then - result = ((val / 100) - 1) ^ 2 - ElseIf checkVal = "FALSE" Then - result = ((1 - (val / 100)) - 1) ^ 2 - Else - result = CVErr(xlErrNA) - End If - ws.Cells(r, colTarget).Value = result - End If - Next r - End If - Next i -End Sub diff --git a/.history/eohi1/brierVARS_20250922125807.vb b/.history/eohi1/brierVARS_20250922125807.vb deleted file mode 100644 index e39d631..0000000 --- a/.history/eohi1/brierVARS_20250922125807.vb +++ /dev/null @@ -1,87 +0,0 @@ -Function GetCol(ByVal headerName As String, ByVal ws As Worksheet) As Long - Dim lastCol As Long - lastCol = ws.Cells(1, ws.Columns.Count).End(xlToLeft).Column - On Error Resume Next - Dim m As Variant - m = Application.Match(headerName, ws.Range(ws.Cells(1, 1), ws.Cells(1, lastCol)), 0) - On Error GoTo 0 - If IsError(m) Then - GetCol = 0 - Else - GetCol = CLng(m) - End If -End Function - -Sub brierVARS() - Dim false_vars As Variant - Dim true_vars As Variant - false_vars = Array("moza_55_F_1", "moza_55_CON", "demo_15_F_1", "demo_15_CON", "hume_35_F_1", "hume_35_CON", "gulf_15_F_1", "gulf_15_CON", "memo_75_F_1", "memo_75_CON", "vitc_55_F_1", "vitc_55_CON", "hert_35_F_1", "hert_35_CON", "gees_55_F_1", "gees_55_CON", "gang_15_F_1", "gang_15_CON", "list_75_F_1", "list_75_CON", "mont_35_F_1", "mont_35_CON", "dwar_55_F_1", "dwar_55_CON", "pucc_15_F_1", "pucc_15_CON", "spee_75_F_1", "spee_75_CON", "lute_35_F_1", "lute_35_CON") - true_vars = Array("vaud_15_T_1", "vaud_15_CON", "oedi_35_T_1", "oedi_35_CON", "mons_55_T_1", "mons_55_CON", "gest_75_T_1", "gest_75_CON", "kabu_15_T_1", "kabu_15_CON", "sham_55_T_1", "sham_55_CON", "pana_35_T_1", "pana_35_CON", "bohr_15_T_1", "bohr_15_CON", "chur_75_T_1", "chur_75_CON", "carb_35_T_1", "carb_35_CON", "cons_55_T_1", "cons_55_CON", "papy_75_T_1", "papy_75_CON", "dors_55_T_1", "dors_55_CON", "tsun_75_T_1", "tsun_75_CON", "troy_15_T_1", "troy_15_CON", "lock_35_T_1", "lock_35_CON") - Dim target_headers As 
Variant - target_headers = Array("gest_T_ex", "dors_T_ex", "chur_T_ex", "mons_T_ex", "lock_T_easy", "hume_F_easy", "papy_T_easy", "sham_T_easy", "list_F_easy", "cons_T_easy", "tsun_T_easy", "pana_T_easy", "kabu_T_easy", "gulf_F_easy", "oedi_T_easy", "vaud_T_easy", "mont_F_easy", "demo_F_easy", "spee_F_hard", "dwar_F_hard", "carb_T_hard", "bohr_T_hard", "gang_F_hard", "vitc_F_hard", "hert_F_hard", "pucc_F_hard", "troy_T_hard", "moza_F_hard", "croc_F_hard", "gees_F_hard", "lute_F_hard", "memo_F_hard") - Dim ws As Worksheet - Set ws = ThisWorkbook.Sheets(1) - Dim i As Long, rowCount As Long, colSource1 As Long, colSourceCON As Long, colTarget As Long - Dim srcVar As String, matchPrefix As String, checkVal As String, val As Double - Dim result As Variant - Dim r As Long, j As Long - rowCount = ws.Cells(ws.Rows.Count, "A").End(xlUp).Row - For i = 0 To UBound(false_vars) Step 2 - srcVar = false_vars(i) - matchPrefix = Left(srcVar, InStrRev(srcVar, "_") - 1) - colSource1 = GetCol(srcVar, ws) - colSourceCON = GetCol(false_vars(i + 1), ws) - If colSource1 > 0 And colSourceCON > 0 Then - For r = 2 To rowCount - checkVal = ws.Cells(r, colSource1).Value - val = ws.Cells(r, colSourceCON).Value - colTarget = 0 - For j = 0 To UBound(target_headers) - If Left(target_headers(j), InStr(target_headers(j), "_") - 1) = Left(matchPrefix, InStr(matchPrefix, "_") - 1) Then - colTarget = GetCol(target_headers(j), ws) - Exit For - End If - Next j - If colTarget > 0 Then - If checkVal = "TRUE" Then - result = ((val / 100) - 0) ^ 2 - ElseIf checkVal = "FALSE" Then - result = ((1 - (val / 100)) - 0) ^ 2 - Else - result = CVErr(xlErrNA) - End If - ws.Cells(r, colTarget).Value = result - End If - Next r - End If - Next i - For i = 0 To UBound(true_vars) Step 2 - srcVar = true_vars(i) - matchPrefix = Left(srcVar, InStrRev(srcVar, "_") - 1) - colSource1 = GetCol(srcVar, ws) - colSourceCON = GetCol(true_vars(i + 1), ws) - If colSource1 > 0 And colSourceCON > 0 Then - For r = 2 To rowCount - checkVal = ws.Cells(r, colSource1).Value - val = ws.Cells(r, colSourceCON).Value - colTarget = 0 - For j = 0 To UBound(target_headers) - If Left(target_headers(j), InStr(target_headers(j), "_") - 1) = Left(matchPrefix, InStr(matchPrefix, "_") - 1) Then - colTarget = GetCol(target_headers(j), ws) - Exit For - End If - Next j - If colTarget > 0 Then - If checkVal = "TRUE" Then - result = ((val / 100) - 1) ^ 2 - ElseIf checkVal = "FALSE" Then - result = ((1 - (val / 100)) - 1) ^ 2 - Else - result = CVErr(xlErrNA) - End If - ws.Cells(r, colTarget).Value = result - End If - Next r - End If - Next i -End Sub diff --git a/.history/eohi1/brierVARS_20250922125821.vb b/.history/eohi1/brierVARS_20250922125821.vb deleted file mode 100644 index e39d631..0000000 --- a/.history/eohi1/brierVARS_20250922125821.vb +++ /dev/null @@ -1,87 +0,0 @@ -Function GetCol(ByVal headerName As String, ByVal ws As Worksheet) As Long - Dim lastCol As Long - lastCol = ws.Cells(1, ws.Columns.Count).End(xlToLeft).Column - On Error Resume Next - Dim m As Variant - m = Application.Match(headerName, ws.Range(ws.Cells(1, 1), ws.Cells(1, lastCol)), 0) - On Error GoTo 0 - If IsError(m) Then - GetCol = 0 - Else - GetCol = CLng(m) - End If -End Function - -Sub brierVARS() - Dim false_vars As Variant - Dim true_vars As Variant - false_vars = Array("moza_55_F_1", "moza_55_CON", "demo_15_F_1", "demo_15_CON", "hume_35_F_1", "hume_35_CON", "gulf_15_F_1", "gulf_15_CON", "memo_75_F_1", "memo_75_CON", "vitc_55_F_1", "vitc_55_CON", "hert_35_F_1", "hert_35_CON", "gees_55_F_1", 
"gees_55_CON", "gang_15_F_1", "gang_15_CON", "list_75_F_1", "list_75_CON", "mont_35_F_1", "mont_35_CON", "dwar_55_F_1", "dwar_55_CON", "pucc_15_F_1", "pucc_15_CON", "spee_75_F_1", "spee_75_CON", "lute_35_F_1", "lute_35_CON") - true_vars = Array("vaud_15_T_1", "vaud_15_CON", "oedi_35_T_1", "oedi_35_CON", "mons_55_T_1", "mons_55_CON", "gest_75_T_1", "gest_75_CON", "kabu_15_T_1", "kabu_15_CON", "sham_55_T_1", "sham_55_CON", "pana_35_T_1", "pana_35_CON", "bohr_15_T_1", "bohr_15_CON", "chur_75_T_1", "chur_75_CON", "carb_35_T_1", "carb_35_CON", "cons_55_T_1", "cons_55_CON", "papy_75_T_1", "papy_75_CON", "dors_55_T_1", "dors_55_CON", "tsun_75_T_1", "tsun_75_CON", "troy_15_T_1", "troy_15_CON", "lock_35_T_1", "lock_35_CON") - Dim target_headers As Variant - target_headers = Array("gest_T_ex", "dors_T_ex", "chur_T_ex", "mons_T_ex", "lock_T_easy", "hume_F_easy", "papy_T_easy", "sham_T_easy", "list_F_easy", "cons_T_easy", "tsun_T_easy", "pana_T_easy", "kabu_T_easy", "gulf_F_easy", "oedi_T_easy", "vaud_T_easy", "mont_F_easy", "demo_F_easy", "spee_F_hard", "dwar_F_hard", "carb_T_hard", "bohr_T_hard", "gang_F_hard", "vitc_F_hard", "hert_F_hard", "pucc_F_hard", "troy_T_hard", "moza_F_hard", "croc_F_hard", "gees_F_hard", "lute_F_hard", "memo_F_hard") - Dim ws As Worksheet - Set ws = ThisWorkbook.Sheets(1) - Dim i As Long, rowCount As Long, colSource1 As Long, colSourceCON As Long, colTarget As Long - Dim srcVar As String, matchPrefix As String, checkVal As String, val As Double - Dim result As Variant - Dim r As Long, j As Long - rowCount = ws.Cells(ws.Rows.Count, "A").End(xlUp).Row - For i = 0 To UBound(false_vars) Step 2 - srcVar = false_vars(i) - matchPrefix = Left(srcVar, InStrRev(srcVar, "_") - 1) - colSource1 = GetCol(srcVar, ws) - colSourceCON = GetCol(false_vars(i + 1), ws) - If colSource1 > 0 And colSourceCON > 0 Then - For r = 2 To rowCount - checkVal = ws.Cells(r, colSource1).Value - val = ws.Cells(r, colSourceCON).Value - colTarget = 0 - For j = 0 To UBound(target_headers) - If Left(target_headers(j), InStr(target_headers(j), "_") - 1) = Left(matchPrefix, InStr(matchPrefix, "_") - 1) Then - colTarget = GetCol(target_headers(j), ws) - Exit For - End If - Next j - If colTarget > 0 Then - If checkVal = "TRUE" Then - result = ((val / 100) - 0) ^ 2 - ElseIf checkVal = "FALSE" Then - result = ((1 - (val / 100)) - 0) ^ 2 - Else - result = CVErr(xlErrNA) - End If - ws.Cells(r, colTarget).Value = result - End If - Next r - End If - Next i - For i = 0 To UBound(true_vars) Step 2 - srcVar = true_vars(i) - matchPrefix = Left(srcVar, InStrRev(srcVar, "_") - 1) - colSource1 = GetCol(srcVar, ws) - colSourceCON = GetCol(true_vars(i + 1), ws) - If colSource1 > 0 And colSourceCON > 0 Then - For r = 2 To rowCount - checkVal = ws.Cells(r, colSource1).Value - val = ws.Cells(r, colSourceCON).Value - colTarget = 0 - For j = 0 To UBound(target_headers) - If Left(target_headers(j), InStr(target_headers(j), "_") - 1) = Left(matchPrefix, InStr(matchPrefix, "_") - 1) Then - colTarget = GetCol(target_headers(j), ws) - Exit For - End If - Next j - If colTarget > 0 Then - If checkVal = "TRUE" Then - result = ((val / 100) - 1) ^ 2 - ElseIf checkVal = "FALSE" Then - result = ((1 - (val / 100)) - 1) ^ 2 - Else - result = CVErr(xlErrNA) - End If - ws.Cells(r, colTarget).Value = result - End If - Next r - End If - Next i -End Sub diff --git a/.history/eohi1/brierVARS_20250922130226.vb b/.history/eohi1/brierVARS_20250922130226.vb deleted file mode 100644 index 1192e2f..0000000 --- a/.history/eohi1/brierVARS_20250922130226.vb +++ 
diff --git a/.history/eohi1/brierVARS_20250922130226.vb b/.history/eohi1/brierVARS_20250922130226.vb
deleted file mode 100644
index 1192e2f..0000000
--- a/.history/eohi1/brierVARS_20250922130226.vb
+++ /dev/null
@@ -1,141 +0,0 @@
-Function GetCol(ByVal headerName As String, ByVal ws As Worksheet) As Long
-    Dim lastCol As Long
-    lastCol = ws.Cells(1, ws.Columns.Count).End(xlToLeft).Column
-    On Error Resume Next
-    Dim m As Variant
-    m = Application.Match(headerName, ws.Range(ws.Cells(1, 1), ws.Cells(1, lastCol)), 0)
-    On Error GoTo 0
-    If IsError(m) Then
-        GetCol = 0
-    Else
-        GetCol = CLng(m)
-    End If
-End Function
-
-Private Function NormalizeTruth(ByVal v As Variant) As Variant
-    ' Returns True/False or Null if cannot determine
-    If VarType(v) = vbBoolean Then
-        NormalizeTruth = v
-        Exit Function
-    End If
-    If IsError(v) Or IsEmpty(v) Then
-        NormalizeTruth = Null
-        Exit Function
-    End If
-    Dim s As String
-    s = Trim$(UCase$(CStr(v)))
-    Select Case s
-        Case "TRUE", "T", "1", "YES", "Y"
-            NormalizeTruth = True
-        Case "FALSE", "F", "0", "NO", "N"
-            NormalizeTruth = False
-        Case Else
-            NormalizeTruth = Null
-    End Select
-End Function
-
-Private Function NormalizeProb01(ByVal v As Variant) As Double
-    ' Converts confidence values to [0,1]
-    If IsError(v) Or IsEmpty(v) Then
-        NormalizeProb01 = -1
-        Exit Function
-    End If
-    Dim s As String
-    s = CStr(v)
-    If InStr(s, "%") > 0 Then
-        s = Replace$(s, "%", "")
-        If IsNumeric(s) Then
-            NormalizeProb01 = CDbl(s) / 100#
-            Exit Function
-        End If
-    End If
-    If IsNumeric(v) Then
-        Dim d As Double
-        d = CDbl(v)
-        If d > 1# Then
-            NormalizeProb01 = d / 100#
-        Else
-            NormalizeProb01 = d
-        End If
-    Else
-        NormalizeProb01 = -1
-    End If
-End Function
-
-Sub brierVARS()
-    Dim false_vars As Variant
-    Dim true_vars As Variant
-    false_vars = Array("moza_55_F_1", "moza_55_CON", "demo_15_F_1", "demo_15_CON", "hume_35_F_1", "hume_35_CON", "gulf_15_F_1", "gulf_15_CON", "memo_75_F_1", "memo_75_CON", "vitc_55_F_1", "vitc_55_CON", "hert_35_F_1", "hert_35_CON", "gees_55_F_1", "gees_55_CON", "gang_15_F_1", "gang_15_CON", "list_75_F_1", "list_75_CON", "mont_35_F_1", "mont_35_CON", "dwar_55_F_1", "dwar_55_CON", "pucc_15_F_1", "pucc_15_CON", "spee_75_F_1", "spee_75_CON", "lute_35_F_1", "lute_35_CON")
-    true_vars = Array("vaud_15_T_1", "vaud_15_CON", "oedi_35_T_1", "oedi_35_CON", "mons_55_T_1", "mons_55_CON", "gest_75_T_1", "gest_75_CON", "kabu_15_T_1", "kabu_15_CON", "sham_55_T_1", "sham_55_CON", "pana_35_T_1", "pana_35_CON", "bohr_15_T_1", "bohr_15_CON", "chur_75_T_1", "chur_75_CON", "carb_35_T_1", "carb_35_CON", "cons_55_T_1", "cons_55_CON", "papy_75_T_1", "papy_75_CON", "dors_55_T_1", "dors_55_CON", "tsun_75_T_1", "tsun_75_CON", "troy_15_T_1", "troy_15_CON", "lock_35_T_1", "lock_35_CON")
-    Dim target_headers As Variant
-    target_headers = Array("gest_T_ex", "dors_T_ex", "chur_T_ex", "mons_T_ex", "lock_T_easy", "hume_F_easy", "papy_T_easy", "sham_T_easy", "list_F_easy", "cons_T_easy", "tsun_T_easy", "pana_T_easy", "kabu_T_easy", "gulf_F_easy", "oedi_T_easy", "vaud_T_easy", "mont_F_easy", "demo_F_easy", "spee_F_hard", "dwar_F_hard", "carb_T_hard", "bohr_T_hard", "gang_F_hard", "vitc_F_hard", "hert_F_hard", "pucc_F_hard", "troy_T_hard", "moza_F_hard", "croc_F_hard", "gees_F_hard", "lute_F_hard", "memo_F_hard")
-    Dim ws As Worksheet
-    Set ws = ThisWorkbook.Sheets(1)
-    Dim i As Long, rowCount As Long, colSource1 As Long, colSourceCON As Long, colTarget As Long
-    Dim srcVar As String, matchPrefix As String, checkVal As String, val As Double
-    Dim result As Variant
-    Dim r As Long, j As Long
-    rowCount = ws.Cells(ws.Rows.Count, "A").End(xlUp).Row
-    For i = 0 To UBound(false_vars) Step 2
-        srcVar = false_vars(i)
-        matchPrefix = Left(srcVar, InStrRev(srcVar, "_") - 1)
-        colSource1 = GetCol(srcVar, ws)
-        colSourceCON = GetCol(false_vars(i + 1), ws)
-        If colSource1 > 0 And colSourceCON > 0 Then
-            For r = 2 To rowCount
-                checkVal = ws.Cells(r, colSource1).Value
-                val = NormalizeProb01(ws.Cells(r, colSourceCON).Value)
-                colTarget = 0
-                For j = 0 To UBound(target_headers)
-                    If Left(target_headers(j), InStr(target_headers(j), "_") - 1) = Left(matchPrefix, InStr(matchPrefix, "_") - 1) Then
-                        colTarget = GetCol(target_headers(j), ws)
-                        Exit For
-                    End If
-                Next j
-                If colTarget > 0 Then
-                    Dim truth As Variant
-                    truth = NormalizeTruth(checkVal)
-                    If IsNull(truth) Or val < 0 Then
-                        result = CVErr(xlErrNA)
-                    ElseIf truth = True Then
-                        result = (val - 0#) ^ 2
-                    Else
-                        result = ((1# - val) - 0#) ^ 2
-                    End If
-                    ws.Cells(r, colTarget).Value = result
-                End If
-            Next r
-        End If
-    Next i
-    For i = 0 To UBound(true_vars) Step 2
-        srcVar = true_vars(i)
-        matchPrefix = Left(srcVar, InStrRev(srcVar, "_") - 1)
-        colSource1 = GetCol(srcVar, ws)
-        colSourceCON = GetCol(true_vars(i + 1), ws)
-        If colSource1 > 0 And colSourceCON > 0 Then
-            For r = 2 To rowCount
-                checkVal = ws.Cells(r, colSource1).Value
-                val = NormalizeProb01(ws.Cells(r, colSourceCON).Value)
-                colTarget = 0
-                For j = 0 To UBound(target_headers)
-                    If Left(target_headers(j), InStr(target_headers(j), "_") - 1) = Left(matchPrefix, InStr(matchPrefix, "_") - 1) Then
-                        colTarget = GetCol(target_headers(j), ws)
-                        Exit For
-                    End If
-                Next j
-                If colTarget > 0 Then
-                    Dim truth2 As Variant
-                    truth2 = NormalizeTruth(checkVal)
-                    If IsNull(truth2) Or val < 0 Then
-                        result = CVErr(xlErrNA)
-                    ElseIf truth2 = True Then
-                        result = (val - 1#) ^ 2
-                    Else
-                        result = ((1# - val) - 1#) ^ 2
-                    End If
-                    ws.Cells(r, colTarget).Value = result
-                End If
-            Next r
-        End If
-    Next i
-End Sub
diff --git a/.history/eohi1/brierVARS_20250922130242.vb b/.history/eohi1/brierVARS_20250922130242.vb
deleted file mode 100644
index 1192e2f..0000000
diff --git a/.history/eohi1/brierVARS_20250922130423.vb b/.history/eohi1/brierVARS_20250922130423.vb
deleted file mode 100644
index ab96160..0000000
--- a/.history/eohi1/brierVARS_20250922130423.vb
+++ /dev/null
@@ -1,141 +0,0 @@
-Function GetCol(ByVal headerName As String, ByVal ws As Worksheet) As Long
-    Dim lastCol As Long
-    lastCol = ws.Cells(1, ws.Columns.Count).End(xlToLeft).Column
-    On Error Resume Next
-    Dim m As Variant
-    m = Application.Match(headerName, ws.Range(ws.Cells(1, 1), ws.Cells(1, lastCol)), 0)
-    On Error GoTo 0
-    If IsError(m) Then
-        GetCol = 0
-    Else
-        GetCol = CLng(m)
-    End If
-End Function
-
-Private Function NormalizeTruth(ByVal v As Variant) As Variant
-    ' Returns True/False or Null if cannot determine
-    If VarType(v) = vbBoolean Then
-        NormalizeTruth = v
-        Exit Function
-    End If
-    If IsError(v) Or IsEmpty(v) Then
-        NormalizeTruth = Null
-        Exit Function
-    End If
-    Dim s As String
-    s = Trim$(UCase$(CStr(v)))
-    Select Case s
-        Case "TRUE", "T", "1", "YES", "Y"
-            NormalizeTruth = True
-        Case "FALSE", "F", "0", "NO", "N"
-            NormalizeTruth = False
-        Case Else
-            NormalizeTruth = Null
-    End Select
-End Function
-
-Private Function NormalizeProb01(ByVal v As Variant) As Double
-    ' Converts confidence values to [0,1]
-    If IsError(v) Or IsEmpty(v) Then
-        NormalizeProb01 = -1
-        Exit Function
-    End If
-    Dim s As String
-    s = CStr(v)
-    If InStr(s, "%") > 0 Then
-        s = Replace$(s, "%", "")
-        If IsNumeric(s) Then
-            NormalizeProb01 = CDbl(s) / 100#
-            Exit Function
-        End If
-    End If
-    If IsNumeric(v) Then
-        Dim d As Double
-        d = CDbl(v)
-        If d > 1# Then
-            NormalizeProb01 = d / 100#
-        Else
-            NormalizeProb01 = d
-        End If
-    Else
-        NormalizeProb01 = -1
-    End If
-End Function
-
-Sub brierVARS()
-    Dim false_vars As Variant
-    Dim true_vars As Variant
-    false_vars = Array("moza_55_F_1", "moza_55_CON", "demo_15_F_1", "demo_15_CON", "hume_35_F_1", "hume_35_CON", "gulf_15_F_1", "gulf_15_CON", "memo_75_F_1", "memo_75_CON", "vitc_55_F_1", "vitc_55_CON", "hert_35_F_1", "hert_35_CON", "gees_55_F_1", "gees_55_CON", "gang_15_F_1", "gang_15_CON", "list_75_F_1", "list_75_CON", "mont_35_F_1", "mont_35_CON", "dwar_55_F_1", "dwar_55_CON", "pucc_15_F_1", "pucc_15_CON", "spee_75_F_1", "spee_75_CON", "lute_35_F_1", "lute_35_CON", "croc_75_F_1", "croc_75_CON")
-    true_vars = Array("vaud_15_T_1", "vaud_15_CON", "oedi_35_T_1", "oedi_35_CON", "mons_55_T_1", "mons_55_CON", "gest_75_T_1", "gest_75_CON", "kabu_15_T_1", "kabu_15_CON", "sham_55_T_1", "sham_55_CON", "pana_35_T_1", "pana_35_CON", "bohr_15_T_1", "bohr_15_CON", "chur_75_T_1", "chur_75_CON", "carb_35_T_1", "carb_35_CON", "cons_55_T_1", "cons_55_CON", "papy_75_T_1", "papy_75_CON", "dors_55_T_1", "dors_55_CON", "tsun_75_T_1", "tsun_75_CON", "troy_15_T_1", "troy_15_CON", "lock_35_T_1", "lock_35_CON")
-    Dim target_headers As Variant
-    target_headers = Array("gest_T_ex", "dors_T_ex", "chur_T_ex", "mons_T_ex", "lock_T_easy", "hume_F_easy", "papy_T_easy", "sham_T_easy", "list_F_easy", "cons_T_easy", "tsun_T_easy", "pana_T_easy", "kabu_T_easy", "gulf_F_easy", "oedi_T_easy", "vaud_T_easy", "mont_F_easy", "demo_F_easy", "spee_F_hard", "dwar_F_hard", "carb_T_hard", "bohr_T_hard", "gang_F_hard", "vitc_F_hard", "hert_F_hard", "pucc_F_hard", "troy_T_hard", "moza_F_hard", "croc_F_hard", "gees_F_hard", "lute_F_hard", "memo_F_hard")
-    Dim ws As Worksheet
-    Set ws = ThisWorkbook.Sheets(1)
-    Dim i As Long, rowCount As Long, colSource1 As Long, colSourceCON As Long, colTarget As Long
-    Dim srcVar As String, matchPrefix As String, checkVal As String, val As Double
-    Dim result As Variant
-    Dim r As Long, j As Long
-    rowCount = ws.Cells(ws.Rows.Count, "A").End(xlUp).Row
-    For i = 0 To UBound(false_vars) Step 2
-        srcVar = false_vars(i)
-        matchPrefix = Left(srcVar, InStrRev(srcVar, "_") - 1)
-        colSource1 = GetCol(srcVar, ws)
-        colSourceCON = GetCol(false_vars(i + 1), ws)
-        If colSource1 > 0 And colSourceCON > 0 Then
-            For r = 2 To rowCount
-                checkVal = ws.Cells(r, colSource1).Value
-                val = NormalizeProb01(ws.Cells(r, colSourceCON).Value)
-                colTarget = 0
-                For j = 0 To UBound(target_headers)
-                    If Left(target_headers(j), InStr(target_headers(j), "_") - 1) = Left(matchPrefix, InStr(matchPrefix, "_") - 1) Then
-                        colTarget = GetCol(target_headers(j), ws)
-                        Exit For
-                    End If
-                Next j
-                If colTarget > 0 Then
-                    Dim truth As Variant
-                    truth = NormalizeTruth(checkVal)
-                    If IsNull(truth) Or val < 0 Then
-                        result = CVErr(xlErrNA)
-                    ElseIf truth = True Then
-                        result = (val - 0#) ^ 2
-                    Else
-                        result = ((1# - val) - 0#) ^ 2
-                    End If
-                    ws.Cells(r, colTarget).Value = result
-                End If
-            Next r
-        End If
-    Next i
-    For i = 0 To UBound(true_vars) Step 2
-        srcVar = true_vars(i)
-        matchPrefix = Left(srcVar, InStrRev(srcVar, "_") - 1)
-        colSource1 = GetCol(srcVar, ws)
-        colSourceCON = GetCol(true_vars(i + 1), ws)
-        If colSource1 > 0 And colSourceCON > 0 Then
-            For r = 2 To rowCount
-                checkVal = ws.Cells(r, colSource1).Value
-                val = NormalizeProb01(ws.Cells(r, colSourceCON).Value)
-                colTarget = 0
-                For j = 0 To UBound(target_headers)
-                    If Left(target_headers(j), InStr(target_headers(j), "_") - 1) = Left(matchPrefix, InStr(matchPrefix, "_") - 1) Then
-                        colTarget = GetCol(target_headers(j), ws)
-                        Exit For
-                    End If
-                Next j
-                If colTarget > 0 Then
-                    Dim truth2 As Variant
-                    truth2 = NormalizeTruth(checkVal)
-                    If IsNull(truth2) Or val < 0 Then
-                        result = CVErr(xlErrNA)
-                    ElseIf truth2 = True Then
-                        result = (val - 1#) ^ 2
-                    Else
-                        result = ((1# - val) - 1#) ^ 2
-                    End If
-                    ws.Cells(r, colTarget).Value = result
-                End If
-            Next r
-        End If
-    Next i
-End Sub
diff --git a/.history/eohi1/brierVARS_20250922130433.vb b/.history/eohi1/brierVARS_20250922130433.vb
deleted file mode 100644
index ab96160..0000000
diff --git a/.history/eohi1/brierVARS_20250922130435.vb b/.history/eohi1/brierVARS_20250922130435.vb
deleted file mode 100644
index ab96160..0000000
diff --git a/.history/eohi1/brierVARS_20250922131020.vb b/.history/eohi1/brierVARS_20250922131020.vb
deleted file mode 100644
index ab96160..0000000
diff --git a/.history/eohi1/correlation matrix_20251027115053.r b/.history/eohi1/correlation matrix_20251027115053.r
deleted file mode 100644
index e69de29..0000000
diff --git a/.history/eohi1/correlation matrix_20251027115054.r b/.history/eohi1/correlation matrix_20251027115054.r
deleted file mode 100644
index 52ea22d..0000000
--- a/.history/eohi1/correlation matrix_20251027115054.r
+++ /dev/null
@@ -1,40 +0,0 @@
-options(scipen = 999)
-
-library(dplyr)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-df <- read.csv("ehi1.csv")
-
-data <- df %>%
-    select(eohiDGEN_mean, ehi_global_mean, demo_sex, demo_age_1, edu3, AOT_total, CRT_correct, CRT_int, bs_28, bs_easy, bs_hard, cal_selfActual, cal_global) %>%
-    filter(demo_sex != "Prefer not to say")
-
-str(data)
-colSums(is.na(data))
-sapply(data, class)
-
-# Create dummy variable for sex (0 = Male, 1 = Female)
-data$sex_dummy <- ifelse(data$demo_sex == "Female", 1, 0)
-
-# Verify the dummy coding
-print(table(data$demo_sex, data$sex_dummy))
-
-#descriptives
-
-# Descriptives for age
-print(summary(data$demo_age_1))
-print(sd(data$demo_age_1, na.rm = TRUE))
-
-# Center demo_age_1 (subtract the mean)
-data$age_centered <- data$demo_age_1 - mean(data$demo_age_1, na.rm = TRUE)
-
-# Verify the centering
-print(summary(data$age_centered))
-
-# Descriptives for sex (frequency table)
-print(table(data$demo_sex))
-print(prop.table(table(data$demo_sex)))
-
-# Descriptives for sex dummy variable
-print(table(data$sex_dummy))
\ No newline at end of file
diff --git a/.history/eohi1/correlation matrix_20251027115900.r b/.history/eohi1/correlation matrix_20251027115900.r
deleted file mode 100644
index 9b89a6a..0000000
--- a/.history/eohi1/correlation matrix_20251027115900.r
+++ /dev/null
@@ -1,50 +0,0 @@
-options(scipen = 999)
-
-library(dplyr)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-df <- read.csv("ehi1.csv")
-
-data <- df %>%
-    select(eohiDGEN_mean, ehi_global_mean, demo_sex, demo_age_1, edu3, AOT_total, CRT_correct, CRT_int, bs_28, bs_easy, bs_hard, cal_selfActual, cal_global) %>%
-    filter(demo_sex != "Prefer not to say")
-
-print(colSums(is.na(data)))
-print(sapply(data, class))
-
-# Create dummy variable for sex (0 = Male, 1 = Female)
-data$sex_dummy <- ifelse(data$demo_sex == "Female", 1, 0)
-
-# Verify the dummy coding
-print(table(data$demo_sex, data$sex_dummy))
-
-#descriptives
-
-# Descriptives for age
-print(summary(data$demo_age_1))
-print(sd(data$demo_age_1, na.rm = TRUE))
-
-# Center demo_age_1 (subtract the mean)
-data$age_centered <- data$demo_age_1 - mean(data$demo_age_1, na.rm = TRUE)
-
-# Verify the centering
-print(summary(data$age_centered))
-
-# Descriptives for sex (frequency table)
-print(table(data$demo_sex))
-print(prop.table(table(data$demo_sex)))
-
-# Descriptives for sex dummy variable
-print(table(data$sex_dummy))
-
-# Convert edu3 to numeric factor for correlations (1, 2, 3)
-# First ensure edu3 is a factor, then convert to numeric
-data$edu3 <- factor(data$edu3, levels = c("HS_TS", "C_Ug", "grad_prof"), ordered = TRUE)
-data$edu_num <- as.numeric(data$edu3)
-
-# Check the numeric conversion
-print(table(data$edu_num, useNA = "ifany"))
-
-# Verify the conversion
-print(table(data$edu3, data$edu_num, useNA = "ifany"))
\ No newline at end of file
diff --git a/.history/eohi1/correlation matrix_20251027115902.r b/.history/eohi1/correlation matrix_20251027115902.r
deleted file mode 100644
index 9b89a6a..0000000
diff --git a/.history/eohi1/correlation matrix_20251027115905.r b/.history/eohi1/correlation matrix_20251027115905.r
deleted file mode 100644
index 9b89a6a..0000000
diff --git a/.history/eohi1/correlation matrix_20251027120022.r b/.history/eohi1/correlation matrix_20251027120022.r
deleted file mode 100644
index 9b89a6a..0000000
diff --git a/.history/eohi1/correlation matrix_20251027120056.r b/.history/eohi1/correlation matrix_20251027120056.r
deleted file mode 100644
index 1843a3d..0000000
--- a/.history/eohi1/correlation matrix_20251027120056.r
+++ /dev/null
@@ -1,67 +0,0 @@
-options(scipen = 999)
-
-library(dplyr)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-df <- read.csv("ehi1.csv")
-
-data <- df %>%
-    select(eohiDGEN_mean, ehi_global_mean, demo_sex, demo_age_1, edu3, AOT_total, CRT_correct, CRT_int, bs_28, bs_easy, bs_hard, cal_selfActual, cal_global) %>%
-    filter(demo_sex != "Prefer not to say")
-
-print(colSums(is.na(data)))
-print(sapply(data, class))
-
-# Create dummy variable for sex (0 = Male, 1 = Female)
-data$sex_dummy <- ifelse(data$demo_sex == "Female", 1, 0)
-
-# Verify the dummy coding
-print(table(data$demo_sex, data$sex_dummy))
-
-#descriptives
-
-# Descriptives for age
-print(summary(data$demo_age_1))
-print(sd(data$demo_age_1, na.rm = TRUE))
-
-# Center demo_age_1 (subtract the mean)
-data$age_centered <- data$demo_age_1 - mean(data$demo_age_1, na.rm = TRUE)
-
-# Verify the centering
-print(summary(data$age_centered))
-
-# Descriptives for sex (frequency table)
-print(table(data$demo_sex))
-print(prop.table(table(data$demo_sex)))
-
-# Descriptives for sex dummy variable
-print(table(data$sex_dummy))
-
-# Convert edu3 to numeric factor for correlations (1, 2, 3)
-# First ensure edu3 is a factor, then convert to numeric
-data$edu3 <- factor(data$edu3, levels = c("HS_TS", "C_Ug", "grad_prof"), ordered = TRUE)
-data$edu_num <- as.numeric(data$edu3)
-
-# Check the numeric conversion
-print(table(data$edu_num, useNA = "ifany"))
-
-# Verify the conversion
-print(table(data$edu3, data$edu_num, useNA = "ifany"))
-
-####correlation matrix ####
-
-# Select numeric variables for correlation matrix
-numeric_vars <- data %>%
-    select(eohiDGEN_mean, ehi_global_mean, sex_dummy, age_centered, edu_num, AOT_total, CRT_correct, CRT_int, bs_28, bs_easy, bs_hard, cal_selfActual, cal_global)
-
-# Create correlation matrix
-cor_matrix <- cor(numeric_vars, use = "complete.obs")
-
-# Print correlation matrix
-print("Correlation Matrix:")
-print(round(cor_matrix, 3))
-
-# Save correlation matrix to CSV
-write.csv(cor_matrix, "correlation_matrix.csv", row.names = TRUE)
-print("Correlation matrix saved to correlation_matrix.csv")
\ No newline at end of file
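Because the deleted script drops incomplete cases via use = "complete.obs", a pairwise-deletion variant and a significance check are natural follow-ups. A sketch reusing the script's own numeric_vars; the extension itself is not in any snapshot:

    # Same matrix, but each correlation uses all rows available for that pair
    cor_pairwise <- cor(numeric_vars, use = "pairwise.complete.obs")
    print(round(cor_pairwise, 3))

    # p-value for one focal pair as a sanity check
    print(cor.test(numeric_vars$eohiDGEN_mean, numeric_vars$ehi_global_mean))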
diff --git a/.history/eohi1/correlation matrix_20251027120100.r b/.history/eohi1/correlation matrix_20251027120100.r
deleted file mode 100644
index 1843a3d..0000000
--- a/.history/eohi1/correlation matrix_20251027120100.r
+++ /dev/null
diff --git a/.history/eohi1/correlation matrix_20251027120122.r b/.history/eohi1/correlation matrix_20251027120122.r
deleted file mode 100644
index 1843a3d..0000000
--- a/.history/eohi1/correlation matrix_20251027120122.r
+++ /dev/null
diff --git a/.history/eohi1/correlation matrix_20251027120345.r b/.history/eohi1/correlation matrix_20251027120345.r
deleted file mode 100644
index 18da719..0000000
--- a/.history/eohi1/correlation matrix_20251027120345.r
+++ /dev/null
@@ -1,67 +0,0 @@
-options(scipen = 999)
-
-library(dplyr)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-df <- read.csv("ehi1.csv")
-
-data <- df %>%
-  select(eohiDGEN_mean, ehi_global_mean, demo_sex, demo_age_1, edu3, AOT_total, CRT_correct, CRT_int, bs_28, bs_easy, bs_hard, cal_selfActual, cal_global) %>%
-  filter(demo_sex != "Prefer not to say")
-
-print(colSums(is.na(data)))
-print(sapply(data, class))
-
-# Create dummy variable for sex (0 = Male, 1 = Female)
-data$sex_dummy <- ifelse(data$demo_sex == "Female", 1, 0)
-
-# Verify the dummy coding
-print(table(data$demo_sex, data$sex_dummy))
-
-#descriptives
-
-# Descriptives for age
-print(summary(data$demo_age_1))
-print(sd(data$demo_age_1, na.rm = TRUE))
-
-# Center demo_age_1 (subtract the mean)
-data$age_centered <- data$demo_age_1 - mean(data$demo_age_1, na.rm = TRUE)
-
-# Verify the centering
-print(summary(data$age_centered))
-
-# Descriptives for sex (frequency table)
-print(table(data$demo_sex))
-print(prop.table(table(data$demo_sex)))
-
-# Descriptives for sex dummy variable
-print(table(data$sex_dummy))
-
-# Convert edu3 to numeric factor for correlations (1, 2, 3)
-# First ensure edu3 is a factor, then convert to numeric
-data$edu3 <- factor(data$edu3, levels = c("HS_TS", "C_Ug", "grad_prof"), ordered = TRUE)
-data$edu_num <- as.numeric(data$edu3)
-
-# Check the numeric conversion
-print(table(data$edu_num, useNA = "ifany"))
-
-# Verify the conversion
-print(table(data$edu3, data$edu_num, useNA = "ifany"))
-
-####correlation matrix ####
-
-# Select numeric variables for correlation matrix
-numeric_vars <- data %>%
-  select(eohiDGEN_mean, ehi_global_mean, sex_dummy, demo_age_1, edu_num, AOT_total, CRT_correct, CRT_int, bs_28, bs_easy, bs_hard, cal_selfActual, cal_global)
-
-# Create Spearman correlation matrix
-cor_matrix <- cor(numeric_vars, use = "complete.obs", method = "spearman")
-
-# Print correlation matrix
-print("Correlation Matrix:")
-print(round(cor_matrix, 3))
-
-# Save correlation matrix to CSV
-write.csv(cor_matrix, "correlation_matrix.csv", row.names = TRUE)
-print("Correlation matrix saved to correlation_matrix.csv")
\ No newline at end of file
diff --git a/.history/eohi1/correlation matrix_20251027120348.r b/.history/eohi1/correlation matrix_20251027120348.r
deleted file mode 100644
index 18da719..0000000
--- a/.history/eohi1/correlation matrix_20251027120348.r
+++ /dev/null
-print("Correlation Matrix:") -print(round(cor_matrix, 3)) - -# Save correlation matrix to CSV -write.csv(cor_matrix, "correlation_matrix.csv", row.names = TRUE) -print("Correlation matrix saved to correlation_matrix.csv") \ No newline at end of file diff --git a/.history/eohi1/correlation matrix_20251027120351.r b/.history/eohi1/correlation matrix_20251027120351.r deleted file mode 100644 index 18da719..0000000 --- a/.history/eohi1/correlation matrix_20251027120351.r +++ /dev/null @@ -1,67 +0,0 @@ -options(scipen = 999) - -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -df <- read.csv("ehi1.csv") - -data <- df %>% - select(eohiDGEN_mean, ehi_global_mean, demo_sex, demo_age_1, edu3, AOT_total, CRT_correct, CRT_int, bs_28, bs_easy, bs_hard, cal_selfActual, cal_global) %>% - filter(demo_sex != "Prefer not to say") - -print(colSums(is.na(data))) -print(sapply(data, class)) - -# Create dummy variable for sex (0 = Male, 1 = Female) -data$sex_dummy <- ifelse(data$demo_sex == "Female", 1, 0) - -# Verify the dummy coding -print(table(data$demo_sex, data$sex_dummy)) - -#descriptives - -# Descriptives for age -print(summary(data$demo_age_1)) -print(sd(data$demo_age_1, na.rm = TRUE)) - -# Center demo_age_1 (subtract the mean) -data$age_centered <- data$demo_age_1 - mean(data$demo_age_1, na.rm = TRUE) - -# Verify the centering -print(summary(data$age_centered)) - -# Descriptives for sex (frequency table) -print(table(data$demo_sex)) -print(prop.table(table(data$demo_sex))) - -# Descriptives for sex dummy variable -print(table(data$sex_dummy)) - -# Convert edu3 to numeric factor for correlations (1, 2, 3) -# First ensure edu3 is a factor, then convert to numeric -data$edu3 <- factor(data$edu3, levels = c("HS_TS", "C_Ug", "grad_prof"), ordered = TRUE) -data$edu_num <- as.numeric(data$edu3) - -# Check the numeric conversion -print(table(data$edu_num, useNA = "ifany")) - -# Verify the conversion -print(table(data$edu3, data$edu_num, useNA = "ifany")) - -####correlation matrix #### - -# Select numeric variables for correlation matrix -numeric_vars <- data %>% - select(eohiDGEN_mean, ehi_global_mean, sex_dummy, demo_age_1, edu_num, AOT_total, CRT_correct, CRT_int, bs_28, bs_easy, bs_hard, cal_selfActual, cal_global) - -# Create Spearman correlation matrix -cor_matrix <- cor(numeric_vars, use = "complete.obs", method = "spearman") - -# Print correlation matrix -print("Correlation Matrix:") -print(round(cor_matrix, 3)) - -# Save correlation matrix to CSV -write.csv(cor_matrix, "correlation_matrix.csv", row.names = TRUE) -print("Correlation matrix saved to correlation_matrix.csv") \ No newline at end of file diff --git a/.history/eohi1/correlation matrix_20251027120448.r b/.history/eohi1/correlation matrix_20251027120448.r deleted file mode 100644 index 7389450..0000000 --- a/.history/eohi1/correlation matrix_20251027120448.r +++ /dev/null @@ -1,76 +0,0 @@ -options(scipen = 999) - -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -df <- read.csv("ehi1.csv") - -data <- df %>% - select(eohiDGEN_mean, ehi_global_mean, demo_sex, demo_age_1, edu3, AOT_total, CRT_correct, CRT_int, bs_28, bs_easy, bs_hard, cal_selfActual, cal_global) %>% - filter(demo_sex != "Prefer not to say") - -print(colSums(is.na(data))) -print(sapply(data, class)) - -# Create dummy variable for sex (0 = Male, 1 = Female) -data$sex_dummy <- ifelse(data$demo_sex == "Female", 1, 0) - -# Verify the dummy coding -print(table(data$demo_sex, data$sex_dummy)) - -#descriptives - -# Descriptives for age 
diff --git a/.history/eohi1/correlation matrix_20251027120450.r b/.history/eohi1/correlation matrix_20251027120450.r
deleted file mode 100644
index 7389450..0000000
--- a/.history/eohi1/correlation matrix_20251027120450.r
+++ /dev/null
diff --git a/.history/eohi1/correlation matrix_20251027120505.r b/.history/eohi1/correlation matrix_20251027120505.r
deleted file mode 100644
index 7389450..0000000
--- a/.history/eohi1/correlation matrix_20251027120505.r
+++ /dev/null
diff --git a/.history/eohi1/correlation matrix_20251027120720.r b/.history/eohi1/correlation matrix_20251027120720.r
deleted file mode 100644
index e097850..0000000
--- a/.history/eohi1/correlation matrix_20251027120720.r
+++ /dev/null
@@ -1,97 +0,0 @@
-options(scipen = 999)
-
-library(dplyr)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-df <- read.csv("ehi1.csv")
-
-data <- df %>%
-  select(eohiDGEN_mean, ehi_global_mean, demo_sex, demo_age_1, edu3, AOT_total, CRT_correct, CRT_int, bs_28, bs_easy, bs_hard, cal_selfActual, cal_global) %>%
-  filter(demo_sex != "Prefer not to say")
-
-print(colSums(is.na(data)))
-print(sapply(data, class))
-
-# Create dummy variable for sex (0 = Male, 1 = Female)
-data$sex_dummy <- ifelse(data$demo_sex == "Female", 1, 0)
-
-# Verify the dummy coding
-print(table(data$demo_sex, data$sex_dummy))
-
-#descriptives
-
-# Descriptives for age
-print(summary(data$demo_age_1))
-print(sd(data$demo_age_1, na.rm = TRUE))
-
-# Center demo_age_1 (subtract the mean)
-data$age_centered <- data$demo_age_1 - mean(data$demo_age_1, na.rm = TRUE)
-
-# Verify the centering
-print(summary(data$age_centered))
-
-# Descriptives for sex (frequency table)
-print(table(data$demo_sex))
-print(prop.table(table(data$demo_sex)))
-
-# Descriptives for sex dummy variable
-print(table(data$sex_dummy))
-
-# Convert edu3 to numeric factor for correlations (1, 2, 3)
-# First ensure edu3 is a factor, then convert to numeric
-data$edu3 <- factor(data$edu3, levels = c("HS_TS", "C_Ug", "grad_prof"), ordered = TRUE)
-data$edu_num <- as.numeric(data$edu3)
-
-# Check the numeric conversion
-print(table(data$edu_num, useNA = "ifany"))
-
-# Verify the conversion
-print(table(data$edu3, data$edu_num, useNA = "ifany"))
-
-####correlation matrix ####
-
-# Select numeric variables for correlation matrix
-numeric_vars <- data %>%
-  select(eohiDGEN_mean, ehi_global_mean, sex_dummy, demo_age_1, edu_num, AOT_total, CRT_correct, CRT_int, bs_28, bs_easy, bs_hard, cal_selfActual, cal_global)
-
-# Create Spearman correlation matrix
-cor_matrix <- cor(numeric_vars, use = "complete.obs", method = "spearman")
-
-# Print correlation matrix
-print(round(cor_matrix, 3))
-
-# Get significance tests for correlations using psych package
-library(psych)
-
-# Create correlation matrix with significance tests
-cor_test <- corr.test(numeric_vars, method = "spearman", adjust = "none")
-
-# Print correlation matrix
-print("Correlation Matrix:")
-print(round(cor_test$r, 3))
-
-# Print p-values
-print("\nP-values:")
-print(round(cor_test$p, 3))
-
-# Print significant correlations (p < 0.05)
-print("\nSignificant correlations (p < 0.05):")
-sig_cors <- which(cor_test$p < 0.05 & cor_test$p != 0, arr.ind = TRUE)
-if(nrow(sig_cors) > 0) {
-  for(i in 1:nrow(sig_cors)) {
-    row_idx <- sig_cors[i, 1]
-    col_idx <- sig_cors[i, 2]
-    if(row_idx != col_idx) { # Skip diagonal
-      cat(colnames(numeric_vars)[row_idx], "vs", colnames(numeric_vars)[col_idx],
-          ": r =", round(cor_test$r[row_idx, col_idx], 3),
-          ", p =", round(cor_test$p[row_idx, col_idx], 3), "\n")
-    }
-  }
-} else {
-  print("No significant correlations found.")
-}
-
-# Save correlation matrix to CSV
-write.csv(cor_matrix, "correlation_matrix.csv", row.names = TRUE)
-print("Correlation matrix saved to correlation_matrix.csv")
\ No newline at end of file
diff --git a/.history/eohi1/correlation matrix_20251027120722.r b/.history/eohi1/correlation matrix_20251027120722.r
deleted file mode 100644
index e097850..0000000
--- a/.history/eohi1/correlation matrix_20251027120722.r
+++ /dev/null
diff --git a/.history/eohi1/correlation matrix_20251027120752.r b/.history/eohi1/correlation matrix_20251027120752.r
deleted file mode 100644
index 97c8ede..0000000
--- a/.history/eohi1/correlation matrix_20251027120752.r
+++ /dev/null
@@ -1,99 +0,0 @@
-options(scipen = 999)
-
-library(dplyr)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-df <- read.csv("ehi1.csv")
-
-data <- df %>%
-  select(eohiDGEN_mean, ehi_global_mean, demo_sex, demo_age_1, edu3, AOT_total, CRT_correct, CRT_int, bs_28, bs_easy, bs_hard, cal_selfActual, cal_global) %>%
-  filter(demo_sex != "Prefer not to say")
-
-print(colSums(is.na(data)))
-print(sapply(data, class))
-
-# Create dummy variable for sex (0 = Male, 1 = Female)
-data$sex_dummy <- ifelse(data$demo_sex == "Female", 1, 0)
-
-# Verify the dummy coding
-print(table(data$demo_sex, data$sex_dummy))
-
-#descriptives
-
-# Descriptives for age
-print(summary(data$demo_age_1))
-print(sd(data$demo_age_1, na.rm = TRUE))
-
-# Center demo_age_1 (subtract the mean)
-data$age_centered <- data$demo_age_1 - mean(data$demo_age_1, na.rm = TRUE)
-
-# Verify the centering
-print(summary(data$age_centered))
-
-# Descriptives for sex (frequency table)
-print(table(data$demo_sex))
-print(prop.table(table(data$demo_sex)))
-
-# Descriptives for sex dummy variable
-print(table(data$sex_dummy))
-
-# Convert edu3 to numeric factor for correlations (1, 2, 3)
-# First ensure edu3 is a factor, then convert to numeric
-data$edu3 <- factor(data$edu3, levels = c("HS_TS", "C_Ug", "grad_prof"), ordered = TRUE)
-data$edu_num <- as.numeric(data$edu3)
-
-# Check the numeric conversion
-print(table(data$edu_num, useNA = "ifany"))
-
-# Verify the conversion
-print(table(data$edu3, data$edu_num, useNA = "ifany"))
-
-####correlation matrix ####
-
-# Select numeric variables for correlation matrix
-numeric_vars <- data %>%
-  select(eohiDGEN_mean, ehi_global_mean, sex_dummy, demo_age_1, edu_num, AOT_total, CRT_correct, CRT_int, bs_28, bs_easy, bs_hard, cal_selfActual, cal_global)
-
-# Create Spearman correlation matrix
-cor_matrix <- cor(numeric_vars, use = "complete.obs", method = "spearman")
-
-# Print correlation matrix
-print(round(cor_matrix, 3))
-
-# Get significance tests for correlations using psych package
-library(psych)
-
-# Create correlation matrix with significance tests
-cor_test <- corr.test(numeric_vars, method = "spearman", adjust = "none")
-
-# Print correlation matrix
-print("Correlation Matrix:")
-print(round(cor_test$r, 3))
-
-# Print p-values
-print("\nP-values:")
-print(round(cor_test$p, 3))
-
-# Print significant correlations (p < 0.05)
-print("\nSignificant correlations (p < 0.05):")
-sig_cors <- which(cor_test$p < 0.05 & cor_test$p != 0, arr.ind = TRUE)
-if(nrow(sig_cors) > 0) {
-  for(i in 1:nrow(sig_cors)) {
-    row_idx <- sig_cors[i, 1]
-    col_idx <- sig_cors[i, 2]
-    if(row_idx != col_idx) { # Skip diagonal
-      cat(colnames(numeric_vars)[row_idx], "vs", colnames(numeric_vars)[col_idx],
-          ": r =", round(cor_test$r[row_idx, col_idx], 3),
-          ", p =", round(cor_test$p[row_idx, col_idx], 3), "\n")
-    }
-  }
-} else {
-  print("No significant correlations found.")
-}
-
-# Save correlation matrix and p-values to CSV files
-write.csv(cor_test$r, "correlation_matrix.csv", row.names = TRUE)
-write.csv(cor_test$p, "correlation_pvalues.csv", row.names = TRUE)
-print("Correlation matrix saved to correlation_matrix.csv")
-print("P-values saved to correlation_pvalues.csv")
\ No newline at end of file
diff --git a/.history/eohi1/correlation matrix_20251027120754.r b/.history/eohi1/correlation matrix_20251027120754.r
deleted file mode 100644
index 97c8ede..0000000
--- a/.history/eohi1/correlation matrix_20251027120754.r
+++ /dev/null
diff --git a/.history/eohi1/correlation matrix_20251027120804.r b/.history/eohi1/correlation matrix_20251027120804.r
deleted file mode 100644
index 97c8ede..0000000
--- a/.history/eohi1/correlation matrix_20251027120804.r
+++ /dev/null
diff --git a/.history/eohi1/correlation matrix_20251027120919.r b/.history/eohi1/correlation matrix_20251027120919.r
deleted file mode 100644
index 156c379..0000000
--- a/.history/eohi1/correlation matrix_20251027120919.r
+++ /dev/null
@@ -1,111 +0,0 @@
-options(scipen = 999)
-
-library(dplyr)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-df <- read.csv("ehi1.csv")
-
-data <- df %>%
-  select(eohiDGEN_mean, ehi_global_mean, demo_sex, demo_age_1, edu3, AOT_total, CRT_correct, CRT_int, bs_28, bs_easy, bs_hard, cal_selfActual, cal_global) %>%
-  filter(demo_sex != "Prefer not to say")
-
-print(colSums(is.na(data)))
-print(sapply(data, class))
-
-# Create dummy variable for sex (0 = Male, 1 = Female)
-data$sex_dummy <- ifelse(data$demo_sex == "Female", 1, 0)
-
-# Verify the dummy coding
-print(table(data$demo_sex, data$sex_dummy))
-
-#descriptives
-
-# Descriptives for age
-print(summary(data$demo_age_1))
-print(sd(data$demo_age_1, na.rm = TRUE))
-
-# Center demo_age_1 (subtract the mean)
-data$age_centered <- data$demo_age_1 - mean(data$demo_age_1, na.rm = TRUE)
-
-# Verify the centering
-print(summary(data$age_centered))
-
-# Descriptives for sex (frequency table)
-print(table(data$demo_sex))
-print(prop.table(table(data$demo_sex)))
-
-# Descriptives for sex dummy variable
-print(table(data$sex_dummy))
-
-# Convert edu3 to numeric factor for correlations (1, 2, 3)
-# First ensure edu3 is a factor, then convert to numeric
-data$edu3 <- factor(data$edu3, levels = c("HS_TS", "C_Ug", "grad_prof"), ordered = TRUE)
-data$edu_num <- as.numeric(data$edu3)
-
-# Check the numeric conversion
-print(table(data$edu_num, useNA = "ifany"))
-
-# Verify the conversion
-print(table(data$edu3, data$edu_num, useNA = "ifany"))
-
-####correlation matrix ####
-
-# Select numeric variables for correlation matrix
-numeric_vars <- data %>%
-  select(eohiDGEN_mean, ehi_global_mean, sex_dummy, demo_age_1, edu_num, AOT_total, CRT_correct, CRT_int, bs_28, bs_easy, bs_hard, cal_selfActual, cal_global)
-
-# Create Spearman correlation matrix
-cor_matrix <- cor(numeric_vars, use = "complete.obs", method = "spearman")
-
-# Print correlation matrix
-print(round(cor_matrix, 3))
-
-# Get significance tests for correlations using psych package
-library(psych)
-
-# Create correlation matrix with significance tests
-cor_test <- corr.test(numeric_vars, method = "spearman", adjust = "none")
-
-# Print correlation matrix
-print("Correlation Matrix (Spearman r values):")
-print(round(cor_test$r, 3))
-
-# Print p-values
-print("\nP-values Matrix:")
-print(round(cor_test$p, 3))
-
-# Print all correlations with r and p values (for reporting)
-print("\nAll correlations with r and p values:")
-for(i in 1:nrow(cor_test$r)) {
-  for(j in 1:ncol(cor_test$r)) {
-    if(i != j) { # Skip diagonal
-      cat(colnames(numeric_vars)[i], "vs", colnames(numeric_vars)[j],
-          ": r =", round(cor_test$r[i, j], 3),
-          ", p =", round(cor_test$p[i, j], 3), "\n")
-    }
-  }
-}
-
-# Also print significant correlations summary
-print("\nSignificant correlations (p < 0.05):")
-sig_cors <- which(cor_test$p < 0.05 & cor_test$p != 0, arr.ind = TRUE)
-if(nrow(sig_cors) > 0) {
-  for(i in 1:nrow(sig_cors)) {
-    row_idx <- sig_cors[i, 1]
-    col_idx <- sig_cors[i, 2]
-    if(row_idx != col_idx) { # Skip diagonal
-      cat(colnames(numeric_vars)[row_idx], "vs", colnames(numeric_vars)[col_idx],
-          ": r =", round(cor_test$r[row_idx, col_idx], 3),
-          ", p =", round(cor_test$p[row_idx, col_idx], 3), "\n")
-    }
-  }
-} else {
-  print("No significant correlations found.")
-}
-
-# Save correlation matrix and p-values to CSV files
-write.csv(cor_test$r, "correlation_matrix.csv", row.names = TRUE)
-write.csv(cor_test$p, "correlation_pvalues.csv", row.names = TRUE)
-print("Correlation matrix saved to correlation_matrix.csv")
-print("P-values saved to correlation_pvalues.csv")
\ No newline at end of file
diff --git a/.history/eohi1/correlation matrix_20251027120930.r b/.history/eohi1/correlation matrix_20251027120930.r
deleted file mode 100644
index 156c379..0000000
--- a/.history/eohi1/correlation matrix_20251027120930.r
+++ /dev/null
diff --git a/.history/eohi1/correlation matrix_20251027120933.r b/.history/eohi1/correlation matrix_20251027120933.r
deleted file mode 100644
index 156c379..0000000
--- a/.history/eohi1/correlation matrix_20251027120933.r
+++ /dev/null
diff --git a/.history/eohi1/correlation matrix_20251027120955.r b/.history/eohi1/correlation matrix_20251027120955.r
deleted file mode 100644
index 4b1a8a7..0000000
--- a/.history/eohi1/correlation matrix_20251027120955.r
+++ /dev/null
@@ -1,105 +0,0 @@
-options(scipen = 999)
-
-library(dplyr)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-df <- read.csv("ehi1.csv")
-
-data <- df %>%
-  select(eohiDGEN_mean, ehi_global_mean, demo_sex, demo_age_1, edu3, AOT_total, CRT_correct, CRT_int, bs_28, bs_easy, bs_hard, cal_selfActual, cal_global) %>%
-  filter(demo_sex != "Prefer not to say")
-
-print(colSums(is.na(data)))
-print(sapply(data, class))
-
-# Create dummy variable for sex (0 = Male, 1 = Female)
-data$sex_dummy <- ifelse(data$demo_sex == "Female", 1, 0)
-
-# Verify the dummy coding
-print(table(data$demo_sex, data$sex_dummy))
-
-#descriptives
-
-# Descriptives for age
-print(summary(data$demo_age_1))
-print(sd(data$demo_age_1, na.rm = TRUE))
-
-# Center demo_age_1 (subtract the mean)
-data$age_centered <- data$demo_age_1 - mean(data$demo_age_1, na.rm = TRUE)
-
-# Verify the centering
-print(summary(data$age_centered))
-
-# Descriptives for sex (frequency table)
-print(table(data$demo_sex))
-print(prop.table(table(data$demo_sex)))
-
-# Descriptives for sex dummy variable
-print(table(data$sex_dummy))
-
-# Convert edu3 to numeric factor for correlations (1, 2, 3)
-# First ensure edu3 is a factor, then convert to numeric
-data$edu3 <- factor(data$edu3, levels = c("HS_TS", "C_Ug", "grad_prof"), ordered = TRUE)
-data$edu_num <- as.numeric(data$edu3)
-
-# Check the numeric conversion
-print(table(data$edu_num, useNA = "ifany"))
-
-# Verify the conversion
-print(table(data$edu3, data$edu_num, useNA = "ifany"))
-
-####correlation matrix ####
-
-# Select numeric variables for correlation matrix
-numeric_vars <- data %>%
-  select(eohiDGEN_mean, ehi_global_mean, sex_dummy, demo_age_1, edu_num, AOT_total, CRT_correct, CRT_int, bs_28, bs_easy, bs_hard, cal_selfActual, cal_global)
-
-# Create Spearman correlation matrix
-cor_matrix <- cor(numeric_vars, use = "complete.obs", method = "spearman")
-
-# Print correlation matrix
-print(round(cor_matrix, 3))
-
-# Get significance tests for correlations using psych package
-library(psych)
-
-# Create correlation matrix with significance tests
-cor_test <- corr.test(numeric_vars, method = "spearman", adjust = "none")
-
-# Print correlation matrix
-print(round(cor_test$r, 3))
-
-# Print p-values
-print(round(cor_test$p, 3))
-
-# Print all correlations with r and p values (for reporting)
-for(i in 1:nrow(cor_test$r)) {
-  for(j in 1:ncol(cor_test$r)) {
-    if(i != j) { # Skip diagonal
-      cat(colnames(numeric_vars)[i], "vs", colnames(numeric_vars)[j],
-          ": r =", round(cor_test$r[i, j], 3),
-          ", p =", round(cor_test$p[i, j], 3), "\n")
-    }
-  }
-}
-
-# Also print significant correlations summary
-sig_cors <- which(cor_test$p < 0.05 & cor_test$p != 0, arr.ind = TRUE)
-if(nrow(sig_cors) > 0) {
-  for(i in 1:nrow(sig_cors)) {
-    row_idx <- sig_cors[i, 1]
-    col_idx <- sig_cors[i, 2]
-    if(row_idx != col_idx) { # Skip diagonal
-      cat(colnames(numeric_vars)[row_idx], "vs", colnames(numeric_vars)[col_idx],
-          ": r =", round(cor_test$r[row_idx, col_idx], 3),
-          ", p =", round(cor_test$p[row_idx, col_idx], 3), "\n")
-    }
-  }
-}
-
-# Save correlation matrix and p-values to CSV files
-write.csv(cor_test$r, "correlation_matrix.csv", row.names = TRUE)
-write.csv(cor_test$p, "correlation_pvalues.csv", row.names = TRUE)
-print("Correlation matrix saved to correlation_matrix.csv")
-print("P-values saved to correlation_pvalues.csv")
\ No newline at end of file
diff --git a/.history/eohi1/correlation matrix_20251027120958.r b/.history/eohi1/correlation matrix_20251027120958.r
deleted file mode 100644
index 4b1a8a7..0000000
--- a/.history/eohi1/correlation matrix_20251027120958.r
+++ /dev/null
diff --git a/.history/eohi1/correlation matrix_20251027121016.r b/.history/eohi1/correlation matrix_20251027121016.r
deleted file mode 100644
index 4b1a8a7..0000000
--- a/.history/eohi1/correlation matrix_20251027121016.r
+++ /dev/null
colnames(numeric_vars)[col_idx], - ": r =", round(cor_test$r[row_idx, col_idx], 3), - ", p =", round(cor_test$p[row_idx, col_idx], 3), "\n") - } - } -} - -# Save correlation matrix and p-values to CSV files -write.csv(cor_test$r, "correlation_matrix.csv", row.names = TRUE) -write.csv(cor_test$p, "correlation_pvalues.csv", row.names = TRUE) -print("Correlation matrix saved to correlation_matrix.csv") -print("P-values saved to correlation_pvalues.csv") \ No newline at end of file diff --git a/.history/eohi1/correlation matrix_20251027134544.r b/.history/eohi1/correlation matrix_20251027134544.r deleted file mode 100644 index 4b1a8a7..0000000 --- a/.history/eohi1/correlation matrix_20251027134544.r +++ /dev/null @@ -1,105 +0,0 @@ -options(scipen = 999) - -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -df <- read.csv("ehi1.csv") - -data <- df %>% - select(eohiDGEN_mean, ehi_global_mean, demo_sex, demo_age_1, edu3, AOT_total, CRT_correct, CRT_int, bs_28, bs_easy, bs_hard, cal_selfActual, cal_global) %>% - filter(demo_sex != "Prefer not to say") - -print(colSums(is.na(data))) -print(sapply(data, class)) - -# Create dummy variable for sex (0 = Male, 1 = Female) -data$sex_dummy <- ifelse(data$demo_sex == "Female", 1, 0) - -# Verify the dummy coding -print(table(data$demo_sex, data$sex_dummy)) - -#descriptives - -# Descriptives for age -print(summary(data$demo_age_1)) -print(sd(data$demo_age_1, na.rm = TRUE)) - -# Center demo_age_1 (subtract the mean) -data$age_centered <- data$demo_age_1 - mean(data$demo_age_1, na.rm = TRUE) - -# Verify the centering -print(summary(data$age_centered)) - -# Descriptives for sex (frequency table) -print(table(data$demo_sex)) -print(prop.table(table(data$demo_sex))) - -# Descriptives for sex dummy variable -print(table(data$sex_dummy)) - -# Convert edu3 to numeric factor for correlations (1, 2, 3) -# First ensure edu3 is a factor, then convert to numeric -data$edu3 <- factor(data$edu3, levels = c("HS_TS", "C_Ug", "grad_prof"), ordered = TRUE) -data$edu_num <- as.numeric(data$edu3) - -# Check the numeric conversion -print(table(data$edu_num, useNA = "ifany")) - -# Verify the conversion -print(table(data$edu3, data$edu_num, useNA = "ifany")) - -####correlation matrix #### - -# Select numeric variables for correlation matrix -numeric_vars <- data %>% - select(eohiDGEN_mean, ehi_global_mean, sex_dummy, demo_age_1, edu_num, AOT_total, CRT_correct, CRT_int, bs_28, bs_easy, bs_hard, cal_selfActual, cal_global) - -# Create Spearman correlation matrix -cor_matrix <- cor(numeric_vars, use = "complete.obs", method = "spearman") - -# Print correlation matrix -print(round(cor_matrix, 3)) - -# Get significance tests for correlations using psych package -library(psych) - -# Create correlation matrix with significance tests -cor_test <- corr.test(numeric_vars, method = "spearman", adjust = "none") - -# Print correlation matrix -print(round(cor_test$r, 3)) - -# Print p-values -print(round(cor_test$p, 3)) - -# Print all correlations with r and p values (for reporting) -for(i in 1:nrow(cor_test$r)) { - for(j in 1:ncol(cor_test$r)) { - if(i != j) { # Skip diagonal - cat(colnames(numeric_vars)[i], "vs", colnames(numeric_vars)[j], - ": r =", round(cor_test$r[i, j], 3), - ", p =", round(cor_test$p[i, j], 3), "\n") - } - } -} - -# Also print significant correlations summary -sig_cors <- which(cor_test$p < 0.05 & cor_test$p != 0, arr.ind = TRUE) -if(nrow(sig_cors) > 0) { - for(i in 1:nrow(sig_cors)) { - row_idx <- sig_cors[i, 1] - col_idx <- sig_cors[i, 2] - if(row_idx 
diff --git a/.history/eohi1/correlation matrix_20251029115844.r b/.history/eohi1/correlation matrix_20251029115844.r
deleted file mode 100644
index d19a28e..0000000
--- a/.history/eohi1/correlation matrix_20251029115844.r
+++ /dev/null
@@ -1,103 +0,0 @@
-options(scipen = 999)
-
-library(dplyr)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-df <- read.csv("ehi1.csv")
-
-data <- df %>%
-  select(eohiDGEN_mean, ehi_global_mean, demo_sex, demo_age_1, edu3, AOT_total, CRT_correct, CRT_int, bs_28, bs_easy, bs_hard, cal_selfActual, cal_global) %>%
-  filter(demo_sex != "Prefer not to say")
-
-print(colSums(is.na(data)))
-print(sapply(data, class))
-
-# Create dummy variable for sex (0 = Male, 1 = Female)
-data$sex_dummy <- ifelse(data$demo_sex == "Female", 1, 0)
-
-# Verify the dummy coding
-print(table(data$demo_sex, data$sex_dummy))
-
-#descriptives
-
-# Descriptives for age
-print(summary(data$demo_age_1))
-print(sd(data$demo_age_1, na.rm = TRUE))
-
-# Center demo_age_1 (subtract the mean)
-data$age_centered <- data$demo_age_1 - mean(data$demo_age_1, na.rm = TRUE)
-
-# Verify the centering
-print(summary(data$age_centered))
-
-# Descriptives for sex (frequency table)
-print(table(data$demo_sex))
-print(prop.table(table(data$demo_sex)))
-
-# Descriptives for sex dummy variable
-print(table(data$sex_dummy))
-
-# Convert edu3 to numeric factor for correlations (1, 2, 3)
-# First ensure edu3 is a factor, then convert to numeric
-data$edu3 <- factor(data$edu3, levels = c("HS_TS", "C_Ug", "grad_prof"), ordered = TRUE)
-data$edu_num <- as.numeric(data$edu3)
-
-# Check the numeric conversion
-print(table(data$edu_num, useNA = "ifany"))
-
-# Verify the conversion
-print(table(data$edu3, data$edu_num, useNA = "ifany"))
-
-####correlation matrix ####
-
-# Select numeric variables for correlation matrix
-numeric_vars <- data %>%
-  select(eohiDGEN_mean, ehi_global_mean, sex_dummy, demo_age_1, edu_num, AOT_total, CRT_correct, CRT_int, bs_28, bs_easy, bs_hard, cal_selfActual, cal_global)
-
-# Create Spearman correlation matrix
-cor_matrix <- cor(numeric_vars, use = "complete.obs", method = "spearman")
-
-# Print correlation matrix
-print(round(cor_matrix, 3))
-
-# Get significance tests for correlations using psych package
-library(psych)
-
-# Create correlation matrix with significance tests
-cor_test <- corr.test(numeric_vars, method = "spearman", adjust = "none")
-
-# Print correlation matrix
-print(round(cor_test$r, 3))
-
-# Print p-values
-print(round(cor_test$p, 3))
-
-# Print all correlations with r and p values (for reporting)
-for(i in 1:nrow(cor_test$r)) {
-  for(j in 1:ncol(cor_test$r)) {
-    if(i != j) { # Skip diagonal
-      cat(colnames(numeric_vars)[i], "vs", colnames(numeric_vars)[j],
-          ": r =", round(cor_test$r[i, j], 3),
-          ", p =", round(cor_test$p[i, j], 3), "\n")
-    }
-  }
-}
-
-# Also print significant correlations summary
-sig_cors <- which(cor_test$p < 0.05 & cor_test$p != 0, arr.ind = TRUE)
-if(nrow(sig_cors) > 0) {
-  for(i in 1:nrow(sig_cors)) {
-    row_idx <- sig_cors[i, 1]
-    col_idx <- sig_cors[i, 2]
-    if(row_idx != col_idx) { # Skip diagonal
-      cat(colnames(numeric_vars)[row_idx], "vs", colnames(numeric_vars)[col_idx],
-          ": r =", round(cor_test$r[row_idx, col_idx], 3),
-          ", p =", round(cor_test$p[row_idx, col_idx], 3), "\n")
-    }
-  }
-}
-
-# Save correlation matrix and p-values to CSV files
-write.csv(cor_test$r, "correlation_matrix.csv", row.names = TRUE)
-write.csv(cor_test$p, "correlation_pvalues.csv", row.names = TRUE)
diff --git a/.history/eohi1/correlations - brier score x eohi and cal_20250922132833.r b/.history/eohi1/correlations - brier score x eohi and cal_20250922132833.r
deleted file mode 100644
index deade33..0000000
--- a/.history/eohi1/correlations - brier score x eohi and cal_20250922132833.r
+++ /dev/null
@@ -1,67 +0,0 @@
-# Load required libraries
-library(Hmisc)
-library(knitr)
-library(dplyr)
-library(corrr)
-library(broom)
-library(purrr)
-library(tidyr)
-library(tibble)
-library(boot)
-
-options(scipen = 999)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-# Load data
-df1 <- read.csv("exp1.csv")
-
-# Remove columns with all NA values
-df1 <- df1 %>% select(where(~ !all(is.na(.))))
-
-# --- Brier score correlations vs EOHIs and Calibration ---
-
-# Variables
-bs_vars <- c("bs_28", "bs_easy", "bs_hard")
-eohi_vars <- c(
-  "eohi_pref","eohi_pers","eohi_val","eohi_life","eohi_mean",
-  "eohiDGEN_pref","eohiDGEN_pers","eohiDGEN_val","eohiDGEN_life","eohiDGEN_mean"
-)
-cal_vars <- c("cal_selfActual","cal_global")
-
-# Helper: tidy correlation (Pearson), pairwise complete
-corr_tidy <- function(df, x_vars, y_vars) {
-  grid <- expand.grid(x = x_vars, y = y_vars, stringsAsFactors = FALSE)
-  results <- purrr::pmap_dfr(grid, function(x, y) {
-    xv <- df[[x]]; yv <- df[[y]]
-    ok <- is.finite(xv) & is.finite(yv)
-    if (sum(ok) < 3) {
-      return(tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = NA_real_, p = NA_real_))
-    }
-    ct <- suppressWarnings(cor.test(xv[ok], yv[ok], method = "pearson"))
-    tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = unname(ct$estimate), p = ct$p.value)
-  })
-  dplyr::arrange(results, var_x, var_y)
-}
-
-# Compute correlations
-corr_bs_eohi <- corr_tidy(df1, bs_vars, eohi_vars)
-corr_bs_cal <- corr_tidy(df1, bs_vars, cal_vars)
-
-# Wide r-only tables (optional)
-to_wide <- function(d) {
-  tidyr::pivot_wider(d, id_cols = var_x, names_from = var_y, values_from = r)
-}
-wide_bs_eohi <- to_wide(corr_bs_eohi)
-wide_bs_cal <- to_wide(corr_bs_cal)
-
-# Display
-print("Correlations: Brier vs EOHIs (r, p, n)")
-print(corr_bs_eohi)
-print("Correlations: Brier vs Calibration (r, p, n)")
-print(corr_bs_cal)
-
-# If you want to export CSVs, uncomment:
-# write.csv(corr_bs_eohi, "corr_bs_eohi.csv", row.names = FALSE)
-# write.csv(corr_bs_cal, "corr_bs_cal.csv", row.names = FALSE)
-# write.csv(wide_bs_eohi, "corr_bs_eohi_wide.csv", row.names = FALSE)
-# write.csv(wide_bs_cal, "corr_bs_cal_wide.csv", row.names = FALSE)
\ No newline at end of file
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -# Load data -df1 <- read.csv("exp1.csv") - -# Remove columns with all NA values -df1 <- df1 %>% select(where(~ !all(is.na(.)))) - -# --- Brier score correlations vs EOHIs and Calibration --- - -# Variables -bs_vars <- c("bs_28", "bs_easy", "bs_hard") -eohi_vars <- c( - "eohi_pref","eohi_pers","eohi_val","eohi_life","eohi_mean", - "eohiDGEN_pref","eohiDGEN_pers","eohiDGEN_val","eohiDGEN_life","eohiDGEN_mean" -) -cal_vars <- c("cal_selfActual","cal_global") - -# Helper: tidy correlation (Pearson), pairwise complete -corr_tidy <- function(df, x_vars, y_vars) { - grid <- expand.grid(x = x_vars, y = y_vars, stringsAsFactors = FALSE) - results <- purrr::pmap_dfr(grid, function(x, y) { - xv <- df[[x]]; yv <- df[[y]] - ok <- is.finite(xv) & is.finite(yv) - if (sum(ok) < 3) { - return(tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = NA_real_, p = NA_real_)) - } - ct <- suppressWarnings(cor.test(xv[ok], yv[ok], method = "pearson")) - tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = unname(ct$estimate), p = ct$p.value) - }) - dplyr::arrange(results, var_x, var_y) -} - -# Compute correlations -corr_bs_eohi <- corr_tidy(df1, bs_vars, eohi_vars) -corr_bs_cal <- corr_tidy(df1, bs_vars, cal_vars) - -# Wide r-only tables (optional) -to_wide <- function(d) { - tidyr::pivot_wider(d, id_cols = var_x, names_from = var_y, values_from = r) -} -wide_bs_eohi <- to_wide(corr_bs_eohi) -wide_bs_cal <- to_wide(corr_bs_cal) - -# Display -print("Correlations: Brier vs EOHIs (r, p, n)") -print(corr_bs_eohi) -print("Correlations: Brier vs Calibration (r, p, n)") -print(corr_bs_cal) - -# If you want to export CSVs, uncomment: -# write.csv(corr_bs_eohi, "corr_bs_eohi.csv", row.names = FALSE) -# write.csv(corr_bs_cal, "corr_bs_cal.csv", row.names = FALSE) -# write.csv(wide_bs_eohi, "corr_bs_eohi_wide.csv", row.names = FALSE) -# write.csv(wide_bs_cal, "corr_bs_cal_wide.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/correlations - brier score x eohi and cal_20250922132923.r b/.history/eohi1/correlations - brier score x eohi and cal_20250922132923.r deleted file mode 100644 index 6132380..0000000 --- a/.history/eohi1/correlations - brier score x eohi and cal_20250922132923.r +++ /dev/null @@ -1,74 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) -library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -# Load data -df1 <- read.csv("exp1.csv") - -# Keep only required columns for the analysis -bs_vars <- c("bs_28", "bs_easy", "bs_hard") -eohi_vars <- c( - "eohi_pref","eohi_pers","eohi_val","eohi_life","eohi_mean", - "eohiDGEN_pref","eohiDGEN_pers","eohiDGEN_val","eohiDGEN_life","eohiDGEN_mean" -) -cal_vars <- c("cal_selfActual","cal_global") - -df1 <- df1 %>% dplyr::select(dplyr::all_of(c(bs_vars, eohi_vars, cal_vars))) - -# --- Brier score correlations vs EOHIs and Calibration --- - -# Variables -bs_vars <- c("bs_28", "bs_easy", "bs_hard") -eohi_vars <- c( - "eohi_pref","eohi_pers","eohi_val","eohi_life","eohi_mean", - "eohiDGEN_pref","eohiDGEN_pers","eohiDGEN_val","eohiDGEN_life","eohiDGEN_mean" -) -cal_vars <- c("cal_selfActual","cal_global") - -# Helper: tidy correlation (Pearson), pairwise complete -corr_tidy <- function(df, x_vars, y_vars) { - grid <- expand.grid(x = x_vars, y = y_vars, stringsAsFactors = FALSE) - results <- purrr::pmap_dfr(grid, function(x, y) { - xv <- df[[x]]; yv 
<- df[[y]] - ok <- is.finite(xv) & is.finite(yv) - if (sum(ok) < 3) { - return(tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = NA_real_, p = NA_real_)) - } - ct <- suppressWarnings(cor.test(xv[ok], yv[ok], method = "pearson")) - tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = unname(ct$estimate), p = ct$p.value) - }) - dplyr::arrange(results, var_x, var_y) -} - -# Compute correlations -corr_bs_eohi <- corr_tidy(df1, bs_vars, eohi_vars) -corr_bs_cal <- corr_tidy(df1, bs_vars, cal_vars) - -# Wide r-only tables (optional) -to_wide <- function(d) { - tidyr::pivot_wider(d, id_cols = var_x, names_from = var_y, values_from = r) -} -wide_bs_eohi <- to_wide(corr_bs_eohi) -wide_bs_cal <- to_wide(corr_bs_cal) - -# Display -print("Correlations: Brier vs EOHIs (r, p, n)") -print(corr_bs_eohi) -print("Correlations: Brier vs Calibration (r, p, n)") -print(corr_bs_cal) - -# If you want to export CSVs, uncomment: -# write.csv(corr_bs_eohi, "corr_bs_eohi.csv", row.names = FALSE) -# write.csv(corr_bs_cal, "corr_bs_cal.csv", row.names = FALSE) -# write.csv(wide_bs_eohi, "corr_bs_eohi_wide.csv", row.names = FALSE) -# write.csv(wide_bs_cal, "corr_bs_cal_wide.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/correlations - brier score x eohi and cal_20250922132932.r b/.history/eohi1/correlations - brier score x eohi and cal_20250922132932.r deleted file mode 100644 index 6132380..0000000 --- a/.history/eohi1/correlations - brier score x eohi and cal_20250922132932.r +++ /dev/null @@ -1,74 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) -library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -# Load data -df1 <- read.csv("exp1.csv") - -# Keep only required columns for the analysis -bs_vars <- c("bs_28", "bs_easy", "bs_hard") -eohi_vars <- c( - "eohi_pref","eohi_pers","eohi_val","eohi_life","eohi_mean", - "eohiDGEN_pref","eohiDGEN_pers","eohiDGEN_val","eohiDGEN_life","eohiDGEN_mean" -) -cal_vars <- c("cal_selfActual","cal_global") - -df1 <- df1 %>% dplyr::select(dplyr::all_of(c(bs_vars, eohi_vars, cal_vars))) - -# --- Brier score correlations vs EOHIs and Calibration --- - -# Variables -bs_vars <- c("bs_28", "bs_easy", "bs_hard") -eohi_vars <- c( - "eohi_pref","eohi_pers","eohi_val","eohi_life","eohi_mean", - "eohiDGEN_pref","eohiDGEN_pers","eohiDGEN_val","eohiDGEN_life","eohiDGEN_mean" -) -cal_vars <- c("cal_selfActual","cal_global") - -# Helper: tidy correlation (Pearson), pairwise complete -corr_tidy <- function(df, x_vars, y_vars) { - grid <- expand.grid(x = x_vars, y = y_vars, stringsAsFactors = FALSE) - results <- purrr::pmap_dfr(grid, function(x, y) { - xv <- df[[x]]; yv <- df[[y]] - ok <- is.finite(xv) & is.finite(yv) - if (sum(ok) < 3) { - return(tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = NA_real_, p = NA_real_)) - } - ct <- suppressWarnings(cor.test(xv[ok], yv[ok], method = "pearson")) - tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = unname(ct$estimate), p = ct$p.value) - }) - dplyr::arrange(results, var_x, var_y) -} - -# Compute correlations -corr_bs_eohi <- corr_tidy(df1, bs_vars, eohi_vars) -corr_bs_cal <- corr_tidy(df1, bs_vars, cal_vars) - -# Wide r-only tables (optional) -to_wide <- function(d) { - tidyr::pivot_wider(d, id_cols = var_x, names_from = var_y, values_from = r) -} -wide_bs_eohi <- to_wide(corr_bs_eohi) -wide_bs_cal <- to_wide(corr_bs_cal) - -# Display -print("Correlations: 
diff --git a/.history/eohi1/correlations - brier score x eohi and cal_20250922133020.r b/.history/eohi1/correlations - brier score x eohi and cal_20250922133020.r
deleted file mode 100644
index 13dfc16..0000000
--- a/.history/eohi1/correlations - brier score x eohi and cal_20250922133020.r
+++ /dev/null
@@ -1,75 +0,0 @@
-# Load required libraries
-library(Hmisc)
-library(knitr)
-library(dplyr)
-library(corrr)
-library(broom)
-library(purrr)
-library(tidyr)
-library(tibble)
-library(boot)
-
-options(scipen = 999)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-# Load data
-df1 <- read.csv("exp1.csv")
-
-# Keep only required columns for the analysis
-bs_vars <- c("bs_28", "bs_easy", "bs_hard")
-eohi_vars <- c(
-  "eohi_pref","eohi_pers","eohi_val","eohi_life","eohi_mean",
-  "eohiDGEN_pref","eohiDGEN_pers","eohiDGEN_val","eohiDGEN_life","eohiDGEN_mean"
-)
-cal_vars <- c("cal_selfActual","cal_global")
-
-df1 <- df1 %>% dplyr::select(dplyr::all_of(c(bs_vars, eohi_vars, cal_vars)))
-
-# --- Brier score correlations vs EOHIs and Calibration ---
-
-# Variables
-bs_vars <- c("bs_28", "bs_easy", "bs_hard")
-eohi_vars <- c(
-  "eohi_pref","eohi_pers","eohi_val","eohi_life","eohi_mean",
-  "eohiDGEN_pref","eohiDGEN_pers","eohiDGEN_val","eohiDGEN_life","eohiDGEN_mean"
-)
-cal_vars <- c("cal_selfActual","cal_global")
-
-# Helper: tidy correlation (Pearson), pairwise complete
-corr_tidy <- function(df, x_vars, y_vars) {
-  grid <- expand.grid(x = x_vars, y = y_vars, stringsAsFactors = FALSE)
-  results <- purrr::pmap_dfr(grid, function(x, y) {
-    xv <- df[[x]]; yv <- df[[y]]
-    ok <- is.finite(xv) & is.finite(yv)
-    if (sum(ok) < 3) {
-      return(tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = NA_real_, p = NA_real_))
-    }
-    ct <- suppressWarnings(cor.test(xv[ok], yv[ok], method = "pearson"))
-    tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = unname(ct$estimate), p = ct$p.value)
-  })
-  dplyr::arrange(results, var_x, var_y)
-}
-
-# Compute correlations
-corr_bs_eohi <- corr_tidy(df1, bs_vars, eohi_vars)
-corr_bs_cal <- corr_tidy(df1, bs_vars, cal_vars)
-
-# Wide r-only tables (optional)
-to_wide <- function(d) {
-  tidyr::pivot_wider(d, id_cols = var_x, names_from = var_y, values_from = r)
-}
-wide_bs_eohi <- to_wide(corr_bs_eohi)
-wide_bs_cal <- to_wide(corr_bs_cal)
-
-# Display
-print("Correlations: Brier vs EOHIs (r, p, n)")
-print(corr_bs_eohi)
-print("Correlations: Brier vs Calibration (r, p, n)")
-print(corr_bs_cal)
-
-# Export a single CSV combining both sets
-corr_bs_eohi$group <- "EOHI"
-corr_bs_cal$group <- "Calibration"
-corr_all <- dplyr::bind_rows(corr_bs_eohi, corr_bs_cal) %>%
-  dplyr::relocate(group, .before = var_x)
-write.csv(corr_all, "corr_bs_all.csv", row.names = FALSE)
\ No newline at end of file
"bs_easy", "bs_hard") -eohi_vars <- c( - "eohi_pref","eohi_pers","eohi_val","eohi_life","eohi_mean", - "eohiDGEN_pref","eohiDGEN_pers","eohiDGEN_val","eohiDGEN_life","eohiDGEN_mean" -) -cal_vars <- c("cal_selfActual","cal_global") - -# Helper: tidy correlation (Pearson), pairwise complete -corr_tidy <- function(df, x_vars, y_vars) { - grid <- expand.grid(x = x_vars, y = y_vars, stringsAsFactors = FALSE) - results <- purrr::pmap_dfr(grid, function(x, y) { - xv <- df[[x]]; yv <- df[[y]] - ok <- is.finite(xv) & is.finite(yv) - if (sum(ok) < 3) { - return(tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = NA_real_, p = NA_real_)) - } - ct <- suppressWarnings(cor.test(xv[ok], yv[ok], method = "pearson")) - tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = unname(ct$estimate), p = ct$p.value) - }) - dplyr::arrange(results, var_x, var_y) -} - -# Compute correlations -corr_bs_eohi <- corr_tidy(df1, bs_vars, eohi_vars) -corr_bs_cal <- corr_tidy(df1, bs_vars, cal_vars) - -# Wide r-only tables (optional) -to_wide <- function(d) { - tidyr::pivot_wider(d, id_cols = var_x, names_from = var_y, values_from = r) -} -wide_bs_eohi <- to_wide(corr_bs_eohi) -wide_bs_cal <- to_wide(corr_bs_cal) - -# Display -print("Correlations: Brier vs EOHIs (r, p, n)") -print(corr_bs_eohi) -print("Correlations: Brier vs Calibration (r, p, n)") -print(corr_bs_cal) - -# Export a single CSV combining both sets -corr_bs_eohi$group <- "EOHI" -corr_bs_cal$group <- "Calibration" -corr_all <- dplyr::bind_rows(corr_bs_eohi, corr_bs_cal) %>% - dplyr::relocate(group, .before = var_x) -write.csv(corr_all, "corr_bs_all.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/correlations - brier score x eohi and cal_20250922133028.r b/.history/eohi1/correlations - brier score x eohi and cal_20250922133028.r deleted file mode 100644 index 13dfc16..0000000 --- a/.history/eohi1/correlations - brier score x eohi and cal_20250922133028.r +++ /dev/null @@ -1,75 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) -library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -# Load data -df1 <- read.csv("exp1.csv") - -# Keep only required columns for the analysis -bs_vars <- c("bs_28", "bs_easy", "bs_hard") -eohi_vars <- c( - "eohi_pref","eohi_pers","eohi_val","eohi_life","eohi_mean", - "eohiDGEN_pref","eohiDGEN_pers","eohiDGEN_val","eohiDGEN_life","eohiDGEN_mean" -) -cal_vars <- c("cal_selfActual","cal_global") - -df1 <- df1 %>% dplyr::select(dplyr::all_of(c(bs_vars, eohi_vars, cal_vars))) - -# --- Brier score correlations vs EOHIs and Calibration --- - -# Variables -bs_vars <- c("bs_28", "bs_easy", "bs_hard") -eohi_vars <- c( - "eohi_pref","eohi_pers","eohi_val","eohi_life","eohi_mean", - "eohiDGEN_pref","eohiDGEN_pers","eohiDGEN_val","eohiDGEN_life","eohiDGEN_mean" -) -cal_vars <- c("cal_selfActual","cal_global") - -# Helper: tidy correlation (Pearson), pairwise complete -corr_tidy <- function(df, x_vars, y_vars) { - grid <- expand.grid(x = x_vars, y = y_vars, stringsAsFactors = FALSE) - results <- purrr::pmap_dfr(grid, function(x, y) { - xv <- df[[x]]; yv <- df[[y]] - ok <- is.finite(xv) & is.finite(yv) - if (sum(ok) < 3) { - return(tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = NA_real_, p = NA_real_)) - } - ct <- suppressWarnings(cor.test(xv[ok], yv[ok], method = "pearson")) - tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = unname(ct$estimate), p = 
diff --git a/.history/eohi1/correlations - brier score x eohi and cal_20250922133336.r b/.history/eohi1/correlations - brier score x eohi and cal_20250922133336.r
deleted file mode 100644
index 9f32ebe..0000000
--- a/.history/eohi1/correlations - brier score x eohi and cal_20250922133336.r
+++ /dev/null
@@ -1,86 +0,0 @@
-# Load required libraries
-library(Hmisc)
-library(knitr)
-library(dplyr)
-library(corrr)
-library(broom)
-library(purrr)
-library(tidyr)
-library(tibble)
-library(boot)
-
-options(scipen = 999)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-# Load data
-df1 <- read.csv("exp1.csv")
-
-# Keep only required columns for the analysis
-bs_vars <- c("bs_28", "bs_easy", "bs_hard")
-eohi_vars <- c(
-  "eohi_pref","eohi_pers","eohi_val","eohi_life","eohi_mean",
-  "eohiDGEN_pref","eohiDGEN_pers","eohiDGEN_val","eohiDGEN_life","eohiDGEN_mean"
-)
-cal_vars <- c("cal_selfActual","cal_global")
-
-df1 <- df1 %>% dplyr::select(dplyr::all_of(c(bs_vars, eohi_vars, cal_vars)))
-
-# --- Brier score correlations vs EOHIs and Calibration ---
-
-# Variables
-bs_vars <- c("bs_28", "bs_easy", "bs_hard")
-eohi_vars <- c(
-  "eohi_pref","eohi_pers","eohi_val","eohi_life","eohi_mean",
-  "eohiDGEN_pref","eohiDGEN_pers","eohiDGEN_val","eohiDGEN_life","eohiDGEN_mean"
-)
-cal_vars <- c("cal_selfActual","cal_global")
-
-# Helper: tidy correlation (method = "pearson" or "spearman"), pairwise complete
-corr_tidy <- function(df, x_vars, y_vars, method = "pearson") {
-  grid <- expand.grid(x = x_vars, y = y_vars, stringsAsFactors = FALSE)
-  results <- purrr::pmap_dfr(grid, function(x, y) {
-    xv <- df[[x]]; yv <- df[[y]]
-    ok <- is.finite(xv) & is.finite(yv)
-    if (sum(ok) < 3) {
-      return(tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = NA_real_, p = NA_real_, method = method))
-    }
-    ct <- suppressWarnings(cor.test(xv[ok], yv[ok], method = method))
-    tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = unname(ct$estimate), p = ct$p.value, method = method)
-  })
-  dplyr::arrange(results, var_x, var_y)
-}
-
-# Compute correlations (Pearson)
-corr_bs_eohi <- corr_tidy(df1, bs_vars, eohi_vars, method = "pearson")
-corr_bs_cal <- corr_tidy(df1, bs_vars, cal_vars, method = "pearson")
-
-# Compute correlations (Spearman)
-corr_s_bs_eohi <- corr_tidy(df1, bs_vars, eohi_vars, method = "spearman")
-corr_s_bs_cal <- corr_tidy(df1, bs_vars, cal_vars, method = "spearman")
-
-# Wide r-only tables (optional)
-to_wide <- function(d) {
-  tidyr::pivot_wider(d, id_cols = var_x, names_from = var_y, values_from = r)
-}
-wide_bs_eohi <- to_wide(corr_bs_eohi)
-wide_bs_cal <- to_wide(corr_bs_cal)
-
-# Display
-print("Correlations: Brier vs EOHIs (Pearson r, p, n)")
-print(corr_bs_eohi)
-print("Correlations: Brier vs Calibration (Pearson r, p, n)")
-print(corr_bs_cal)
-print("Correlations: Brier vs EOHIs (Spearman rho, p, n)")
-print(corr_s_bs_eohi)
-print("Correlations: Brier vs Calibration (Spearman rho, p, n)")
-print(corr_s_bs_cal)
-
-# Export a single CSV combining both sets
-corr_bs_eohi$group <- "EOHI"
-corr_bs_cal$group <- "Calibration"
-corr_s_bs_eohi$group <- "EOHI"
-corr_s_bs_cal$group <- "Calibration"
-
-corr_all <- dplyr::bind_rows(corr_bs_eohi, corr_bs_cal, corr_s_bs_eohi, corr_s_bs_cal) %>%
-  dplyr::relocate(group, .before = var_x)
-write.csv(corr_all, "corr_bs_all.csv", row.names = FALSE)
\ No newline at end of file
-print("Correlations: Brier vs Calibration (Spearman rho, p, n)") -print(corr_s_bs_cal) - -# Export a single CSV combining both sets -corr_bs_eohi$group <- "EOHI" -corr_bs_cal$group <- "Calibration" -corr_s_bs_eohi$group <- "EOHI" -corr_s_bs_cal$group <- "Calibration" - -corr_all <- dplyr::bind_rows(corr_bs_eohi, corr_bs_cal, corr_s_bs_eohi, corr_s_bs_cal) %>% - dplyr::relocate(group, .before = var_x) -write.csv(corr_all, "corr_bs_all.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/correlations - brier score x eohi and cal_20250922133341.r b/.history/eohi1/correlations - brier score x eohi and cal_20250922133341.r deleted file mode 100644 index 9f32ebe..0000000 --- a/.history/eohi1/correlations - brier score x eohi and cal_20250922133341.r +++ /dev/null @@ -1,86 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) -library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -# Load data -df1 <- read.csv("exp1.csv") - -# Keep only required columns for the analysis -bs_vars <- c("bs_28", "bs_easy", "bs_hard") -eohi_vars <- c( - "eohi_pref","eohi_pers","eohi_val","eohi_life","eohi_mean", - "eohiDGEN_pref","eohiDGEN_pers","eohiDGEN_val","eohiDGEN_life","eohiDGEN_mean" -) -cal_vars <- c("cal_selfActual","cal_global") - -df1 <- df1 %>% dplyr::select(dplyr::all_of(c(bs_vars, eohi_vars, cal_vars))) - -# --- Brier score correlations vs EOHIs and Calibration --- - -# Variables -bs_vars <- c("bs_28", "bs_easy", "bs_hard") -eohi_vars <- c( - "eohi_pref","eohi_pers","eohi_val","eohi_life","eohi_mean", - "eohiDGEN_pref","eohiDGEN_pers","eohiDGEN_val","eohiDGEN_life","eohiDGEN_mean" -) -cal_vars <- c("cal_selfActual","cal_global") - -# Helper: tidy correlation (method = "pearson" or "spearman"), pairwise complete -corr_tidy <- function(df, x_vars, y_vars, method = "pearson") { - grid <- expand.grid(x = x_vars, y = y_vars, stringsAsFactors = FALSE) - results <- purrr::pmap_dfr(grid, function(x, y) { - xv <- df[[x]]; yv <- df[[y]] - ok <- is.finite(xv) & is.finite(yv) - if (sum(ok) < 3) { - return(tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = NA_real_, p = NA_real_, method = method)) - } - ct <- suppressWarnings(cor.test(xv[ok], yv[ok], method = method)) - tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = unname(ct$estimate), p = ct$p.value, method = method) - }) - dplyr::arrange(results, var_x, var_y) -} - -# Compute correlations (Pearson) -corr_bs_eohi <- corr_tidy(df1, bs_vars, eohi_vars, method = "pearson") -corr_bs_cal <- corr_tidy(df1, bs_vars, cal_vars, method = "pearson") - -# Compute correlations (Spearman) -corr_s_bs_eohi <- corr_tidy(df1, bs_vars, eohi_vars, method = "spearman") -corr_s_bs_cal <- corr_tidy(df1, bs_vars, cal_vars, method = "spearman") - -# Wide r-only tables (optional) -to_wide <- function(d) { - tidyr::pivot_wider(d, id_cols = var_x, names_from = var_y, values_from = r) -} -wide_bs_eohi <- to_wide(corr_bs_eohi) -wide_bs_cal <- to_wide(corr_bs_cal) - -# Display -print("Correlations: Brier vs EOHIs (Pearson r, p, n)") -print(corr_bs_eohi) -print("Correlations: Brier vs Calibration (Pearson r, p, n)") -print(corr_bs_cal) -print("Correlations: Brier vs EOHIs (Spearman rho, p, n)") -print(corr_s_bs_eohi) -print("Correlations: Brier vs Calibration (Spearman rho, p, n)") -print(corr_s_bs_cal) - -# Export a single CSV combining both sets -corr_bs_eohi$group <- "EOHI" -corr_bs_cal$group <- "Calibration" 
-corr_s_bs_eohi$group <- "EOHI" -corr_s_bs_cal$group <- "Calibration" - -corr_all <- dplyr::bind_rows(corr_bs_eohi, corr_bs_cal, corr_s_bs_eohi, corr_s_bs_cal) %>% - dplyr::relocate(group, .before = var_x) -write.csv(corr_all, "corr_bs_all.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/correlations - brier score x eohi and cal_20250922134044.r b/.history/eohi1/correlations - brier score x eohi and cal_20250922134044.r deleted file mode 100644 index 9f32ebe..0000000 --- a/.history/eohi1/correlations - brier score x eohi and cal_20250922134044.r +++ /dev/null @@ -1,86 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) -library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -# Load data -df1 <- read.csv("exp1.csv") - -# Keep only required columns for the analysis -bs_vars <- c("bs_28", "bs_easy", "bs_hard") -eohi_vars <- c( - "eohi_pref","eohi_pers","eohi_val","eohi_life","eohi_mean", - "eohiDGEN_pref","eohiDGEN_pers","eohiDGEN_val","eohiDGEN_life","eohiDGEN_mean" -) -cal_vars <- c("cal_selfActual","cal_global") - -df1 <- df1 %>% dplyr::select(dplyr::all_of(c(bs_vars, eohi_vars, cal_vars))) - -# --- Brier score correlations vs EOHIs and Calibration --- - -# Variables -bs_vars <- c("bs_28", "bs_easy", "bs_hard") -eohi_vars <- c( - "eohi_pref","eohi_pers","eohi_val","eohi_life","eohi_mean", - "eohiDGEN_pref","eohiDGEN_pers","eohiDGEN_val","eohiDGEN_life","eohiDGEN_mean" -) -cal_vars <- c("cal_selfActual","cal_global") - -# Helper: tidy correlation (method = "pearson" or "spearman"), pairwise complete -corr_tidy <- function(df, x_vars, y_vars, method = "pearson") { - grid <- expand.grid(x = x_vars, y = y_vars, stringsAsFactors = FALSE) - results <- purrr::pmap_dfr(grid, function(x, y) { - xv <- df[[x]]; yv <- df[[y]] - ok <- is.finite(xv) & is.finite(yv) - if (sum(ok) < 3) { - return(tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = NA_real_, p = NA_real_, method = method)) - } - ct <- suppressWarnings(cor.test(xv[ok], yv[ok], method = method)) - tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = unname(ct$estimate), p = ct$p.value, method = method) - }) - dplyr::arrange(results, var_x, var_y) -} - -# Compute correlations (Pearson) -corr_bs_eohi <- corr_tidy(df1, bs_vars, eohi_vars, method = "pearson") -corr_bs_cal <- corr_tidy(df1, bs_vars, cal_vars, method = "pearson") - -# Compute correlations (Spearman) -corr_s_bs_eohi <- corr_tidy(df1, bs_vars, eohi_vars, method = "spearman") -corr_s_bs_cal <- corr_tidy(df1, bs_vars, cal_vars, method = "spearman") - -# Wide r-only tables (optional) -to_wide <- function(d) { - tidyr::pivot_wider(d, id_cols = var_x, names_from = var_y, values_from = r) -} -wide_bs_eohi <- to_wide(corr_bs_eohi) -wide_bs_cal <- to_wide(corr_bs_cal) - -# Display -print("Correlations: Brier vs EOHIs (Pearson r, p, n)") -print(corr_bs_eohi) -print("Correlations: Brier vs Calibration (Pearson r, p, n)") -print(corr_bs_cal) -print("Correlations: Brier vs EOHIs (Spearman rho, p, n)") -print(corr_s_bs_eohi) -print("Correlations: Brier vs Calibration (Spearman rho, p, n)") -print(corr_s_bs_cal) - -# Export a single CSV combining both sets -corr_bs_eohi$group <- "EOHI" -corr_bs_cal$group <- "Calibration" -corr_s_bs_eohi$group <- "EOHI" -corr_s_bs_cal$group <- "Calibration" - -corr_all <- dplyr::bind_rows(corr_bs_eohi, corr_bs_cal, corr_s_bs_eohi, corr_s_bs_cal) %>% - dplyr::relocate(group, .before = 
diff --git a/.history/eohi1/correlations - brier score x eohi and cal_20251008160336.r b/.history/eohi1/correlations - brier score x eohi and cal_20251008160336.r
deleted file mode 100644
index 9591a6f..0000000
--- a/.history/eohi1/correlations - brier score x eohi and cal_20251008160336.r
+++ /dev/null
@@ -1,86 +0,0 @@
-# Load required libraries
-library(Hmisc)
-library(knitr)
-library(dplyr)
-library(corrr)
-library(broom)
-library(purrr)
-library(tidyr)
-library(tibble)
-library(boot)
-
-options(scipen = 999)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-# Load data
-df1 <- read.csv("ehi1.csv")
-
-# Keep only required columns for the analysis
-bs_vars <- c("bs_28", "bs_easy", "bs_hard")
-eohi_vars <- c(
-  "eohi_pref","eohi_pers","eohi_val","eohi_life","eohi_mean",
-  "eohiDGEN_pref","eohiDGEN_pers","eohiDGEN_val","eohiDGEN_life","eohiDGEN_mean"
-)
-cal_vars <- c("cal_selfActual","cal_global")
-
-df1 <- df1 %>% dplyr::select(dplyr::all_of(c(bs_vars, eohi_vars, cal_vars)))
-
-# --- Brier score correlations vs EOHIs and Calibration ---
-
-# Variables
-bs_vars <- c("bs_28", "bs_easy", "bs_hard")
-eohi_vars <- c(
-  "eohi_pref","eohi_pers","eohi_val","eohi_life","eohi_mean",
-  "eohiDGEN_pref","eohiDGEN_pers","eohiDGEN_val","eohiDGEN_life","eohiDGEN_mean"
-)
-cal_vars <- c("cal_selfActual","cal_global")
-
-# Helper: tidy correlation (method = "pearson" or "spearman"), pairwise complete
-corr_tidy <- function(df, x_vars, y_vars, method = "pearson") {
-  grid <- expand.grid(x = x_vars, y = y_vars, stringsAsFactors = FALSE)
-  results <- purrr::pmap_dfr(grid, function(x, y) {
-    xv <- df[[x]]; yv <- df[[y]]
-    ok <- is.finite(xv) & is.finite(yv)
-    if (sum(ok) < 3) {
-      return(tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = NA_real_, p = NA_real_, method = method))
-    }
-    ct <- suppressWarnings(cor.test(xv[ok], yv[ok], method = method))
-    tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = unname(ct$estimate), p = ct$p.value, method = method)
-  })
-  dplyr::arrange(results, var_x, var_y)
-}
-
-# Compute correlations (Pearson)
-corr_bs_eohi <- corr_tidy(df1, bs_vars, eohi_vars, method = "pearson")
-corr_bs_cal <- corr_tidy(df1, bs_vars, cal_vars, method = "pearson")
-
-# Compute correlations (Spearman)
-corr_s_bs_eohi <- corr_tidy(df1, bs_vars, eohi_vars, method = "spearman")
-corr_s_bs_cal <- corr_tidy(df1, bs_vars, cal_vars, method = "spearman")
-
-# Wide r-only tables (optional)
-to_wide <- function(d) {
-  tidyr::pivot_wider(d, id_cols = var_x, names_from = var_y, values_from = r)
-}
-wide_bs_eohi <- to_wide(corr_bs_eohi)
-wide_bs_cal <- to_wide(corr_bs_cal)
-
-# Display
-print("Correlations: Brier vs EOHIs (Pearson r, p, n)")
-print(corr_bs_eohi)
-print("Correlations: Brier vs Calibration (Pearson r, p, n)")
-print(corr_bs_cal)
-print("Correlations: Brier vs EOHIs (Spearman rho, p, n)")
-print(corr_s_bs_eohi)
-print("Correlations: Brier vs Calibration (Spearman rho, p, n)")
-print(corr_s_bs_cal)
-
-# Export a single CSV combining both sets
-corr_bs_eohi$group <- "EOHI"
-corr_bs_cal$group <- "Calibration"
-corr_s_bs_eohi$group <- "EOHI"
-corr_s_bs_cal$group <- "Calibration"
-
-corr_all <- dplyr::bind_rows(corr_bs_eohi, corr_bs_cal, corr_s_bs_eohi, corr_s_bs_cal) %>%
-  dplyr::relocate(group, .before = var_x)
-write.csv(corr_all, "corr_bs_all.csv", row.names = FALSE)
\ No newline at end of file
diff --git a/.history/eohi1/correlations - brier score x eohi and cal_20251008160455.r b/.history/eohi1/correlations - brier score x eohi and cal_20251008160455.r
deleted file mode 100644
index f471649..0000000
--- a/.history/eohi1/correlations - brier score x eohi and cal_20251008160455.r
+++ /dev/null
@@ -1,86 +0,0 @@
-# Load required libraries
-library(Hmisc)
-library(knitr)
-library(dplyr)
-library(corrr)
-library(broom)
-library(purrr)
-library(tidyr)
-library(tibble)
-library(boot)
-
-options(scipen = 999)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-# Load data
-df1 <- read.csv("ehi1.csv")
-
-# Keep only required columns for the analysis
-bs_vars <- c("bs_28", "bs_easy", "bs_hard")
-eohi_vars <- c(
-  "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean",
-  "ehi_pref_mean", "ehi_pers_mean", "ehi_val_mean", "ehi_life_mean", "ehi_global_mean"
-)
-cal_vars <- c("cal_selfActual","cal_global")
-
-df1 <- df1 %>% dplyr::select(dplyr::all_of(c(bs_vars, eohi_vars, cal_vars)))
-
-# --- Brier score correlations vs EOHIs and Calibration ---
-
-# Variables
-bs_vars <- c("bs_28", "bs_easy", "bs_hard")
-eohi_vars <- c(
-  "eohi_pref","eohi_pers","eohi_val","eohi_life","eohi_mean",
-  "eohiDGEN_pref","eohiDGEN_pers","eohiDGEN_val","eohiDGEN_life","eohiDGEN_mean"
-)
-cal_vars <- c("cal_selfActual","cal_global")
-
-# Helper: tidy correlation (method = "pearson" or "spearman"), pairwise complete
-corr_tidy <- function(df, x_vars, y_vars, method = "pearson") {
-  grid <- expand.grid(x = x_vars, y = y_vars, stringsAsFactors = FALSE)
-  results <- purrr::pmap_dfr(grid, function(x, y) {
-    xv <- df[[x]]; yv <- df[[y]]
-    ok <- is.finite(xv) & is.finite(yv)
-    if (sum(ok) < 3) {
-      return(tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = NA_real_, p = NA_real_, method = method))
-    }
-    ct <- suppressWarnings(cor.test(xv[ok], yv[ok], method = method))
-    tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = unname(ct$estimate), p = ct$p.value, method = method)
-  })
-  dplyr::arrange(results, var_x, var_y)
-}
-
-# Compute correlations (Pearson)
-corr_bs_eohi <- corr_tidy(df1, bs_vars, eohi_vars, method = "pearson")
-corr_bs_cal <- corr_tidy(df1, bs_vars, cal_vars, method = "pearson")
-
-# Compute correlations (Spearman)
-corr_s_bs_eohi <- corr_tidy(df1, bs_vars, eohi_vars, method = "spearman")
-corr_s_bs_cal <- corr_tidy(df1, bs_vars, cal_vars, method = "spearman")
-
-# Wide r-only tables (optional)
-to_wide <- function(d) {
-  tidyr::pivot_wider(d, id_cols = var_x, names_from = var_y, values_from = r)
-}
-wide_bs_eohi <- to_wide(corr_bs_eohi)
-wide_bs_cal <- to_wide(corr_bs_cal)
-
-# Display
-print("Correlations: Brier vs EOHIs (Pearson r, p, n)")
-print(corr_bs_eohi)
-print("Correlations: Brier vs Calibration (Pearson r, p, n)")
-print(corr_bs_cal)
-print("Correlations: Brier vs EOHIs (Spearman rho, p, n)")
-print(corr_s_bs_eohi)
-print("Correlations: Brier vs Calibration (Spearman rho, p, n)")
-print(corr_s_bs_cal)
-
-# Export a single CSV combining both sets
-corr_bs_eohi$group <- "EOHI"
-corr_bs_cal$group <- "Calibration"
-corr_s_bs_eohi$group <- "EOHI"
-corr_s_bs_cal$group <- "Calibration"
-
-corr_all <- dplyr::bind_rows(corr_bs_eohi, corr_bs_cal, corr_s_bs_eohi, corr_s_bs_cal) %>%
-  dplyr::relocate(group, .before = var_x)
-write.csv(corr_all, "corr_bs_all.csv", row.names = FALSE)
\ No newline at end of file
diff --git a/.history/eohi1/correlations - brier score x eohi and cal_20251008160500.r b/.history/eohi1/correlations - brier score x eohi and cal_20251008160500.r
deleted file mode 100644
index cb2b33f..0000000
--- a/.history/eohi1/correlations - brier score x eohi and cal_20251008160500.r
+++ /dev/null
@@ -1,86 +0,0 @@
-# Load required libraries
-library(Hmisc)
-library(knitr)
-library(dplyr)
-library(corrr)
-library(broom)
-library(purrr)
-library(tidyr)
-library(tibble)
-library(boot)
-
-options(scipen = 999)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-# Load data
-df1 <- read.csv("ehi1.csv")
-
-# Keep only required columns for the analysis
-bs_vars <- c("bs_28", "bs_easy", "bs_hard")
-eohi_vars <- c(
-  "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean",
-  "ehi_pref_mean", "ehi_pers_mean", "ehi_val_mean", "ehi_life_mean", "ehi_global_mean"
-)
-cal_vars <- c("cal_selfActual","cal_global")
-
-df1 <- df1 %>% dplyr::select(dplyr::all_of(c(bs_vars, eohi_vars, cal_vars)))
-
-# --- Brier score correlations vs EOHIs and Calibration ---
-
-# Variables
-bs_vars <- c("bs_28", "bs_easy", "bs_hard")
-eohi_vars <- c(
-  "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean",
-  "ehi_pref_mean", "ehi_pers_mean", "ehi_val_mean", "ehi_life_mean", "ehi_global_mean"
-)
-cal_vars <- c("cal_selfActual","cal_global")
-
-# Helper: tidy correlation (method = "pearson" or "spearman"), pairwise complete
-corr_tidy <- function(df, x_vars, y_vars, method = "pearson") {
-  grid <- expand.grid(x = x_vars, y = y_vars, stringsAsFactors = FALSE)
-  results <- purrr::pmap_dfr(grid, function(x, y) {
-    xv <- df[[x]]; yv <- df[[y]]
-    ok <- is.finite(xv) & is.finite(yv)
-    if (sum(ok) < 3) {
-      return(tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = NA_real_, p = NA_real_, method = method))
-    }
-    ct <- suppressWarnings(cor.test(xv[ok], yv[ok], method = method))
-    tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = unname(ct$estimate), p = ct$p.value, method = method)
-  })
-  dplyr::arrange(results, var_x, var_y)
-}
-
-# Compute correlations (Pearson)
-corr_bs_eohi <- corr_tidy(df1, bs_vars, eohi_vars, method = "pearson")
-corr_bs_cal <- corr_tidy(df1, bs_vars, cal_vars, method = "pearson")
-
-# Compute correlations (Spearman)
-corr_s_bs_eohi <- corr_tidy(df1, bs_vars, eohi_vars, method = "spearman")
-corr_s_bs_cal <- corr_tidy(df1, bs_vars, cal_vars, method = "spearman")
-
-# Wide r-only tables (optional)
-to_wide <- function(d) {
-  tidyr::pivot_wider(d, id_cols = var_x, names_from = var_y, values_from = r)
-}
-wide_bs_eohi <- to_wide(corr_bs_eohi)
-wide_bs_cal <- to_wide(corr_bs_cal)
-
-# Display
-print("Correlations: Brier vs EOHIs (Pearson r, p, n)")
-print(corr_bs_eohi)
-print("Correlations: Brier vs Calibration (Pearson r, p, n)")
-print(corr_bs_cal)
-print("Correlations: Brier vs EOHIs (Spearman rho, p, n)")
-print(corr_s_bs_eohi)
-print("Correlations: Brier vs Calibration (Spearman rho, p, n)")
-print(corr_s_bs_cal)
-
-# Export a single CSV combining both sets
-corr_bs_eohi$group <- "EOHI"
-corr_bs_cal$group <- "Calibration"
-corr_s_bs_eohi$group <- "EOHI"
-corr_s_bs_cal$group <- "Calibration"
-
-corr_all <- dplyr::bind_rows(corr_bs_eohi, corr_bs_cal, corr_s_bs_eohi, corr_s_bs_cal) %>%
-  dplyr::relocate(group, .before = var_x)
-write.csv(corr_all, "corr_bs_all.csv", row.names = FALSE)
\ No newline at end of file
diff --git a/.history/eohi1/correlations - brier score x eohi and cal_20251008160539.r b/.history/eohi1/correlations - brier score x eohi and cal_20251008160539.r
deleted file mode 100644
index 73f0756..0000000
--- a/.history/eohi1/correlations - brier score x eohi and cal_20251008160539.r
+++ /dev/null
@@ -1,76 +0,0 @@
-# Load required libraries
-library(Hmisc)
-library(knitr)
-library(dplyr)
-library(corrr)
-library(broom)
-library(purrr)
-library(tidyr)
-library(tibble)
-library(boot)
-
-options(scipen = 999)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-# Load data
-df1 <- read.csv("ehi1.csv")
-
-# Keep only required columns for the analysis
-bs_vars <- c("bs_28", "bs_easy", "bs_hard")
-eohi_vars <- c(
-  "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean",
-  "ehi_pref_mean", "ehi_pers_mean", "ehi_val_mean", "ehi_life_mean", "ehi_global_mean"
-)
-cal_vars <- c("cal_selfActual","cal_global")
-
-df1 <- df1 %>% dplyr::select(dplyr::all_of(c(bs_vars, eohi_vars, cal_vars)))
-
-# --- Brier score correlations vs EOHIs and Calibration ---
-
-# Variables
-bs_vars <- c("bs_28", "bs_easy", "bs_hard")
-eohi_vars <- c(
-  "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life",
"eohiDGEN_mean", - "ehi_pref_mean", "ehi_pers_mean", "ehi_val_mean", "ehi_life_mean", "ehi_global_mean" -) -cal_vars <- c("cal_selfActual","cal_global") - -# Helper: tidy correlation (method = "pearson" or "spearman"), pairwise complete -corr_tidy <- function(df, x_vars, y_vars, method = "pearson") { - grid <- expand.grid(x = x_vars, y = y_vars, stringsAsFactors = FALSE) - results <- purrr::pmap_dfr(grid, function(x, y) { - xv <- df[[x]]; yv <- df[[y]] - ok <- is.finite(xv) & is.finite(yv) - if (sum(ok) < 3) { - return(tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = NA_real_, p = NA_real_, method = method)) - } - ct <- suppressWarnings(cor.test(xv[ok], yv[ok], method = method)) - tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = unname(ct$estimate), p = ct$p.value, method = method) - }) - dplyr::arrange(results, var_x, var_y) -} - -# Compute correlations (Spearman only) -corr_bs_eohi <- corr_tidy(df1, bs_vars, eohi_vars, method = "spearman") -corr_bs_cal <- corr_tidy(df1, bs_vars, cal_vars, method = "spearman") - -# Wide r-only tables (optional) -to_wide <- function(d) { - tidyr::pivot_wider(d, id_cols = var_x, names_from = var_y, values_from = r) -} -wide_bs_eohi <- to_wide(corr_bs_eohi) -wide_bs_cal <- to_wide(corr_bs_cal) - -# Display -print("Correlations: Brier vs EOHIs (Spearman rho, p, n)") -print(corr_bs_eohi) -print("Correlations: Brier vs Calibration (Spearman rho, p, n)") -print(corr_bs_cal) - -# Export a single CSV combining both sets -corr_bs_eohi$group <- "EOHI" -corr_bs_cal$group <- "Calibration" - -corr_all <- dplyr::bind_rows(corr_bs_eohi, corr_bs_cal) %>% - dplyr::relocate(group, .before = var_x) -write.csv(corr_all, "corr_bs_all.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/correlations - brier score x eohi and cal_20251008160550.r b/.history/eohi1/correlations - brier score x eohi and cal_20251008160550.r deleted file mode 100644 index 73f0756..0000000 --- a/.history/eohi1/correlations - brier score x eohi and cal_20251008160550.r +++ /dev/null @@ -1,76 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) -library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -# Load data -df1 <- read.csv("ehi1.csv") - -# Keep only required columns for the analysis -bs_vars <- c("bs_28", "bs_easy", "bs_hard") -eohi_vars <- c( - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean", - "ehi_pref_mean", "ehi_pers_mean", "ehi_val_mean", "ehi_life_mean", "ehi_global_mean" -) -cal_vars <- c("cal_selfActual","cal_global") - -df1 <- df1 %>% dplyr::select(dplyr::all_of(c(bs_vars, eohi_vars, cal_vars))) - -# --- Brier score correlations vs EOHIs and Calibration --- - -# Variables -bs_vars <- c("bs_28", "bs_easy", "bs_hard") -eohi_vars <- c( - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean", - "ehi_pref_mean", "ehi_pers_mean", "ehi_val_mean", "ehi_life_mean", "ehi_global_mean" -) -cal_vars <- c("cal_selfActual","cal_global") - -# Helper: tidy correlation (method = "pearson" or "spearman"), pairwise complete -corr_tidy <- function(df, x_vars, y_vars, method = "pearson") { - grid <- expand.grid(x = x_vars, y = y_vars, stringsAsFactors = FALSE) - results <- purrr::pmap_dfr(grid, function(x, y) { - xv <- df[[x]]; yv <- df[[y]] - ok <- is.finite(xv) & is.finite(yv) - if (sum(ok) < 3) { - return(tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = 
NA_real_, p = NA_real_, method = method)) - } - ct <- suppressWarnings(cor.test(xv[ok], yv[ok], method = method)) - tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = unname(ct$estimate), p = ct$p.value, method = method) - }) - dplyr::arrange(results, var_x, var_y) -} - -# Compute correlations (Spearman only) -corr_bs_eohi <- corr_tidy(df1, bs_vars, eohi_vars, method = "spearman") -corr_bs_cal <- corr_tidy(df1, bs_vars, cal_vars, method = "spearman") - -# Wide r-only tables (optional) -to_wide <- function(d) { - tidyr::pivot_wider(d, id_cols = var_x, names_from = var_y, values_from = r) -} -wide_bs_eohi <- to_wide(corr_bs_eohi) -wide_bs_cal <- to_wide(corr_bs_cal) - -# Display -print("Correlations: Brier vs EOHIs (Spearman rho, p, n)") -print(corr_bs_eohi) -print("Correlations: Brier vs Calibration (Spearman rho, p, n)") -print(corr_bs_cal) - -# Export a single CSV combining both sets -corr_bs_eohi$group <- "EOHI" -corr_bs_cal$group <- "Calibration" - -corr_all <- dplyr::bind_rows(corr_bs_eohi, corr_bs_cal) %>% - dplyr::relocate(group, .before = var_x) -write.csv(corr_all, "corr_bs_all.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/correlations - brier score x eohi and cal_20251008162952.r b/.history/eohi1/correlations - brier score x eohi and cal_20251008162952.r deleted file mode 100644 index 73f0756..0000000 --- a/.history/eohi1/correlations - brier score x eohi and cal_20251008162952.r +++ /dev/null @@ -1,76 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) -library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -# Load data -df1 <- read.csv("ehi1.csv") - -# Keep only required columns for the analysis -bs_vars <- c("bs_28", "bs_easy", "bs_hard") -eohi_vars <- c( - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean", - "ehi_pref_mean", "ehi_pers_mean", "ehi_val_mean", "ehi_life_mean", "ehi_global_mean" -) -cal_vars <- c("cal_selfActual","cal_global") - -df1 <- df1 %>% dplyr::select(dplyr::all_of(c(bs_vars, eohi_vars, cal_vars))) - -# --- Brier score correlations vs EOHIs and Calibration --- - -# Variables -bs_vars <- c("bs_28", "bs_easy", "bs_hard") -eohi_vars <- c( - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean", - "ehi_pref_mean", "ehi_pers_mean", "ehi_val_mean", "ehi_life_mean", "ehi_global_mean" -) -cal_vars <- c("cal_selfActual","cal_global") - -# Helper: tidy correlation (method = "pearson" or "spearman"), pairwise complete -corr_tidy <- function(df, x_vars, y_vars, method = "pearson") { - grid <- expand.grid(x = x_vars, y = y_vars, stringsAsFactors = FALSE) - results <- purrr::pmap_dfr(grid, function(x, y) { - xv <- df[[x]]; yv <- df[[y]] - ok <- is.finite(xv) & is.finite(yv) - if (sum(ok) < 3) { - return(tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = NA_real_, p = NA_real_, method = method)) - } - ct <- suppressWarnings(cor.test(xv[ok], yv[ok], method = method)) - tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = unname(ct$estimate), p = ct$p.value, method = method) - }) - dplyr::arrange(results, var_x, var_y) -} - -# Compute correlations (Spearman only) -corr_bs_eohi <- corr_tidy(df1, bs_vars, eohi_vars, method = "spearman") -corr_bs_cal <- corr_tidy(df1, bs_vars, cal_vars, method = "spearman") - -# Wide r-only tables (optional) -to_wide <- function(d) { - tidyr::pivot_wider(d, id_cols = var_x, names_from = var_y, 
values_from = r) -} -wide_bs_eohi <- to_wide(corr_bs_eohi) -wide_bs_cal <- to_wide(corr_bs_cal) - -# Display -print("Correlations: Brier vs EOHIs (Spearman rho, p, n)") -print(corr_bs_eohi) -print("Correlations: Brier vs Calibration (Spearman rho, p, n)") -print(corr_bs_cal) - -# Export a single CSV combining both sets -corr_bs_eohi$group <- "EOHI" -corr_bs_cal$group <- "Calibration" - -corr_all <- dplyr::bind_rows(corr_bs_eohi, corr_bs_cal) %>% - dplyr::relocate(group, .before = var_x) -write.csv(corr_all, "corr_bs_all.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/correlations - brier score x eohi and cal_20251008185636.r b/.history/eohi1/correlations - brier score x eohi and cal_20251008185636.r deleted file mode 100644 index b2dfd5d..0000000 --- a/.history/eohi1/correlations - brier score x eohi and cal_20251008185636.r +++ /dev/null @@ -1,81 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) -library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -# Load data -df1 <- read.csv("ehi1.csv") - -# Keep only required columns for the analysis -bs_vars <- c("bs_28", "bs_easy", "bs_hard") -eohi_vars <- c( - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean", - "ehi_pref_mean", "ehi_pers_mean", "ehi_val_mean", "ehi_life_mean", "ehi_global_mean" -) -cal_vars <- c("cal_selfActual","cal_global") - -df1 <- df1 %>% dplyr::select(dplyr::all_of(c(bs_vars, eohi_vars, cal_vars))) - -# --- Brier score correlations vs EOHIs and Calibration --- - -# Variables -bs_vars <- c("bs_28", "bs_easy", "bs_hard") -eohi_vars <- c( - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean", - "ehi_pref_mean", "ehi_pers_mean", "ehi_val_mean", "ehi_life_mean", "ehi_global_mean" -) -cal_vars <- c("cal_selfActual","cal_global") - -# Helper: tidy correlation (method = "pearson" or "spearman"), pairwise complete -corr_tidy <- function(df, x_vars, y_vars, method = "pearson") { - grid <- expand.grid(x = x_vars, y = y_vars, stringsAsFactors = FALSE) - results <- purrr::pmap_dfr(grid, function(x, y) { - xv <- df[[x]]; yv <- df[[y]] - ok <- is.finite(xv) & is.finite(yv) - if (sum(ok) < 3) { - return(tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = NA_real_, p = NA_real_, method = method)) - } - ct <- suppressWarnings(cor.test(xv[ok], yv[ok], method = method)) - tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = unname(ct$estimate), p = ct$p.value, method = method) - }) - dplyr::arrange(results, var_x, var_y) -} - -# Compute correlations (Spearman only) -corr_bs_eohi <- corr_tidy(df1, bs_vars, eohi_vars, method = "spearman") -corr_bs_cal <- corr_tidy(df1, bs_vars, cal_vars, method = "spearman") -corr_cal_eohi <- corr_tidy(df1, cal_vars, eohi_vars, method = "spearman") - -# Wide r-only tables (optional) -to_wide <- function(d) { - tidyr::pivot_wider(d, id_cols = var_x, names_from = var_y, values_from = r) -} -wide_bs_eohi <- to_wide(corr_bs_eohi) -wide_bs_cal <- to_wide(corr_bs_cal) -wide_cal_eohi <- to_wide(corr_cal_eohi) - -# Display -print("Correlations: Brier vs EOHIs (Spearman rho, p, n)") -print(corr_bs_eohi) -print("Correlations: Brier vs Calibration (Spearman rho, p, n)") -print(corr_bs_cal) -print("Correlations: Calibration vs EOHIs (Spearman rho, p, n)") -print(corr_cal_eohi) - -# Export a single CSV combining all sets -corr_bs_eohi$group <- "BS_vs_EOHI" -corr_bs_cal$group <- 
"BS_vs_Cal" -corr_cal_eohi$group <- "Cal_vs_EOHI" - -corr_all <- dplyr::bind_rows(corr_bs_eohi, corr_bs_cal, corr_cal_eohi) %>% - dplyr::relocate(group, .before = var_x) -write.csv(corr_all, "corr_bs_all.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/correlations - brier score x eohi and cal_20251008185645.r b/.history/eohi1/correlations - brier score x eohi and cal_20251008185645.r deleted file mode 100644 index b2dfd5d..0000000 --- a/.history/eohi1/correlations - brier score x eohi and cal_20251008185645.r +++ /dev/null @@ -1,81 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) -library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -# Load data -df1 <- read.csv("ehi1.csv") - -# Keep only required columns for the analysis -bs_vars <- c("bs_28", "bs_easy", "bs_hard") -eohi_vars <- c( - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean", - "ehi_pref_mean", "ehi_pers_mean", "ehi_val_mean", "ehi_life_mean", "ehi_global_mean" -) -cal_vars <- c("cal_selfActual","cal_global") - -df1 <- df1 %>% dplyr::select(dplyr::all_of(c(bs_vars, eohi_vars, cal_vars))) - -# --- Brier score correlations vs EOHIs and Calibration --- - -# Variables -bs_vars <- c("bs_28", "bs_easy", "bs_hard") -eohi_vars <- c( - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean", - "ehi_pref_mean", "ehi_pers_mean", "ehi_val_mean", "ehi_life_mean", "ehi_global_mean" -) -cal_vars <- c("cal_selfActual","cal_global") - -# Helper: tidy correlation (method = "pearson" or "spearman"), pairwise complete -corr_tidy <- function(df, x_vars, y_vars, method = "pearson") { - grid <- expand.grid(x = x_vars, y = y_vars, stringsAsFactors = FALSE) - results <- purrr::pmap_dfr(grid, function(x, y) { - xv <- df[[x]]; yv <- df[[y]] - ok <- is.finite(xv) & is.finite(yv) - if (sum(ok) < 3) { - return(tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = NA_real_, p = NA_real_, method = method)) - } - ct <- suppressWarnings(cor.test(xv[ok], yv[ok], method = method)) - tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = unname(ct$estimate), p = ct$p.value, method = method) - }) - dplyr::arrange(results, var_x, var_y) -} - -# Compute correlations (Spearman only) -corr_bs_eohi <- corr_tidy(df1, bs_vars, eohi_vars, method = "spearman") -corr_bs_cal <- corr_tidy(df1, bs_vars, cal_vars, method = "spearman") -corr_cal_eohi <- corr_tidy(df1, cal_vars, eohi_vars, method = "spearman") - -# Wide r-only tables (optional) -to_wide <- function(d) { - tidyr::pivot_wider(d, id_cols = var_x, names_from = var_y, values_from = r) -} -wide_bs_eohi <- to_wide(corr_bs_eohi) -wide_bs_cal <- to_wide(corr_bs_cal) -wide_cal_eohi <- to_wide(corr_cal_eohi) - -# Display -print("Correlations: Brier vs EOHIs (Spearman rho, p, n)") -print(corr_bs_eohi) -print("Correlations: Brier vs Calibration (Spearman rho, p, n)") -print(corr_bs_cal) -print("Correlations: Calibration vs EOHIs (Spearman rho, p, n)") -print(corr_cal_eohi) - -# Export a single CSV combining all sets -corr_bs_eohi$group <- "BS_vs_EOHI" -corr_bs_cal$group <- "BS_vs_Cal" -corr_cal_eohi$group <- "Cal_vs_EOHI" - -corr_all <- dplyr::bind_rows(corr_bs_eohi, corr_bs_cal, corr_cal_eohi) %>% - dplyr::relocate(group, .before = var_x) -write.csv(corr_all, "corr_bs_all.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/correlations - brier score x eohi and 
cal_20251008185658.r b/.history/eohi1/correlations - brier score x eohi and cal_20251008185658.r deleted file mode 100644 index b2dfd5d..0000000 --- a/.history/eohi1/correlations - brier score x eohi and cal_20251008185658.r +++ /dev/null @@ -1,81 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) -library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -# Load data -df1 <- read.csv("ehi1.csv") - -# Keep only required columns for the analysis -bs_vars <- c("bs_28", "bs_easy", "bs_hard") -eohi_vars <- c( - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean", - "ehi_pref_mean", "ehi_pers_mean", "ehi_val_mean", "ehi_life_mean", "ehi_global_mean" -) -cal_vars <- c("cal_selfActual","cal_global") - -df1 <- df1 %>% dplyr::select(dplyr::all_of(c(bs_vars, eohi_vars, cal_vars))) - -# --- Brier score correlations vs EOHIs and Calibration --- - -# Variables -bs_vars <- c("bs_28", "bs_easy", "bs_hard") -eohi_vars <- c( - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean", - "ehi_pref_mean", "ehi_pers_mean", "ehi_val_mean", "ehi_life_mean", "ehi_global_mean" -) -cal_vars <- c("cal_selfActual","cal_global") - -# Helper: tidy correlation (method = "pearson" or "spearman"), pairwise complete -corr_tidy <- function(df, x_vars, y_vars, method = "pearson") { - grid <- expand.grid(x = x_vars, y = y_vars, stringsAsFactors = FALSE) - results <- purrr::pmap_dfr(grid, function(x, y) { - xv <- df[[x]]; yv <- df[[y]] - ok <- is.finite(xv) & is.finite(yv) - if (sum(ok) < 3) { - return(tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = NA_real_, p = NA_real_, method = method)) - } - ct <- suppressWarnings(cor.test(xv[ok], yv[ok], method = method)) - tibble::tibble(var_x = x, var_y = y, n = sum(ok), r = unname(ct$estimate), p = ct$p.value, method = method) - }) - dplyr::arrange(results, var_x, var_y) -} - -# Compute correlations (Spearman only) -corr_bs_eohi <- corr_tidy(df1, bs_vars, eohi_vars, method = "spearman") -corr_bs_cal <- corr_tidy(df1, bs_vars, cal_vars, method = "spearman") -corr_cal_eohi <- corr_tidy(df1, cal_vars, eohi_vars, method = "spearman") - -# Wide r-only tables (optional) -to_wide <- function(d) { - tidyr::pivot_wider(d, id_cols = var_x, names_from = var_y, values_from = r) -} -wide_bs_eohi <- to_wide(corr_bs_eohi) -wide_bs_cal <- to_wide(corr_bs_cal) -wide_cal_eohi <- to_wide(corr_cal_eohi) - -# Display -print("Correlations: Brier vs EOHIs (Spearman rho, p, n)") -print(corr_bs_eohi) -print("Correlations: Brier vs Calibration (Spearman rho, p, n)") -print(corr_bs_cal) -print("Correlations: Calibration vs EOHIs (Spearman rho, p, n)") -print(corr_cal_eohi) - -# Export a single CSV combining all sets -corr_bs_eohi$group <- "BS_vs_EOHI" -corr_bs_cal$group <- "BS_vs_Cal" -corr_cal_eohi$group <- "Cal_vs_EOHI" - -corr_all <- dplyr::bind_rows(corr_bs_eohi, corr_bs_cal, corr_cal_eohi) %>% - dplyr::relocate(group, .before = var_x) -write.csv(corr_all, "corr_bs_all.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/correlations - eohi x calibration_20250915134710.r b/.history/eohi1/correlations - eohi x calibration_20250915134710.r deleted file mode 100644 index afe6d33..0000000 --- a/.history/eohi1/correlations - eohi x calibration_20250915134710.r +++ /dev/null @@ -1,304 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) 
-library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -# Load data -df1 <- read.csv("exp1.csv") - -# Remove columns with all NA values -df1 <- df1 %>% select(where(~ !all(is.na(.)))) - -# Select variables of interest -eohi_vars <- c("eohi_pref", "eohi_pers", "eohi_val", "eohi_life", "eohi_mean", - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean") -cal_vars <- c("cal_selfActual", "cal_global") - -# Create dataset with selected variables -df <- df1[, c(eohi_vars, cal_vars)] - -# Ensure all selected variables are numeric -df <- df %>% - mutate(across(everything(), as.numeric)) - -# Remove rows with any missing values for correlation analysis -df_complete <- df[complete.cases(df), ] - -cat("Sample size for correlation analysis:", nrow(df_complete), "\n") -cat("Total sample size:", nrow(df), "\n") - -####==== DESCRIPTIVE STATISTICS ==== - -# Function to compute descriptive statistics -get_descriptives <- function(data, vars) { - desc_stats <- data %>% - select(all_of(vars)) %>% - summarise(across(everything(), list( - n = ~sum(!is.na(.)), - mean = ~mean(., na.rm = TRUE), - sd = ~sd(., na.rm = TRUE), - min = ~min(., na.rm = TRUE), - max = ~max(., na.rm = TRUE), - median = ~median(., na.rm = TRUE), - q25 = ~quantile(., 0.25, na.rm = TRUE), - q75 = ~quantile(., 0.75, na.rm = TRUE) - ))) %>% - pivot_longer(everything(), names_to = "variable", values_to = "value") %>% - separate(variable, into = c("var", "stat"), sep = "_(?=[^_]+$)") %>% - pivot_wider(names_from = stat, values_from = value) %>% - mutate(across(c(mean, sd, min, max, median, q25, q75), ~round(., 5))) - - return(desc_stats) -} - -# Get descriptives for EOHI variables -eohi_descriptives <- get_descriptives(df, eohi_vars) -cat("\n=== EOHI Variables Descriptives ===\n") -print(eohi_descriptives) - -# Get descriptives for calibration variables -cal_descriptives <- get_descriptives(df, cal_vars) -cat("\n=== Calibration Variables Descriptives ===\n") -print(cal_descriptives) - -####==== PEARSON CORRELATIONS ==== - -# Compute correlation matrix with p-values -cor_results_pearson <- rcorr(as.matrix(df_complete), type = "pearson") - -# Extract correlation coefficients -cor_pearson <- cor_results_pearson$r - -# Extract p-values -p_matrix_pearson <- cor_results_pearson$P - -# Function to add significance stars -corstars <- function(cor_mat, p_mat) { - stars <- ifelse(p_mat < 0.001, "***", - ifelse(p_mat < 0.01, "**", - ifelse(p_mat < 0.05, "*", ""))) - - # Combine correlation values with stars, rounded to 5 decimal places - cor_with_stars <- matrix(paste0(format(round(cor_mat, 5), nsmall = 5), stars), - nrow = nrow(cor_mat)) - - # Set row and column names - rownames(cor_with_stars) <- rownames(cor_mat) - colnames(cor_with_stars) <- colnames(cor_mat) - - return(cor_with_stars) -} - -# Apply the function -cor_table_pearson <- corstars(cor_pearson, p_matrix_pearson) - -cat("\n=== PEARSON CORRELATIONS ===\n") -print(cor_table_pearson, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations <- cor_pearson[eohi_vars, cal_vars] -eohi_cal_pvalues <- p_matrix_pearson[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Pearson Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations)) { - for(j in 1:ncol(eohi_cal_correlations)) { - cor_val <- eohi_cal_correlations[i, j] - p_val <- eohi_cal_pvalues[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, 
"*", ""))) - cat(sprintf("%s x %s: r = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations)[i], - colnames(eohi_cal_correlations)[j], - cor_val, star, p_val)) - } -} - -####==== SPEARMAN CORRELATIONS ==== - -# Compute Spearman correlation matrix with p-values -cor_results_spearman <- rcorr(as.matrix(df_complete), type = "spearman") - -# Extract correlation coefficients -cor_spearman <- cor_results_spearman$r - -# Extract p-values -p_matrix_spearman <- cor_results_spearman$P - -# Apply the function -cor_table_spearman <- corstars(cor_spearman, p_matrix_spearman) - -cat("\n=== SPEARMAN CORRELATIONS ===\n") -print(cor_table_spearman, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations_spearman <- cor_spearman[eohi_vars, cal_vars] -eohi_cal_pvalues_spearman <- p_matrix_spearman[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Spearman Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations_spearman)) { - for(j in 1:ncol(eohi_cal_correlations_spearman)) { - cor_val <- eohi_cal_correlations_spearman[i, j] - p_val <- eohi_cal_pvalues_spearman[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: rho = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations_spearman)[i], - colnames(eohi_cal_correlations_spearman)[j], - cor_val, star, p_val)) - } -} - -####==== BOOTSTRAPPED 95% CONFIDENCE INTERVALS ==== - -# Function to compute correlation with bootstrap CI -bootstrap_correlation <- function(data, var1, var2, method = "pearson", R = 1000) { - # Remove missing values - complete_data <- data[complete.cases(data[, c(var1, var2)]), ] - - if(nrow(complete_data) < 3) { - return(data.frame( - correlation = NA, - ci_lower = NA, - ci_upper = NA, - n = nrow(complete_data) - )) - } - - # Bootstrap function - boot_fun <- function(data, indices) { - cor(data[indices, var1], data[indices, var2], method = method, use = "complete.obs") - } - - # Perform bootstrap - set.seed(123) # for reproducibility - boot_results <- boot(complete_data, boot_fun, R = R) - - # Calculate confidence interval - ci <- boot.ci(boot_results, type = "perc") - - return(data.frame( - correlation = boot_results$t0, - ci_lower = ci$perc[4], - ci_upper = ci$perc[5], - n = nrow(complete_data) - )) -} - -# Compute bootstrap CIs for all EOHI x Calibration correlations -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (PEARSON) ===\n") -bootstrap_results_pearson <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "pearson", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "pearson" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_pearson) - -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (SPEARMAN) ===\n") -bootstrap_results_spearman <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "spearman", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "spearman" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - 
-print(bootstrap_results_spearman) - -####==== SUMMARY TABLE ==== - -# Create comprehensive summary table -summary_table <- bootstrap_results_pearson %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper, n) %>% - rename(pearson_r = correlation, pearson_ci_lower = ci_lower, pearson_ci_upper = ci_upper) %>% - left_join( - bootstrap_results_spearman %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper) %>% - rename(spearman_rho = correlation, spearman_ci_lower = ci_lower, spearman_ci_upper = ci_upper), - by = c("eohi_var", "cal_var") - ) %>% - # Add p-values - left_join( - expand.grid(eohi_var = eohi_vars, cal_var = cal_vars, stringsAsFactors = FALSE) %>% - pmap_dfr(function(eohi_var, cal_var) { - pearson_p <- p_matrix_pearson[eohi_var, cal_var] - spearman_p <- p_matrix_spearman[eohi_var, cal_var] - data.frame( - eohi_var = eohi_var, - cal_var = cal_var, - pearson_p = pearson_p, - spearman_p = spearman_p - ) - }), - by = c("eohi_var", "cal_var") - ) %>% - mutate( - pearson_p = round(pearson_p, 5), - spearman_p = round(spearman_p, 5) - ) - -cat("\n=== COMPREHENSIVE SUMMARY TABLE ===\n") -print(summary_table) - -# Save results to CSV -write.csv(summary_table, "eohi_calibration_correlations_summary.csv", row.names = FALSE) -cat("\nResults saved to: eohi_calibration_correlations_summary.csv\n") - -####==== EFFECT SIZES (Cohen's conventions) ==== - -cat("\n=== EFFECT SIZE INTERPRETATION (Cohen's conventions) ===\n") -cat("Small effect: |r| = 0.10\n") -cat("Medium effect: |r| = 0.30\n") -cat("Large effect: |r| = 0.50\n") - -# Categorize effect sizes -summary_table_with_effects <- summary_table %>% - mutate( - pearson_effect_size = case_when( - abs(pearson_r) >= 0.50 ~ "Large", - abs(pearson_r) >= 0.30 ~ "Medium", - abs(pearson_r) >= 0.10 ~ "Small", - TRUE ~ "Negligible" - ), - spearman_effect_size = case_when( - abs(spearman_rho) >= 0.50 ~ "Large", - abs(spearman_rho) >= 0.30 ~ "Medium", - abs(spearman_rho) >= 0.10 ~ "Small", - TRUE ~ "Negligible" - ) - ) - -cat("\n=== EFFECT SIZE CATEGORIZATION ===\n") -print(summary_table_with_effects %>% select(eohi_var, cal_var, pearson_r, pearson_effect_size, spearman_rho, spearman_effect_size)) diff --git a/.history/eohi1/correlations - eohi x calibration_20250915134720.r b/.history/eohi1/correlations - eohi x calibration_20250915134720.r deleted file mode 100644 index afe6d33..0000000 --- a/.history/eohi1/correlations - eohi x calibration_20250915134720.r +++ /dev/null @@ -1,304 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) -library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -# Load data -df1 <- read.csv("exp1.csv") - -# Remove columns with all NA values -df1 <- df1 %>% select(where(~ !all(is.na(.)))) - -# Select variables of interest -eohi_vars <- c("eohi_pref", "eohi_pers", "eohi_val", "eohi_life", "eohi_mean", - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean") -cal_vars <- c("cal_selfActual", "cal_global") - -# Create dataset with selected variables -df <- df1[, c(eohi_vars, cal_vars)] - -# Ensure all selected variables are numeric -df <- df %>% - mutate(across(everything(), as.numeric)) - -# Remove rows with any missing values for correlation analysis -df_complete <- df[complete.cases(df), ] - -cat("Sample size for correlation analysis:", nrow(df_complete), "\n") -cat("Total sample size:", nrow(df), "\n") - -####==== DESCRIPTIVE STATISTICS ==== - -# Function to compute descriptive statistics 
-get_descriptives <- function(data, vars) { - desc_stats <- data %>% - select(all_of(vars)) %>% - summarise(across(everything(), list( - n = ~sum(!is.na(.)), - mean = ~mean(., na.rm = TRUE), - sd = ~sd(., na.rm = TRUE), - min = ~min(., na.rm = TRUE), - max = ~max(., na.rm = TRUE), - median = ~median(., na.rm = TRUE), - q25 = ~quantile(., 0.25, na.rm = TRUE), - q75 = ~quantile(., 0.75, na.rm = TRUE) - ))) %>% - pivot_longer(everything(), names_to = "variable", values_to = "value") %>% - separate(variable, into = c("var", "stat"), sep = "_(?=[^_]+$)") %>% - pivot_wider(names_from = stat, values_from = value) %>% - mutate(across(c(mean, sd, min, max, median, q25, q75), ~round(., 5))) - - return(desc_stats) -} - -# Get descriptives for EOHI variables -eohi_descriptives <- get_descriptives(df, eohi_vars) -cat("\n=== EOHI Variables Descriptives ===\n") -print(eohi_descriptives) - -# Get descriptives for calibration variables -cal_descriptives <- get_descriptives(df, cal_vars) -cat("\n=== Calibration Variables Descriptives ===\n") -print(cal_descriptives) - -####==== PEARSON CORRELATIONS ==== - -# Compute correlation matrix with p-values -cor_results_pearson <- rcorr(as.matrix(df_complete), type = "pearson") - -# Extract correlation coefficients -cor_pearson <- cor_results_pearson$r - -# Extract p-values -p_matrix_pearson <- cor_results_pearson$P - -# Function to add significance stars -corstars <- function(cor_mat, p_mat) { - stars <- ifelse(p_mat < 0.001, "***", - ifelse(p_mat < 0.01, "**", - ifelse(p_mat < 0.05, "*", ""))) - - # Combine correlation values with stars, rounded to 5 decimal places - cor_with_stars <- matrix(paste0(format(round(cor_mat, 5), nsmall = 5), stars), - nrow = nrow(cor_mat)) - - # Set row and column names - rownames(cor_with_stars) <- rownames(cor_mat) - colnames(cor_with_stars) <- colnames(cor_mat) - - return(cor_with_stars) -} - -# Apply the function -cor_table_pearson <- corstars(cor_pearson, p_matrix_pearson) - -cat("\n=== PEARSON CORRELATIONS ===\n") -print(cor_table_pearson, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations <- cor_pearson[eohi_vars, cal_vars] -eohi_cal_pvalues <- p_matrix_pearson[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Pearson Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations)) { - for(j in 1:ncol(eohi_cal_correlations)) { - cor_val <- eohi_cal_correlations[i, j] - p_val <- eohi_cal_pvalues[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: r = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations)[i], - colnames(eohi_cal_correlations)[j], - cor_val, star, p_val)) - } -} - -####==== SPEARMAN CORRELATIONS ==== - -# Compute Spearman correlation matrix with p-values -cor_results_spearman <- rcorr(as.matrix(df_complete), type = "spearman") - -# Extract correlation coefficients -cor_spearman <- cor_results_spearman$r - -# Extract p-values -p_matrix_spearman <- cor_results_spearman$P - -# Apply the function -cor_table_spearman <- corstars(cor_spearman, p_matrix_spearman) - -cat("\n=== SPEARMAN CORRELATIONS ===\n") -print(cor_table_spearman, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations_spearman <- cor_spearman[eohi_vars, cal_vars] -eohi_cal_pvalues_spearman <- p_matrix_spearman[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Spearman Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations_spearman)) { - for(j in 
1:ncol(eohi_cal_correlations_spearman)) { - cor_val <- eohi_cal_correlations_spearman[i, j] - p_val <- eohi_cal_pvalues_spearman[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: rho = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations_spearman)[i], - colnames(eohi_cal_correlations_spearman)[j], - cor_val, star, p_val)) - } -} - -####==== BOOTSTRAPPED 95% CONFIDENCE INTERVALS ==== - -# Function to compute correlation with bootstrap CI -bootstrap_correlation <- function(data, var1, var2, method = "pearson", R = 1000) { - # Remove missing values - complete_data <- data[complete.cases(data[, c(var1, var2)]), ] - - if(nrow(complete_data) < 3) { - return(data.frame( - correlation = NA, - ci_lower = NA, - ci_upper = NA, - n = nrow(complete_data) - )) - } - - # Bootstrap function - boot_fun <- function(data, indices) { - cor(data[indices, var1], data[indices, var2], method = method, use = "complete.obs") - } - - # Perform bootstrap - set.seed(123) # for reproducibility - boot_results <- boot(complete_data, boot_fun, R = R) - - # Calculate confidence interval - ci <- boot.ci(boot_results, type = "perc") - - return(data.frame( - correlation = boot_results$t0, - ci_lower = ci$perc[4], - ci_upper = ci$perc[5], - n = nrow(complete_data) - )) -} - -# Compute bootstrap CIs for all EOHI x Calibration correlations -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (PEARSON) ===\n") -bootstrap_results_pearson <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "pearson", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "pearson" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_pearson) - -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (SPEARMAN) ===\n") -bootstrap_results_spearman <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "spearman", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "spearman" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_spearman) - -####==== SUMMARY TABLE ==== - -# Create comprehensive summary table -summary_table <- bootstrap_results_pearson %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper, n) %>% - rename(pearson_r = correlation, pearson_ci_lower = ci_lower, pearson_ci_upper = ci_upper) %>% - left_join( - bootstrap_results_spearman %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper) %>% - rename(spearman_rho = correlation, spearman_ci_lower = ci_lower, spearman_ci_upper = ci_upper), - by = c("eohi_var", "cal_var") - ) %>% - # Add p-values - left_join( - expand.grid(eohi_var = eohi_vars, cal_var = cal_vars, stringsAsFactors = FALSE) %>% - pmap_dfr(function(eohi_var, cal_var) { - pearson_p <- p_matrix_pearson[eohi_var, cal_var] - spearman_p <- p_matrix_spearman[eohi_var, cal_var] - data.frame( - eohi_var = eohi_var, - cal_var = cal_var, - pearson_p = pearson_p, - spearman_p = spearman_p - ) - }), - by = c("eohi_var", "cal_var") - ) %>% - mutate( - 
pearson_p = round(pearson_p, 5), - spearman_p = round(spearman_p, 5) - ) - -cat("\n=== COMPREHENSIVE SUMMARY TABLE ===\n") -print(summary_table) - -# Save results to CSV -write.csv(summary_table, "eohi_calibration_correlations_summary.csv", row.names = FALSE) -cat("\nResults saved to: eohi_calibration_correlations_summary.csv\n") - -####==== EFFECT SIZES (Cohen's conventions) ==== - -cat("\n=== EFFECT SIZE INTERPRETATION (Cohen's conventions) ===\n") -cat("Small effect: |r| = 0.10\n") -cat("Medium effect: |r| = 0.30\n") -cat("Large effect: |r| = 0.50\n") - -# Categorize effect sizes -summary_table_with_effects <- summary_table %>% - mutate( - pearson_effect_size = case_when( - abs(pearson_r) >= 0.50 ~ "Large", - abs(pearson_r) >= 0.30 ~ "Medium", - abs(pearson_r) >= 0.10 ~ "Small", - TRUE ~ "Negligible" - ), - spearman_effect_size = case_when( - abs(spearman_rho) >= 0.50 ~ "Large", - abs(spearman_rho) >= 0.30 ~ "Medium", - abs(spearman_rho) >= 0.10 ~ "Small", - TRUE ~ "Negligible" - ) - ) - -cat("\n=== EFFECT SIZE CATEGORIZATION ===\n") -print(summary_table_with_effects %>% select(eohi_var, cal_var, pearson_r, pearson_effect_size, spearman_rho, spearman_effect_size)) diff --git a/.history/eohi1/correlations - eohi x calibration_20250915134733.r b/.history/eohi1/correlations - eohi x calibration_20250915134733.r deleted file mode 100644 index afe6d33..0000000 --- a/.history/eohi1/correlations - eohi x calibration_20250915134733.r +++ /dev/null @@ -1,304 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) -library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -# Load data -df1 <- read.csv("exp1.csv") - -# Remove columns with all NA values -df1 <- df1 %>% select(where(~ !all(is.na(.)))) - -# Select variables of interest -eohi_vars <- c("eohi_pref", "eohi_pers", "eohi_val", "eohi_life", "eohi_mean", - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean") -cal_vars <- c("cal_selfActual", "cal_global") - -# Create dataset with selected variables -df <- df1[, c(eohi_vars, cal_vars)] - -# Ensure all selected variables are numeric -df <- df %>% - mutate(across(everything(), as.numeric)) - -# Remove rows with any missing values for correlation analysis -df_complete <- df[complete.cases(df), ] - -cat("Sample size for correlation analysis:", nrow(df_complete), "\n") -cat("Total sample size:", nrow(df), "\n") - -####==== DESCRIPTIVE STATISTICS ==== - -# Function to compute descriptive statistics -get_descriptives <- function(data, vars) { - desc_stats <- data %>% - select(all_of(vars)) %>% - summarise(across(everything(), list( - n = ~sum(!is.na(.)), - mean = ~mean(., na.rm = TRUE), - sd = ~sd(., na.rm = TRUE), - min = ~min(., na.rm = TRUE), - max = ~max(., na.rm = TRUE), - median = ~median(., na.rm = TRUE), - q25 = ~quantile(., 0.25, na.rm = TRUE), - q75 = ~quantile(., 0.75, na.rm = TRUE) - ))) %>% - pivot_longer(everything(), names_to = "variable", values_to = "value") %>% - separate(variable, into = c("var", "stat"), sep = "_(?=[^_]+$)") %>% - pivot_wider(names_from = stat, values_from = value) %>% - mutate(across(c(mean, sd, min, max, median, q25, q75), ~round(., 5))) - - return(desc_stats) -} - -# Get descriptives for EOHI variables -eohi_descriptives <- get_descriptives(df, eohi_vars) -cat("\n=== EOHI Variables Descriptives ===\n") -print(eohi_descriptives) - -# Get descriptives for calibration variables -cal_descriptives <- get_descriptives(df, cal_vars) -cat("\n=== 
Calibration Variables Descriptives ===\n") -print(cal_descriptives) - -####==== PEARSON CORRELATIONS ==== - -# Compute correlation matrix with p-values -cor_results_pearson <- rcorr(as.matrix(df_complete), type = "pearson") - -# Extract correlation coefficients -cor_pearson <- cor_results_pearson$r - -# Extract p-values -p_matrix_pearson <- cor_results_pearson$P - -# Function to add significance stars -corstars <- function(cor_mat, p_mat) { - stars <- ifelse(p_mat < 0.001, "***", - ifelse(p_mat < 0.01, "**", - ifelse(p_mat < 0.05, "*", ""))) - - # Combine correlation values with stars, rounded to 5 decimal places - cor_with_stars <- matrix(paste0(format(round(cor_mat, 5), nsmall = 5), stars), - nrow = nrow(cor_mat)) - - # Set row and column names - rownames(cor_with_stars) <- rownames(cor_mat) - colnames(cor_with_stars) <- colnames(cor_mat) - - return(cor_with_stars) -} - -# Apply the function -cor_table_pearson <- corstars(cor_pearson, p_matrix_pearson) - -cat("\n=== PEARSON CORRELATIONS ===\n") -print(cor_table_pearson, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations <- cor_pearson[eohi_vars, cal_vars] -eohi_cal_pvalues <- p_matrix_pearson[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Pearson Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations)) { - for(j in 1:ncol(eohi_cal_correlations)) { - cor_val <- eohi_cal_correlations[i, j] - p_val <- eohi_cal_pvalues[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: r = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations)[i], - colnames(eohi_cal_correlations)[j], - cor_val, star, p_val)) - } -} - -####==== SPEARMAN CORRELATIONS ==== - -# Compute Spearman correlation matrix with p-values -cor_results_spearman <- rcorr(as.matrix(df_complete), type = "spearman") - -# Extract correlation coefficients -cor_spearman <- cor_results_spearman$r - -# Extract p-values -p_matrix_spearman <- cor_results_spearman$P - -# Apply the function -cor_table_spearman <- corstars(cor_spearman, p_matrix_spearman) - -cat("\n=== SPEARMAN CORRELATIONS ===\n") -print(cor_table_spearman, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations_spearman <- cor_spearman[eohi_vars, cal_vars] -eohi_cal_pvalues_spearman <- p_matrix_spearman[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Spearman Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations_spearman)) { - for(j in 1:ncol(eohi_cal_correlations_spearman)) { - cor_val <- eohi_cal_correlations_spearman[i, j] - p_val <- eohi_cal_pvalues_spearman[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: rho = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations_spearman)[i], - colnames(eohi_cal_correlations_spearman)[j], - cor_val, star, p_val)) - } -} - -####==== BOOTSTRAPPED 95% CONFIDENCE INTERVALS ==== - -# Function to compute correlation with bootstrap CI -bootstrap_correlation <- function(data, var1, var2, method = "pearson", R = 1000) { - # Remove missing values - complete_data <- data[complete.cases(data[, c(var1, var2)]), ] - - if(nrow(complete_data) < 3) { - return(data.frame( - correlation = NA, - ci_lower = NA, - ci_upper = NA, - n = nrow(complete_data) - )) - } - - # Bootstrap function - boot_fun <- function(data, indices) { - cor(data[indices, var1], data[indices, var2], method = method, use = "complete.obs") - } 
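
A standalone illustration of the corstars() star-annotation idiom shown above, on simulated data; the one addition over the snapshots is an explicit guard for the NA diagonal that Hmisc::rcorr() returns in its p-value matrix:

    library(Hmisc)

    set.seed(2)
    m <- matrix(rnorm(300), ncol = 3, dimnames = list(NULL, c("v1", "v2", "v3")))
    rc <- rcorr(m, type = "pearson")

    # Conventional significance stars keyed to p-value thresholds
    stars <- ifelse(rc$P < 0.001, "***",
             ifelse(rc$P < 0.01,  "**",
             ifelse(rc$P < 0.05,  "*",  "")))
    stars[is.na(stars)] <- ""   # rcorr() leaves the diagonal of $P as NA

    noquote(matrix(paste0(format(round(rc$r, 3), nsmall = 3), stars),
                   nrow = nrow(rc$r), dimnames = dimnames(rc$r)))
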
- - # Perform bootstrap - set.seed(123) # for reproducibility - boot_results <- boot(complete_data, boot_fun, R = R) - - # Calculate confidence interval - ci <- boot.ci(boot_results, type = "perc") - - return(data.frame( - correlation = boot_results$t0, - ci_lower = ci$perc[4], - ci_upper = ci$perc[5], - n = nrow(complete_data) - )) -} - -# Compute bootstrap CIs for all EOHI x Calibration correlations -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (PEARSON) ===\n") -bootstrap_results_pearson <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "pearson", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "pearson" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_pearson) - -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (SPEARMAN) ===\n") -bootstrap_results_spearman <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "spearman", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "spearman" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_spearman) - -####==== SUMMARY TABLE ==== - -# Create comprehensive summary table -summary_table <- bootstrap_results_pearson %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper, n) %>% - rename(pearson_r = correlation, pearson_ci_lower = ci_lower, pearson_ci_upper = ci_upper) %>% - left_join( - bootstrap_results_spearman %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper) %>% - rename(spearman_rho = correlation, spearman_ci_lower = ci_lower, spearman_ci_upper = ci_upper), - by = c("eohi_var", "cal_var") - ) %>% - # Add p-values - left_join( - expand.grid(eohi_var = eohi_vars, cal_var = cal_vars, stringsAsFactors = FALSE) %>% - pmap_dfr(function(eohi_var, cal_var) { - pearson_p <- p_matrix_pearson[eohi_var, cal_var] - spearman_p <- p_matrix_spearman[eohi_var, cal_var] - data.frame( - eohi_var = eohi_var, - cal_var = cal_var, - pearson_p = pearson_p, - spearman_p = spearman_p - ) - }), - by = c("eohi_var", "cal_var") - ) %>% - mutate( - pearson_p = round(pearson_p, 5), - spearman_p = round(spearman_p, 5) - ) - -cat("\n=== COMPREHENSIVE SUMMARY TABLE ===\n") -print(summary_table) - -# Save results to CSV -write.csv(summary_table, "eohi_calibration_correlations_summary.csv", row.names = FALSE) -cat("\nResults saved to: eohi_calibration_correlations_summary.csv\n") - -####==== EFFECT SIZES (Cohen's conventions) ==== - -cat("\n=== EFFECT SIZE INTERPRETATION (Cohen's conventions) ===\n") -cat("Small effect: |r| = 0.10\n") -cat("Medium effect: |r| = 0.30\n") -cat("Large effect: |r| = 0.50\n") - -# Categorize effect sizes -summary_table_with_effects <- summary_table %>% - mutate( - pearson_effect_size = case_when( - abs(pearson_r) >= 0.50 ~ "Large", - abs(pearson_r) >= 0.30 ~ "Medium", - abs(pearson_r) >= 0.10 ~ "Small", - TRUE ~ "Negligible" - ), - spearman_effect_size = case_when( - abs(spearman_rho) >= 0.50 ~ "Large", - abs(spearman_rho) >= 0.30 ~ "Medium", - abs(spearman_rho) >= 0.10 ~ "Small", - TRUE ~ "Negligible" - ) - ) - 
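
The Cohen's-conventions categorization in the effect-size step reduces to a single case_when(); a toy example with fabricated r values:

    library(dplyr)

    # Thresholds per Cohen: 0.10 small, 0.30 medium, 0.50 large
    tibble::tibble(r = c(0.05, -0.22, 0.34, -0.61)) %>%
      mutate(effect = case_when(
        abs(r) >= 0.50 ~ "Large",
        abs(r) >= 0.30 ~ "Medium",
        abs(r) >= 0.10 ~ "Small",
        TRUE           ~ "Negligible"
      ))
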
-cat("\n=== EFFECT SIZE CATEGORIZATION ===\n") -print(summary_table_with_effects %>% select(eohi_var, cal_var, pearson_r, pearson_effect_size, spearman_rho, spearman_effect_size)) diff --git a/.history/eohi1/correlations - eohi x calibration_20250915134827.r b/.history/eohi1/correlations - eohi x calibration_20250915134827.r deleted file mode 100644 index 8b475bc..0000000 --- a/.history/eohi1/correlations - eohi x calibration_20250915134827.r +++ /dev/null @@ -1,306 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) -library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -# Load data -df1 <- read.csv("exp1.csv") - -# Remove columns with all NA values -df1 <- df1 %>% select(where(~ !all(is.na(.)))) - -# Select variables of interest -eohi_vars <- c("eohi_pref", "eohi_pers", "eohi_val", "eohi_life", "eohi_mean", - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean") -cal_vars <- c("cal_selfActual", "cal_global") - -# Create dataset with selected variables -df <- df1[, c(eohi_vars, cal_vars)] - -# Ensure all selected variables are numeric -df <- df %>% - mutate(across(everything(), as.numeric)) - -# Remove rows with any missing values for correlation analysis -df_complete <- df[complete.cases(df), ] - -cat("Sample size for correlation analysis:", nrow(df_complete), "\n") -cat("Total sample size:", nrow(df), "\n") - -str(df) -summary(df) -####==== DESCRIPTIVE STATISTICS ==== - -# Function to compute descriptive statistics -get_descriptives <- function(data, vars) { - desc_stats <- data %>% - select(all_of(vars)) %>% - summarise(across(everything(), list( - n = ~sum(!is.na(.)), - mean = ~mean(., na.rm = TRUE), - sd = ~sd(., na.rm = TRUE), - min = ~min(., na.rm = TRUE), - max = ~max(., na.rm = TRUE), - median = ~median(., na.rm = TRUE), - q25 = ~quantile(., 0.25, na.rm = TRUE), - q75 = ~quantile(., 0.75, na.rm = TRUE) - ))) %>% - pivot_longer(everything(), names_to = "variable", values_to = "value") %>% - separate(variable, into = c("var", "stat"), sep = "_(?=[^_]+$)") %>% - pivot_wider(names_from = stat, values_from = value) %>% - mutate(across(c(mean, sd, min, max, median, q25, q75), ~round(., 5))) - - return(desc_stats) -} - -# Get descriptives for EOHI variables -eohi_descriptives <- get_descriptives(df, eohi_vars) -cat("\n=== EOHI Variables Descriptives ===\n") -print(eohi_descriptives) - -# Get descriptives for calibration variables -cal_descriptives <- get_descriptives(df, cal_vars) -cat("\n=== Calibration Variables Descriptives ===\n") -print(cal_descriptives) - -####==== PEARSON CORRELATIONS ==== - -# Compute correlation matrix with p-values -cor_results_pearson <- rcorr(as.matrix(df_complete), type = "pearson") - -# Extract correlation coefficients -cor_pearson <- cor_results_pearson$r - -# Extract p-values -p_matrix_pearson <- cor_results_pearson$P - -# Function to add significance stars -corstars <- function(cor_mat, p_mat) { - stars <- ifelse(p_mat < 0.001, "***", - ifelse(p_mat < 0.01, "**", - ifelse(p_mat < 0.05, "*", ""))) - - # Combine correlation values with stars, rounded to 5 decimal places - cor_with_stars <- matrix(paste0(format(round(cor_mat, 5), nsmall = 5), stars), - nrow = nrow(cor_mat)) - - # Set row and column names - rownames(cor_with_stars) <- rownames(cor_mat) - colnames(cor_with_stars) <- colnames(cor_mat) - - return(cor_with_stars) -} - -# Apply the function -cor_table_pearson <- corstars(cor_pearson, p_matrix_pearson) - -cat("\n=== 
PEARSON CORRELATIONS ===\n") -print(cor_table_pearson, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations <- cor_pearson[eohi_vars, cal_vars] -eohi_cal_pvalues <- p_matrix_pearson[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Pearson Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations)) { - for(j in 1:ncol(eohi_cal_correlations)) { - cor_val <- eohi_cal_correlations[i, j] - p_val <- eohi_cal_pvalues[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: r = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations)[i], - colnames(eohi_cal_correlations)[j], - cor_val, star, p_val)) - } -} - -####==== SPEARMAN CORRELATIONS ==== - -# Compute Spearman correlation matrix with p-values -cor_results_spearman <- rcorr(as.matrix(df_complete), type = "spearman") - -# Extract correlation coefficients -cor_spearman <- cor_results_spearman$r - -# Extract p-values -p_matrix_spearman <- cor_results_spearman$P - -# Apply the function -cor_table_spearman <- corstars(cor_spearman, p_matrix_spearman) - -cat("\n=== SPEARMAN CORRELATIONS ===\n") -print(cor_table_spearman, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations_spearman <- cor_spearman[eohi_vars, cal_vars] -eohi_cal_pvalues_spearman <- p_matrix_spearman[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Spearman Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations_spearman)) { - for(j in 1:ncol(eohi_cal_correlations_spearman)) { - cor_val <- eohi_cal_correlations_spearman[i, j] - p_val <- eohi_cal_pvalues_spearman[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: rho = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations_spearman)[i], - colnames(eohi_cal_correlations_spearman)[j], - cor_val, star, p_val)) - } -} - -####==== BOOTSTRAPPED 95% CONFIDENCE INTERVALS ==== - -# Function to compute correlation with bootstrap CI -bootstrap_correlation <- function(data, var1, var2, method = "pearson", R = 1000) { - # Remove missing values - complete_data <- data[complete.cases(data[, c(var1, var2)]), ] - - if(nrow(complete_data) < 3) { - return(data.frame( - correlation = NA, - ci_lower = NA, - ci_upper = NA, - n = nrow(complete_data) - )) - } - - # Bootstrap function - boot_fun <- function(data, indices) { - cor(data[indices, var1], data[indices, var2], method = method, use = "complete.obs") - } - - # Perform bootstrap - set.seed(123) # for reproducibility - boot_results <- boot(complete_data, boot_fun, R = R) - - # Calculate confidence interval - ci <- boot.ci(boot_results, type = "perc") - - return(data.frame( - correlation = boot_results$t0, - ci_lower = ci$perc[4], - ci_upper = ci$perc[5], - n = nrow(complete_data) - )) -} - -# Compute bootstrap CIs for all EOHI x Calibration correlations -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (PEARSON) ===\n") -bootstrap_results_pearson <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "pearson", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "pearson" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - 
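
The reshaping inside the get_descriptives() helper in these snapshots — summarise(across(...)) into long form, then splitting names on the final underscore — is easy to misread. A reduced sketch with a simulated two-column data frame:

    library(dplyr)
    library(tidyr)

    set.seed(3)
    d <- tibble::tibble(score_a = rnorm(30), score_b = runif(30))

    d %>%
      summarise(across(everything(),
                       list(n = ~ sum(!is.na(.)),
                            mean = ~ mean(., na.rm = TRUE),
                            sd = ~ sd(., na.rm = TRUE)))) %>%
      pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
      # the lookahead makes only the last "_" the split point, so variable
      # names that themselves contain underscores (score_a) survive intact
      separate(variable, into = c("var", "stat"), sep = "_(?=[^_]+$)") %>%
      pivot_wider(names_from = stat, values_from = value)
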
-print(bootstrap_results_pearson) - -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (SPEARMAN) ===\n") -bootstrap_results_spearman <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "spearman", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "spearman" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_spearman) - -####==== SUMMARY TABLE ==== - -# Create comprehensive summary table -summary_table <- bootstrap_results_pearson %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper, n) %>% - rename(pearson_r = correlation, pearson_ci_lower = ci_lower, pearson_ci_upper = ci_upper) %>% - left_join( - bootstrap_results_spearman %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper) %>% - rename(spearman_rho = correlation, spearman_ci_lower = ci_lower, spearman_ci_upper = ci_upper), - by = c("eohi_var", "cal_var") - ) %>% - # Add p-values - left_join( - expand.grid(eohi_var = eohi_vars, cal_var = cal_vars, stringsAsFactors = FALSE) %>% - pmap_dfr(function(eohi_var, cal_var) { - pearson_p <- p_matrix_pearson[eohi_var, cal_var] - spearman_p <- p_matrix_spearman[eohi_var, cal_var] - data.frame( - eohi_var = eohi_var, - cal_var = cal_var, - pearson_p = pearson_p, - spearman_p = spearman_p - ) - }), - by = c("eohi_var", "cal_var") - ) %>% - mutate( - pearson_p = round(pearson_p, 5), - spearman_p = round(spearman_p, 5) - ) - -cat("\n=== COMPREHENSIVE SUMMARY TABLE ===\n") -print(summary_table) - -# Save results to CSV -write.csv(summary_table, "eohi_calibration_correlations_summary.csv", row.names = FALSE) -cat("\nResults saved to: eohi_calibration_correlations_summary.csv\n") - -####==== EFFECT SIZES (Cohen's conventions) ==== - -cat("\n=== EFFECT SIZE INTERPRETATION (Cohen's conventions) ===\n") -cat("Small effect: |r| = 0.10\n") -cat("Medium effect: |r| = 0.30\n") -cat("Large effect: |r| = 0.50\n") - -# Categorize effect sizes -summary_table_with_effects <- summary_table %>% - mutate( - pearson_effect_size = case_when( - abs(pearson_r) >= 0.50 ~ "Large", - abs(pearson_r) >= 0.30 ~ "Medium", - abs(pearson_r) >= 0.10 ~ "Small", - TRUE ~ "Negligible" - ), - spearman_effect_size = case_when( - abs(spearman_rho) >= 0.50 ~ "Large", - abs(spearman_rho) >= 0.30 ~ "Medium", - abs(spearman_rho) >= 0.10 ~ "Small", - TRUE ~ "Negligible" - ) - ) - -cat("\n=== EFFECT SIZE CATEGORIZATION ===\n") -print(summary_table_with_effects %>% select(eohi_var, cal_var, pearson_r, pearson_effect_size, spearman_rho, spearman_effect_size)) diff --git a/.history/eohi1/correlations - eohi x calibration_20250915134916.r b/.history/eohi1/correlations - eohi x calibration_20250915134916.r deleted file mode 100644 index f8011c9..0000000 --- a/.history/eohi1/correlations - eohi x calibration_20250915134916.r +++ /dev/null @@ -1,307 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) -library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -# Load data -df1 <- read.csv("exp1.csv") - -# Remove columns with all NA values -df1 <- df1 %>% select(where(~ !all(is.na(.)))) - -# Select variables of interest -eohi_vars <- c("eohi_pref", "eohi_pers", "eohi_val", "eohi_life", 
"eohi_mean", - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean") -cal_vars <- c("cal_selfActual", "cal_global") - -# Create dataset with selected variables -df <- df1[, c(eohi_vars, cal_vars)] - -# Ensure all selected variables are numeric -df <- df %>% - mutate(across(everything(), as.numeric)) - -# Remove rows with any missing values for correlation analysis -df_complete <- df[complete.cases(df), ] - -cat("Sample size for correlation analysis:", nrow(df_complete), "\n") -cat("Total sample size:", nrow(df), "\n") - -str(df) -summary(df) -####==== DESCRIPTIVE STATISTICS ==== - -# Function to compute descriptive statistics -get_descriptives <- function(data, vars) { - desc_stats <- data %>% - select(all_of(vars)) %>% - summarise(across(everything(), list( - n = ~sum(!is.na(.)), - mean = ~mean(., na.rm = TRUE), - sd = ~sd(., na.rm = TRUE), - min = ~min(., na.rm = TRUE), - max = ~max(., na.rm = TRUE), - median = ~median(., na.rm = TRUE), - q25 = ~quantile(., 0.25, na.rm = TRUE), - q75 = ~quantile(., 0.75, na.rm = TRUE) - ))) %>% - pivot_longer(everything(), names_to = "variable", values_to = "value") %>% - separate(variable, into = c("var", "stat"), sep = "_(?=[^_]+$)") %>% - pivot_wider(names_from = stat, values_from = value) %>% - mutate(across(c(mean, sd, min, max, median, q25, q75), ~round(., 5))) - - return(desc_stats) -} - -# Get descriptives for EOHI variables -eohi_descriptives <- get_descriptives(df, eohi_vars) -cat("\n=== EOHI Variables Descriptives ===\n") -print(eohi_descriptives) - -# Get descriptives for calibration variables -cal_descriptives <- get_descriptives(df, cal_vars) -cat("\n=== Calibration Variables Descriptives ===\n") -print(cal_descriptives) - -####==== PEARSON CORRELATIONS ==== - -# Compute correlation matrix with p-values -cor_results_pearson <- rcorr(as.matrix(df_complete), type = "pearson") - -# Extract correlation coefficients -cor_pearson <- cor_results_pearson$r - -# Extract p-values -p_matrix_pearson <- cor_results_pearson$P - -# Function to add significance stars -corstars <- function(cor_mat, p_mat) { - stars <- ifelse(p_mat < 0.001, "***", - ifelse(p_mat < 0.01, "**", - ifelse(p_mat < 0.05, "*", ""))) - - # Combine correlation values with stars, rounded to 5 decimal places - cor_with_stars <- matrix(paste0(format(round(cor_mat, 5), nsmall = 5), stars), - nrow = nrow(cor_mat)) - - # Set row and column names - rownames(cor_with_stars) <- rownames(cor_mat) - colnames(cor_with_stars) <- colnames(cor_mat) - - return(cor_with_stars) -} - -# Apply the function -cor_table_pearson <- corstars(cor_pearson, p_matrix_pearson) - -cat("\n=== PEARSON CORRELATIONS ===\n") -print(cor_table_pearson, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations <- cor_pearson[eohi_vars, cal_vars] -eohi_cal_pvalues <- p_matrix_pearson[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Pearson Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations)) { - for(j in 1:ncol(eohi_cal_correlations)) { - cor_val <- eohi_cal_correlations[i, j] - p_val <- eohi_cal_pvalues[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: r = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations)[i], - colnames(eohi_cal_correlations)[j], - cor_val, star, p_val)) - } -} - -####==== SPEARMAN CORRELATIONS ==== - -# Compute Spearman correlation matrix with p-values -cor_results_spearman <- rcorr(as.matrix(df_complete), type = 
"spearman") - -# Extract correlation coefficients -cor_spearman <- cor_results_spearman$r - -# Extract p-values -p_matrix_spearman <- cor_results_spearman$P - -# Apply the function -cor_table_spearman <- corstars(cor_spearman, p_matrix_spearman) - -cat("\n=== SPEARMAN CORRELATIONS ===\n") -print(cor_table_spearman, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations_spearman <- cor_spearman[eohi_vars, cal_vars] -eohi_cal_pvalues_spearman <- p_matrix_spearman[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Spearman Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations_spearman)) { - for(j in 1:ncol(eohi_cal_correlations_spearman)) { - cor_val <- eohi_cal_correlations_spearman[i, j] - p_val <- eohi_cal_pvalues_spearman[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: rho = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations_spearman)[i], - colnames(eohi_cal_correlations_spearman)[j], - cor_val, star, p_val)) - } -} - -####==== BOOTSTRAPPED 95% CONFIDENCE INTERVALS ==== - -# Function to compute correlation with bootstrap CI -bootstrap_correlation <- function(data, var1, var2, method = "pearson", R = 1000) { - # Remove missing values - complete_data <- data[complete.cases(data[, c(var1, var2)]), ] - - if(nrow(complete_data) < 3) { - return(data.frame( - correlation = NA, - ci_lower = NA, - ci_upper = NA, - n = nrow(complete_data) - )) - } - - # Bootstrap function - boot_fun <- function(data, indices) { - cor(data[indices, var1], data[indices, var2], method = method, use = "complete.obs") - } - - # Perform bootstrap - set.seed(123) # for reproducibility - boot_results <- boot(complete_data, boot_fun, R = R) - - # Calculate confidence interval - ci <- boot.ci(boot_results, type = "perc") - - return(data.frame( - correlation = boot_results$t0, - ci_lower = ci$perc[4], - ci_upper = ci$perc[5], - n = nrow(complete_data) - )) -} - -# Compute bootstrap CIs for all EOHI x Calibration correlations -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (PEARSON) ===\n") -bootstrap_results_pearson <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "pearson", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "pearson" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_pearson) - -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (SPEARMAN) ===\n") -bootstrap_results_spearman <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "spearman", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "spearman" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_spearman) - -####==== SUMMARY TABLE ==== - -# Create comprehensive summary table -summary_table <- bootstrap_results_pearson %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper, n) %>% - rename(pearson_r = correlation, pearson_ci_lower = ci_lower, pearson_ci_upper = ci_upper) %>% - left_join( - 
bootstrap_results_spearman %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper) %>% - rename(spearman_rho = correlation, spearman_ci_lower = ci_lower, spearman_ci_upper = ci_upper), - by = c("eohi_var", "cal_var") - ) %>% - # Add p-values - left_join( - expand.grid(eohi_var = eohi_vars, cal_var = cal_vars, stringsAsFactors = FALSE) %>% - pmap_dfr(function(eohi_var, cal_var) { - pearson_p <- p_matrix_pearson[eohi_var, cal_var] - spearman_p <- p_matrix_spearman[eohi_var, cal_var] - data.frame( - eohi_var = eohi_var, - cal_var = cal_var, - pearson_p = pearson_p, - spearman_p = spearman_p - ) - }), - by = c("eohi_var", "cal_var") - ) %>% - mutate( - pearson_p = round(pearson_p, 5), - spearman_p = round(spearman_p, 5) - ) - -cat("\n=== COMPREHENSIVE SUMMARY TABLE ===\n") -print(summary_table) - -# Save results to CSV -write.csv(summary_table, "eohi_calibration_correlations_summary.csv", row.names = FALSE) -cat("\nResults saved to: eohi_calibration_correlations_summary.csv\n") - -####==== EFFECT SIZES (Cohen's conventions) ==== - -cat("\n=== EFFECT SIZE INTERPRETATION (Cohen's conventions) ===\n") -cat("Small effect: |r| = 0.10\n") -cat("Medium effect: |r| = 0.30\n") -cat("Large effect: |r| = 0.50\n") - -# Categorize effect sizes -summary_table_with_effects <- summary_table %>% - mutate( - pearson_effect_size = case_when( - abs(pearson_r) >= 0.50 ~ "Large", - abs(pearson_r) >= 0.30 ~ "Medium", - abs(pearson_r) >= 0.10 ~ "Small", - TRUE ~ "Negligible" - ), - spearman_effect_size = case_when( - abs(spearman_rho) >= 0.50 ~ "Large", - abs(spearman_rho) >= 0.30 ~ "Medium", - abs(spearman_rho) >= 0.10 ~ "Small", - TRUE ~ "Negligible" - ) - ) - -cat("\n=== EFFECT SIZE CATEGORIZATION ===\n") -print(summary_table_with_effects %>% select(eohi_var, cal_var, pearson_r, pearson_effect_size, spearman_rho, spearman_effect_size)) diff --git a/.history/eohi1/correlations - eohi x calibration_20250915142201.r b/.history/eohi1/correlations - eohi x calibration_20250915142201.r deleted file mode 100644 index f8011c9..0000000 --- a/.history/eohi1/correlations - eohi x calibration_20250915142201.r +++ /dev/null @@ -1,307 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) -library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -# Load data -df1 <- read.csv("exp1.csv") - -# Remove columns with all NA values -df1 <- df1 %>% select(where(~ !all(is.na(.)))) - -# Select variables of interest -eohi_vars <- c("eohi_pref", "eohi_pers", "eohi_val", "eohi_life", "eohi_mean", - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean") -cal_vars <- c("cal_selfActual", "cal_global") - -# Create dataset with selected variables -df <- df1[, c(eohi_vars, cal_vars)] - -# Ensure all selected variables are numeric -df <- df %>% - mutate(across(everything(), as.numeric)) - -# Remove rows with any missing values for correlation analysis -df_complete <- df[complete.cases(df), ] - -cat("Sample size for correlation analysis:", nrow(df_complete), "\n") -cat("Total sample size:", nrow(df), "\n") - -str(df) -summary(df) -####==== DESCRIPTIVE STATISTICS ==== - -# Function to compute descriptive statistics -get_descriptives <- function(data, vars) { - desc_stats <- data %>% - select(all_of(vars)) %>% - summarise(across(everything(), list( - n = ~sum(!is.na(.)), - mean = ~mean(., na.rm = TRUE), - sd = ~sd(., na.rm = TRUE), - min = ~min(., na.rm = TRUE), - max = 
~max(., na.rm = TRUE), - median = ~median(., na.rm = TRUE), - q25 = ~quantile(., 0.25, na.rm = TRUE), - q75 = ~quantile(., 0.75, na.rm = TRUE) - ))) %>% - pivot_longer(everything(), names_to = "variable", values_to = "value") %>% - separate(variable, into = c("var", "stat"), sep = "_(?=[^_]+$)") %>% - pivot_wider(names_from = stat, values_from = value) %>% - mutate(across(c(mean, sd, min, max, median, q25, q75), ~round(., 5))) - - return(desc_stats) -} - -# Get descriptives for EOHI variables -eohi_descriptives <- get_descriptives(df, eohi_vars) -cat("\n=== EOHI Variables Descriptives ===\n") -print(eohi_descriptives) - -# Get descriptives for calibration variables -cal_descriptives <- get_descriptives(df, cal_vars) -cat("\n=== Calibration Variables Descriptives ===\n") -print(cal_descriptives) - -####==== PEARSON CORRELATIONS ==== - -# Compute correlation matrix with p-values -cor_results_pearson <- rcorr(as.matrix(df_complete), type = "pearson") - -# Extract correlation coefficients -cor_pearson <- cor_results_pearson$r - -# Extract p-values -p_matrix_pearson <- cor_results_pearson$P - -# Function to add significance stars -corstars <- function(cor_mat, p_mat) { - stars <- ifelse(p_mat < 0.001, "***", - ifelse(p_mat < 0.01, "**", - ifelse(p_mat < 0.05, "*", ""))) - - # Combine correlation values with stars, rounded to 5 decimal places - cor_with_stars <- matrix(paste0(format(round(cor_mat, 5), nsmall = 5), stars), - nrow = nrow(cor_mat)) - - # Set row and column names - rownames(cor_with_stars) <- rownames(cor_mat) - colnames(cor_with_stars) <- colnames(cor_mat) - - return(cor_with_stars) -} - -# Apply the function -cor_table_pearson <- corstars(cor_pearson, p_matrix_pearson) - -cat("\n=== PEARSON CORRELATIONS ===\n") -print(cor_table_pearson, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations <- cor_pearson[eohi_vars, cal_vars] -eohi_cal_pvalues <- p_matrix_pearson[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Pearson Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations)) { - for(j in 1:ncol(eohi_cal_correlations)) { - cor_val <- eohi_cal_correlations[i, j] - p_val <- eohi_cal_pvalues[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: r = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations)[i], - colnames(eohi_cal_correlations)[j], - cor_val, star, p_val)) - } -} - -####==== SPEARMAN CORRELATIONS ==== - -# Compute Spearman correlation matrix with p-values -cor_results_spearman <- rcorr(as.matrix(df_complete), type = "spearman") - -# Extract correlation coefficients -cor_spearman <- cor_results_spearman$r - -# Extract p-values -p_matrix_spearman <- cor_results_spearman$P - -# Apply the function -cor_table_spearman <- corstars(cor_spearman, p_matrix_spearman) - -cat("\n=== SPEARMAN CORRELATIONS ===\n") -print(cor_table_spearman, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations_spearman <- cor_spearman[eohi_vars, cal_vars] -eohi_cal_pvalues_spearman <- p_matrix_spearman[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Spearman Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations_spearman)) { - for(j in 1:ncol(eohi_cal_correlations_spearman)) { - cor_val <- eohi_cal_correlations_spearman[i, j] - p_val <- eohi_cal_pvalues_spearman[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: 
rho = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations_spearman)[i], - colnames(eohi_cal_correlations_spearman)[j], - cor_val, star, p_val)) - } -} - -####==== BOOTSTRAPPED 95% CONFIDENCE INTERVALS ==== - -# Function to compute correlation with bootstrap CI -bootstrap_correlation <- function(data, var1, var2, method = "pearson", R = 1000) { - # Remove missing values - complete_data <- data[complete.cases(data[, c(var1, var2)]), ] - - if(nrow(complete_data) < 3) { - return(data.frame( - correlation = NA, - ci_lower = NA, - ci_upper = NA, - n = nrow(complete_data) - )) - } - - # Bootstrap function - boot_fun <- function(data, indices) { - cor(data[indices, var1], data[indices, var2], method = method, use = "complete.obs") - } - - # Perform bootstrap - set.seed(123) # for reproducibility - boot_results <- boot(complete_data, boot_fun, R = R) - - # Calculate confidence interval - ci <- boot.ci(boot_results, type = "perc") - - return(data.frame( - correlation = boot_results$t0, - ci_lower = ci$perc[4], - ci_upper = ci$perc[5], - n = nrow(complete_data) - )) -} - -# Compute bootstrap CIs for all EOHI x Calibration correlations -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (PEARSON) ===\n") -bootstrap_results_pearson <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "pearson", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "pearson" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_pearson) - -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (SPEARMAN) ===\n") -bootstrap_results_spearman <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "spearman", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "spearman" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_spearman) - -####==== SUMMARY TABLE ==== - -# Create comprehensive summary table -summary_table <- bootstrap_results_pearson %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper, n) %>% - rename(pearson_r = correlation, pearson_ci_lower = ci_lower, pearson_ci_upper = ci_upper) %>% - left_join( - bootstrap_results_spearman %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper) %>% - rename(spearman_rho = correlation, spearman_ci_lower = ci_lower, spearman_ci_upper = ci_upper), - by = c("eohi_var", "cal_var") - ) %>% - # Add p-values - left_join( - expand.grid(eohi_var = eohi_vars, cal_var = cal_vars, stringsAsFactors = FALSE) %>% - pmap_dfr(function(eohi_var, cal_var) { - pearson_p <- p_matrix_pearson[eohi_var, cal_var] - spearman_p <- p_matrix_spearman[eohi_var, cal_var] - data.frame( - eohi_var = eohi_var, - cal_var = cal_var, - pearson_p = pearson_p, - spearman_p = spearman_p - ) - }), - by = c("eohi_var", "cal_var") - ) %>% - mutate( - pearson_p = round(pearson_p, 5), - spearman_p = round(spearman_p, 5) - ) - -cat("\n=== COMPREHENSIVE SUMMARY TABLE ===\n") -print(summary_table) - -# Save results to CSV -write.csv(summary_table, "eohi_calibration_correlations_summary.csv", row.names = FALSE) 
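# Editor's sketch (not part of the deleted script): the summary table above
# reports 20 raw p-values per method (10 EOHI vars x 2 calibration vars),
# so a multiplicity adjustment could be reported alongside them;
# stats::p.adjust() on the assembled columns is sufficient. The toy values
# below are illustrative only.
p_raw <- c(0.001, 0.012, 0.049, 0.200)
round(p.adjust(p_raw, method = "holm"), 5)  # Holm step-down (controls FWER)
round(p.adjust(p_raw, method = "BH"), 5)    # Benjamini-Hochberg (controls FDR)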
-cat("\nResults saved to: eohi_calibration_correlations_summary.csv\n") - -####==== EFFECT SIZES (Cohen's conventions) ==== - -cat("\n=== EFFECT SIZE INTERPRETATION (Cohen's conventions) ===\n") -cat("Small effect: |r| = 0.10\n") -cat("Medium effect: |r| = 0.30\n") -cat("Large effect: |r| = 0.50\n") - -# Categorize effect sizes -summary_table_with_effects <- summary_table %>% - mutate( - pearson_effect_size = case_when( - abs(pearson_r) >= 0.50 ~ "Large", - abs(pearson_r) >= 0.30 ~ "Medium", - abs(pearson_r) >= 0.10 ~ "Small", - TRUE ~ "Negligible" - ), - spearman_effect_size = case_when( - abs(spearman_rho) >= 0.50 ~ "Large", - abs(spearman_rho) >= 0.30 ~ "Medium", - abs(spearman_rho) >= 0.10 ~ "Small", - TRUE ~ "Negligible" - ) - ) - -cat("\n=== EFFECT SIZE CATEGORIZATION ===\n") -print(summary_table_with_effects %>% select(eohi_var, cal_var, pearson_r, pearson_effect_size, spearman_rho, spearman_effect_size)) diff --git a/.history/eohi1/correlations - eohi x calibration_20250915142559.r b/.history/eohi1/correlations - eohi x calibration_20250915142559.r deleted file mode 100644 index 4a8461b..0000000 --- a/.history/eohi1/correlations - eohi x calibration_20250915142559.r +++ /dev/null @@ -1,307 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) -library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -# Load data -df1 <- read.csv("exp1.csv") - -# Remove columns with all NA values -df1 <- df1 %>% select(where(~ !all(is.na(.)))) - -# Select variables of interest -eohi_vars <- c("eohi_pref", "eohi_pers", "eohi_val", "eohi_life", "eohi_mean", - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean") -cal_vars <- c("cal_selfActual", "cal_global", "cal_15", "cal_35", "cal_55", "cal_75", "cal_true", "cal_false") - -# Create dataset with selected variables -df <- df1[, c(eohi_vars, cal_vars)] - -# Ensure all selected variables are numeric -df <- df %>% - mutate(across(everything(), as.numeric)) - -# Remove rows with any missing values for correlation analysis -df_complete <- df[complete.cases(df), ] - -cat("Sample size for correlation analysis:", nrow(df_complete), "\n") -cat("Total sample size:", nrow(df), "\n") - -str(df) -summary(df) -####==== DESCRIPTIVE STATISTICS ==== - -# Function to compute descriptive statistics -get_descriptives <- function(data, vars) { - desc_stats <- data %>% - select(all_of(vars)) %>% - summarise(across(everything(), list( - n = ~sum(!is.na(.)), - mean = ~mean(., na.rm = TRUE), - sd = ~sd(., na.rm = TRUE), - min = ~min(., na.rm = TRUE), - max = ~max(., na.rm = TRUE), - median = ~median(., na.rm = TRUE), - q25 = ~quantile(., 0.25, na.rm = TRUE), - q75 = ~quantile(., 0.75, na.rm = TRUE) - ))) %>% - pivot_longer(everything(), names_to = "variable", values_to = "value") %>% - separate(variable, into = c("var", "stat"), sep = "_(?=[^_]+$)") %>% - pivot_wider(names_from = stat, values_from = value) %>% - mutate(across(c(mean, sd, min, max, median, q25, q75), ~round(., 5))) - - return(desc_stats) -} - -# Get descriptives for EOHI variables -eohi_descriptives <- get_descriptives(df, eohi_vars) -cat("\n=== EOHI Variables Descriptives ===\n") -print(eohi_descriptives) - -# Get descriptives for calibration variables -cal_descriptives <- get_descriptives(df, cal_vars) -cat("\n=== Calibration Variables Descriptives ===\n") -print(cal_descriptives) - -####==== PEARSON CORRELATIONS ==== - -# Compute 
correlation matrix with p-values -cor_results_pearson <- rcorr(as.matrix(df_complete), type = "pearson") - -# Extract correlation coefficients -cor_pearson <- cor_results_pearson$r - -# Extract p-values -p_matrix_pearson <- cor_results_pearson$P - -# Function to add significance stars -corstars <- function(cor_mat, p_mat) { - stars <- ifelse(p_mat < 0.001, "***", - ifelse(p_mat < 0.01, "**", - ifelse(p_mat < 0.05, "*", ""))) - - # Combine correlation values with stars, rounded to 5 decimal places - cor_with_stars <- matrix(paste0(format(round(cor_mat, 5), nsmall = 5), stars), - nrow = nrow(cor_mat)) - - # Set row and column names - rownames(cor_with_stars) <- rownames(cor_mat) - colnames(cor_with_stars) <- colnames(cor_mat) - - return(cor_with_stars) -} - -# Apply the function -cor_table_pearson <- corstars(cor_pearson, p_matrix_pearson) - -cat("\n=== PEARSON CORRELATIONS ===\n") -print(cor_table_pearson, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations <- cor_pearson[eohi_vars, cal_vars] -eohi_cal_pvalues <- p_matrix_pearson[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Pearson Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations)) { - for(j in 1:ncol(eohi_cal_correlations)) { - cor_val <- eohi_cal_correlations[i, j] - p_val <- eohi_cal_pvalues[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: r = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations)[i], - colnames(eohi_cal_correlations)[j], - cor_val, star, p_val)) - } -} - -####==== SPEARMAN CORRELATIONS ==== - -# Compute Spearman correlation matrix with p-values -cor_results_spearman <- rcorr(as.matrix(df_complete), type = "spearman") - -# Extract correlation coefficients -cor_spearman <- cor_results_spearman$r - -# Extract p-values -p_matrix_spearman <- cor_results_spearman$P - -# Apply the function -cor_table_spearman <- corstars(cor_spearman, p_matrix_spearman) - -cat("\n=== SPEARMAN CORRELATIONS ===\n") -print(cor_table_spearman, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations_spearman <- cor_spearman[eohi_vars, cal_vars] -eohi_cal_pvalues_spearman <- p_matrix_spearman[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Spearman Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations_spearman)) { - for(j in 1:ncol(eohi_cal_correlations_spearman)) { - cor_val <- eohi_cal_correlations_spearman[i, j] - p_val <- eohi_cal_pvalues_spearman[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: rho = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations_spearman)[i], - colnames(eohi_cal_correlations_spearman)[j], - cor_val, star, p_val)) - } -} - -####==== BOOTSTRAPPED 95% CONFIDENCE INTERVALS ==== - -# Function to compute correlation with bootstrap CI -bootstrap_correlation <- function(data, var1, var2, method = "pearson", R = 1000) { - # Remove missing values - complete_data <- data[complete.cases(data[, c(var1, var2)]), ] - - if(nrow(complete_data) < 3) { - return(data.frame( - correlation = NA, - ci_lower = NA, - ci_upper = NA, - n = nrow(complete_data) - )) - } - - # Bootstrap function - boot_fun <- function(data, indices) { - cor(data[indices, var1], data[indices, var2], method = method, use = "complete.obs") - } - - # Perform bootstrap - set.seed(123) # for reproducibility - boot_results <- boot(complete_data, boot_fun, R = R) - 
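# (Editor's note, added for clarity; not part of the deleted file.)
# boot() returns a "boot" object: $t0 holds the statistic computed on the
# original sample and $t the R bootstrap replicates. boot.ci(..., type =
# "perc") summarises $t; its $percent slot is a 1 x 5 row -- confidence
# level, two order-statistic indices, then the lower and upper endpoints --
# which is why elements [4] and [5] are pulled out below ($perc reaches
# $percent through partial matching).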
- # Calculate confidence interval - ci <- boot.ci(boot_results, type = "perc") - - return(data.frame( - correlation = boot_results$t0, - ci_lower = ci$perc[4], - ci_upper = ci$perc[5], - n = nrow(complete_data) - )) -} - -# Compute bootstrap CIs for all EOHI x Calibration correlations -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (PEARSON) ===\n") -bootstrap_results_pearson <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "pearson", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "pearson" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_pearson) - -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (SPEARMAN) ===\n") -bootstrap_results_spearman <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "spearman", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "spearman" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_spearman) - -####==== SUMMARY TABLE ==== - -# Create comprehensive summary table -summary_table <- bootstrap_results_pearson %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper, n) %>% - rename(pearson_r = correlation, pearson_ci_lower = ci_lower, pearson_ci_upper = ci_upper) %>% - left_join( - bootstrap_results_spearman %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper) %>% - rename(spearman_rho = correlation, spearman_ci_lower = ci_lower, spearman_ci_upper = ci_upper), - by = c("eohi_var", "cal_var") - ) %>% - # Add p-values - left_join( - expand.grid(eohi_var = eohi_vars, cal_var = cal_vars, stringsAsFactors = FALSE) %>% - pmap_dfr(function(eohi_var, cal_var) { - pearson_p <- p_matrix_pearson[eohi_var, cal_var] - spearman_p <- p_matrix_spearman[eohi_var, cal_var] - data.frame( - eohi_var = eohi_var, - cal_var = cal_var, - pearson_p = pearson_p, - spearman_p = spearman_p - ) - }), - by = c("eohi_var", "cal_var") - ) %>% - mutate( - pearson_p = round(pearson_p, 5), - spearman_p = round(spearman_p, 5) - ) - -cat("\n=== COMPREHENSIVE SUMMARY TABLE ===\n") -print(summary_table) - -# Save results to CSV -write.csv(summary_table, "eohi_calibration_correlations_summary.csv", row.names = FALSE) -cat("\nResults saved to: eohi_calibration_correlations_summary.csv\n") - -####==== EFFECT SIZES (Cohen's conventions) ==== - -cat("\n=== EFFECT SIZE INTERPRETATION (Cohen's conventions) ===\n") -cat("Small effect: |r| = 0.10\n") -cat("Medium effect: |r| = 0.30\n") -cat("Large effect: |r| = 0.50\n") - -# Categorize effect sizes -summary_table_with_effects <- summary_table %>% - mutate( - pearson_effect_size = case_when( - abs(pearson_r) >= 0.50 ~ "Large", - abs(pearson_r) >= 0.30 ~ "Medium", - abs(pearson_r) >= 0.10 ~ "Small", - TRUE ~ "Negligible" - ), - spearman_effect_size = case_when( - abs(spearman_rho) >= 0.50 ~ "Large", - abs(spearman_rho) >= 0.30 ~ "Medium", - abs(spearman_rho) >= 0.10 ~ "Small", - TRUE ~ "Negligible" - ) - ) - -cat("\n=== EFFECT SIZE CATEGORIZATION ===\n") -print(summary_table_with_effects %>% select(eohi_var, cal_var, 
pearson_r, pearson_effect_size, spearman_rho, spearman_effect_size)) diff --git a/.history/eohi1/correlations - eohi x calibration_20250915142603.r b/.history/eohi1/correlations - eohi x calibration_20250915142603.r deleted file mode 100644 index 4a8461b..0000000 --- a/.history/eohi1/correlations - eohi x calibration_20250915142603.r +++ /dev/null @@ -1,307 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) -library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -# Load data -df1 <- read.csv("exp1.csv") - -# Remove columns with all NA values -df1 <- df1 %>% select(where(~ !all(is.na(.)))) - -# Select variables of interest -eohi_vars <- c("eohi_pref", "eohi_pers", "eohi_val", "eohi_life", "eohi_mean", - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean") -cal_vars <- c("cal_selfActual", "cal_global", "cal_15", "cal_35", "cal_55", "cal_75", "cal_true", "cal_false") - -# Create dataset with selected variables -df <- df1[, c(eohi_vars, cal_vars)] - -# Ensure all selected variables are numeric -df <- df %>% - mutate(across(everything(), as.numeric)) - -# Remove rows with any missing values for correlation analysis -df_complete <- df[complete.cases(df), ] - -cat("Sample size for correlation analysis:", nrow(df_complete), "\n") -cat("Total sample size:", nrow(df), "\n") - -str(df) -summary(df) -####==== DESCRIPTIVE STATISTICS ==== - -# Function to compute descriptive statistics -get_descriptives <- function(data, vars) { - desc_stats <- data %>% - select(all_of(vars)) %>% - summarise(across(everything(), list( - n = ~sum(!is.na(.)), - mean = ~mean(., na.rm = TRUE), - sd = ~sd(., na.rm = TRUE), - min = ~min(., na.rm = TRUE), - max = ~max(., na.rm = TRUE), - median = ~median(., na.rm = TRUE), - q25 = ~quantile(., 0.25, na.rm = TRUE), - q75 = ~quantile(., 0.75, na.rm = TRUE) - ))) %>% - pivot_longer(everything(), names_to = "variable", values_to = "value") %>% - separate(variable, into = c("var", "stat"), sep = "_(?=[^_]+$)") %>% - pivot_wider(names_from = stat, values_from = value) %>% - mutate(across(c(mean, sd, min, max, median, q25, q75), ~round(., 5))) - - return(desc_stats) -} - -# Get descriptives for EOHI variables -eohi_descriptives <- get_descriptives(df, eohi_vars) -cat("\n=== EOHI Variables Descriptives ===\n") -print(eohi_descriptives) - -# Get descriptives for calibration variables -cal_descriptives <- get_descriptives(df, cal_vars) -cat("\n=== Calibration Variables Descriptives ===\n") -print(cal_descriptives) - -####==== PEARSON CORRELATIONS ==== - -# Compute correlation matrix with p-values -cor_results_pearson <- rcorr(as.matrix(df_complete), type = "pearson") - -# Extract correlation coefficients -cor_pearson <- cor_results_pearson$r - -# Extract p-values -p_matrix_pearson <- cor_results_pearson$P - -# Function to add significance stars -corstars <- function(cor_mat, p_mat) { - stars <- ifelse(p_mat < 0.001, "***", - ifelse(p_mat < 0.01, "**", - ifelse(p_mat < 0.05, "*", ""))) - - # Combine correlation values with stars, rounded to 5 decimal places - cor_with_stars <- matrix(paste0(format(round(cor_mat, 5), nsmall = 5), stars), - nrow = nrow(cor_mat)) - - # Set row and column names - rownames(cor_with_stars) <- rownames(cor_mat) - colnames(cor_with_stars) <- colnames(cor_mat) - - return(cor_with_stars) -} - -# Apply the function -cor_table_pearson <- corstars(cor_pearson, p_matrix_pearson) - -cat("\n=== 
PEARSON CORRELATIONS ===\n") -print(cor_table_pearson, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations <- cor_pearson[eohi_vars, cal_vars] -eohi_cal_pvalues <- p_matrix_pearson[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Pearson Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations)) { - for(j in 1:ncol(eohi_cal_correlations)) { - cor_val <- eohi_cal_correlations[i, j] - p_val <- eohi_cal_pvalues[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: r = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations)[i], - colnames(eohi_cal_correlations)[j], - cor_val, star, p_val)) - } -} - -####==== SPEARMAN CORRELATIONS ==== - -# Compute Spearman correlation matrix with p-values -cor_results_spearman <- rcorr(as.matrix(df_complete), type = "spearman") - -# Extract correlation coefficients -cor_spearman <- cor_results_spearman$r - -# Extract p-values -p_matrix_spearman <- cor_results_spearman$P - -# Apply the function -cor_table_spearman <- corstars(cor_spearman, p_matrix_spearman) - -cat("\n=== SPEARMAN CORRELATIONS ===\n") -print(cor_table_spearman, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations_spearman <- cor_spearman[eohi_vars, cal_vars] -eohi_cal_pvalues_spearman <- p_matrix_spearman[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Spearman Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations_spearman)) { - for(j in 1:ncol(eohi_cal_correlations_spearman)) { - cor_val <- eohi_cal_correlations_spearman[i, j] - p_val <- eohi_cal_pvalues_spearman[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: rho = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations_spearman)[i], - colnames(eohi_cal_correlations_spearman)[j], - cor_val, star, p_val)) - } -} - -####==== BOOTSTRAPPED 95% CONFIDENCE INTERVALS ==== - -# Function to compute correlation with bootstrap CI -bootstrap_correlation <- function(data, var1, var2, method = "pearson", R = 1000) { - # Remove missing values - complete_data <- data[complete.cases(data[, c(var1, var2)]), ] - - if(nrow(complete_data) < 3) { - return(data.frame( - correlation = NA, - ci_lower = NA, - ci_upper = NA, - n = nrow(complete_data) - )) - } - - # Bootstrap function - boot_fun <- function(data, indices) { - cor(data[indices, var1], data[indices, var2], method = method, use = "complete.obs") - } - - # Perform bootstrap - set.seed(123) # for reproducibility - boot_results <- boot(complete_data, boot_fun, R = R) - - # Calculate confidence interval - ci <- boot.ci(boot_results, type = "perc") - - return(data.frame( - correlation = boot_results$t0, - ci_lower = ci$perc[4], - ci_upper = ci$perc[5], - n = nrow(complete_data) - )) -} - -# Compute bootstrap CIs for all EOHI x Calibration correlations -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (PEARSON) ===\n") -bootstrap_results_pearson <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "pearson", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "pearson" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - 
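# Editor's sketch (not part of the deleted file): pmap_dfr() treats each
# row of the expand.grid() data frame as a set of named arguments (the
# column names must match the function's argument names) and row-binds the
# returned data frames, which is what assembles the long results table
# above. A self-contained toy with hypothetical variables:
library(purrr)
toy_grid <- expand.grid(a = c("x1", "x2"), b = c("y1", "y2"),
                        stringsAsFactors = FALSE)
pmap_dfr(toy_grid, function(a, b) data.frame(pair = paste(a, b, sep = " x ")))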
-print(bootstrap_results_pearson) - -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (SPEARMAN) ===\n") -bootstrap_results_spearman <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "spearman", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "spearman" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_spearman) - -####==== SUMMARY TABLE ==== - -# Create comprehensive summary table -summary_table <- bootstrap_results_pearson %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper, n) %>% - rename(pearson_r = correlation, pearson_ci_lower = ci_lower, pearson_ci_upper = ci_upper) %>% - left_join( - bootstrap_results_spearman %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper) %>% - rename(spearman_rho = correlation, spearman_ci_lower = ci_lower, spearman_ci_upper = ci_upper), - by = c("eohi_var", "cal_var") - ) %>% - # Add p-values - left_join( - expand.grid(eohi_var = eohi_vars, cal_var = cal_vars, stringsAsFactors = FALSE) %>% - pmap_dfr(function(eohi_var, cal_var) { - pearson_p <- p_matrix_pearson[eohi_var, cal_var] - spearman_p <- p_matrix_spearman[eohi_var, cal_var] - data.frame( - eohi_var = eohi_var, - cal_var = cal_var, - pearson_p = pearson_p, - spearman_p = spearman_p - ) - }), - by = c("eohi_var", "cal_var") - ) %>% - mutate( - pearson_p = round(pearson_p, 5), - spearman_p = round(spearman_p, 5) - ) - -cat("\n=== COMPREHENSIVE SUMMARY TABLE ===\n") -print(summary_table) - -# Save results to CSV -write.csv(summary_table, "eohi_calibration_correlations_summary.csv", row.names = FALSE) -cat("\nResults saved to: eohi_calibration_correlations_summary.csv\n") - -####==== EFFECT SIZES (Cohen's conventions) ==== - -cat("\n=== EFFECT SIZE INTERPRETATION (Cohen's conventions) ===\n") -cat("Small effect: |r| = 0.10\n") -cat("Medium effect: |r| = 0.30\n") -cat("Large effect: |r| = 0.50\n") - -# Categorize effect sizes -summary_table_with_effects <- summary_table %>% - mutate( - pearson_effect_size = case_when( - abs(pearson_r) >= 0.50 ~ "Large", - abs(pearson_r) >= 0.30 ~ "Medium", - abs(pearson_r) >= 0.10 ~ "Small", - TRUE ~ "Negligible" - ), - spearman_effect_size = case_when( - abs(spearman_rho) >= 0.50 ~ "Large", - abs(spearman_rho) >= 0.30 ~ "Medium", - abs(spearman_rho) >= 0.10 ~ "Small", - TRUE ~ "Negligible" - ) - ) - -cat("\n=== EFFECT SIZE CATEGORIZATION ===\n") -print(summary_table_with_effects %>% select(eohi_var, cal_var, pearson_r, pearson_effect_size, spearman_rho, spearman_effect_size)) diff --git a/.history/eohi1/correlations - eohi x calibration_20250916091406.r b/.history/eohi1/correlations - eohi x calibration_20250916091406.r deleted file mode 100644 index cc41f25..0000000 --- a/.history/eohi1/correlations - eohi x calibration_20250916091406.r +++ /dev/null @@ -1,280 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) -library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -# Load data -df1 <- read.csv("exp1.csv") - -# Remove columns with all NA values -df1 <- df1 %>% select(where(~ !all(is.na(.)))) - -# Select variables of interest -eohi_vars <- c("eohi_pref", "eohi_pers", "eohi_val", "eohi_life", 
"eohi_mean", - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean") -cal_vars <- c("cal_selfActual", "cal_global", "cal_15", "cal_35", "cal_55", "cal_75", "cal_true", "cal_false") - -# Create dataset with selected variables -df <- df1[, c(eohi_vars, cal_vars)] - -# Ensure all selected variables are numeric -df <- df %>% - mutate(across(everything(), as.numeric)) - -# Remove rows with any missing values for correlation analysis -df_complete <- df[complete.cases(df), ] - -cat("Sample size for correlation analysis:", nrow(df_complete), "\n") -cat("Total sample size:", nrow(df), "\n") - -str(df) -summary(df) -####==== DESCRIPTIVE STATISTICS ==== - -# Function to compute descriptive statistics -get_descriptives <- function(data, vars) { - desc_stats <- data %>% - select(all_of(vars)) %>% - summarise(across(everything(), list( - n = ~sum(!is.na(.)), - mean = ~mean(., na.rm = TRUE), - sd = ~sd(., na.rm = TRUE), - min = ~min(., na.rm = TRUE), - max = ~max(., na.rm = TRUE), - median = ~median(., na.rm = TRUE), - q25 = ~quantile(., 0.25, na.rm = TRUE), - q75 = ~quantile(., 0.75, na.rm = TRUE) - ))) %>% - pivot_longer(everything(), names_to = "variable", values_to = "value") %>% - separate(variable, into = c("var", "stat"), sep = "_(?=[^_]+$)") %>% - pivot_wider(names_from = stat, values_from = value) %>% - mutate(across(c(mean, sd, min, max, median, q25, q75), ~round(., 5))) - - return(desc_stats) -} - -# Get descriptives for EOHI variables -eohi_descriptives <- get_descriptives(df, eohi_vars) -cat("\n=== EOHI Variables Descriptives ===\n") -print(eohi_descriptives) - -# Get descriptives for calibration variables -cal_descriptives <- get_descriptives(df, cal_vars) -cat("\n=== Calibration Variables Descriptives ===\n") -print(cal_descriptives) - -####==== PEARSON CORRELATIONS ==== - -# Compute correlation matrix with p-values -cor_results_pearson <- rcorr(as.matrix(df_complete), type = "pearson") - -# Extract correlation coefficients -cor_pearson <- cor_results_pearson$r - -# Extract p-values -p_matrix_pearson <- cor_results_pearson$P - -# Function to add significance stars -corstars <- function(cor_mat, p_mat) { - stars <- ifelse(p_mat < 0.001, "***", - ifelse(p_mat < 0.01, "**", - ifelse(p_mat < 0.05, "*", ""))) - - # Combine correlation values with stars, rounded to 5 decimal places - cor_with_stars <- matrix(paste0(format(round(cor_mat, 5), nsmall = 5), stars), - nrow = nrow(cor_mat)) - - # Set row and column names - rownames(cor_with_stars) <- rownames(cor_mat) - colnames(cor_with_stars) <- colnames(cor_mat) - - return(cor_with_stars) -} - -# Apply the function -cor_table_pearson <- corstars(cor_pearson, p_matrix_pearson) - -cat("\n=== PEARSON CORRELATIONS ===\n") -print(cor_table_pearson, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations <- cor_pearson[eohi_vars, cal_vars] -eohi_cal_pvalues <- p_matrix_pearson[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Pearson Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations)) { - for(j in 1:ncol(eohi_cal_correlations)) { - cor_val <- eohi_cal_correlations[i, j] - p_val <- eohi_cal_pvalues[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: r = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations)[i], - colnames(eohi_cal_correlations)[j], - cor_val, star, p_val)) - } -} - -####==== SPEARMAN CORRELATIONS ==== - -# Compute Spearman correlation matrix with p-values 
-cor_results_spearman <- rcorr(as.matrix(df_complete), type = "spearman") - -# Extract correlation coefficients -cor_spearman <- cor_results_spearman$r - -# Extract p-values -p_matrix_spearman <- cor_results_spearman$P - -# Apply the function -cor_table_spearman <- corstars(cor_spearman, p_matrix_spearman) - -cat("\n=== SPEARMAN CORRELATIONS ===\n") -print(cor_table_spearman, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations_spearman <- cor_spearman[eohi_vars, cal_vars] -eohi_cal_pvalues_spearman <- p_matrix_spearman[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Spearman Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations_spearman)) { - for(j in 1:ncol(eohi_cal_correlations_spearman)) { - cor_val <- eohi_cal_correlations_spearman[i, j] - p_val <- eohi_cal_pvalues_spearman[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: rho = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations_spearman)[i], - colnames(eohi_cal_correlations_spearman)[j], - cor_val, star, p_val)) - } -} - -####==== BOOTSTRAPPED 95% CONFIDENCE INTERVALS ==== - -# Function to compute correlation with bootstrap CI -bootstrap_correlation <- function(data, var1, var2, method = "pearson", R = 1000) { - # Remove missing values - complete_data <- data[complete.cases(data[, c(var1, var2)]), ] - - if(nrow(complete_data) < 3) { - return(data.frame( - correlation = NA, - ci_lower = NA, - ci_upper = NA, - n = nrow(complete_data) - )) - } - - # Bootstrap function - boot_fun <- function(data, indices) { - cor(data[indices, var1], data[indices, var2], method = method, use = "complete.obs") - } - - # Perform bootstrap - set.seed(123) # for reproducibility - boot_results <- boot(complete_data, boot_fun, R = R) - - # Calculate confidence interval - ci <- boot.ci(boot_results, type = "perc") - - return(data.frame( - correlation = boot_results$t0, - ci_lower = ci$perc[4], - ci_upper = ci$perc[5], - n = nrow(complete_data) - )) -} - -# Compute bootstrap CIs for all EOHI x Calibration correlations -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (PEARSON) ===\n") -bootstrap_results_pearson <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "pearson", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "pearson" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_pearson) - -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (SPEARMAN) ===\n") -bootstrap_results_spearman <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "spearman", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "spearman" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_spearman) - -####==== SUMMARY TABLE ==== - -# Create comprehensive summary table -summary_table <- bootstrap_results_pearson %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper, n) %>% - rename(pearson_r = correlation, pearson_ci_lower 
= ci_lower, pearson_ci_upper = ci_upper) %>% - left_join( - bootstrap_results_spearman %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper) %>% - rename(spearman_rho = correlation, spearman_ci_lower = ci_lower, spearman_ci_upper = ci_upper), - by = c("eohi_var", "cal_var") - ) %>% - # Add p-values - left_join( - expand.grid(eohi_var = eohi_vars, cal_var = cal_vars, stringsAsFactors = FALSE) %>% - pmap_dfr(function(eohi_var, cal_var) { - pearson_p <- p_matrix_pearson[eohi_var, cal_var] - spearman_p <- p_matrix_spearman[eohi_var, cal_var] - data.frame( - eohi_var = eohi_var, - cal_var = cal_var, - pearson_p = pearson_p, - spearman_p = spearman_p - ) - }), - by = c("eohi_var", "cal_var") - ) %>% - mutate( - pearson_p = round(pearson_p, 5), - spearman_p = round(spearman_p, 5) - ) - -cat("\n=== COMPREHENSIVE SUMMARY TABLE ===\n") -print(summary_table) - -# Save results to CSV -write.csv(summary_table, "eohi_calibration_correlations_summary.csv", row.names = FALSE) -cat("\nResults saved to: eohi_calibration_correlations_summary.csv\n") diff --git a/.history/eohi1/correlations - eohi x calibration_20250916091630.r b/.history/eohi1/correlations - eohi x calibration_20250916091630.r deleted file mode 100644 index 4f04efd..0000000 --- a/.history/eohi1/correlations - eohi x calibration_20250916091630.r +++ /dev/null @@ -1,280 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) -library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -# Load data -df1 <- read.csv("exp1.csv") - -# Remove columns with all NA values -df1 <- df1 %>% select(where(~ !all(is.na(.)))) - -# Select variables of interest -eohi_vars <- c("eohi_pref", "eohi_pers", "eohi_val", "eohi_life", "eohi_mean", - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean") -cal_vars <- c("cal_selfActual", "cal_global", "cal_15", "cal_35", "cal_55", "cal_75", "cal_true", "cal_false") - -# Create dataset with selected variables -df <- df1[, c(eohi_vars, cal_vars)] - -# Ensure all selected variables are numeric -df <- df %>% - mutate(across(everything(), as.numeric)) - -# Remove rows with any missing values for correlation analysis -df_complete <- df[complete.cases(df), ] - -cat("Sample size for correlation analysis:", nrow(df_complete), "\n") -cat("Total sample size:", nrow(df), "\n") - -str(df) -summary(df) -####==== DESCRIPTIVE STATISTICS ==== - -# Function to compute descriptive statistics -get_descriptives <- function(data, vars) { - desc_stats <- data %>% - select(all_of(vars)) %>% - summarise(across(everything(), list( - n = ~sum(!is.na(.)), - mean = ~mean(., na.rm = TRUE), - sd = ~sd(., na.rm = TRUE), - min = ~min(., na.rm = TRUE), - max = ~max(., na.rm = TRUE), - median = ~median(., na.rm = TRUE), - q25 = ~quantile(., 0.25, na.rm = TRUE), - q75 = ~quantile(., 0.75, na.rm = TRUE) - ))) %>% - pivot_longer(everything(), names_to = "variable", values_to = "value") %>% - separate(variable, into = c("var", "stat"), sep = "_(?=[^_]+$)") %>% - pivot_wider(names_from = stat, values_from = value) %>% - mutate(across(c(mean, sd, min, max, median, q25, q75), ~round(., 5))) - - return(desc_stats) -} - -# Get descriptives for EOHI variables -eohi_descriptives <- get_descriptives(df, eohi_vars) -cat("\n=== EOHI Variables Descriptives ===\n") -print(eohi_descriptives) - -# Get descriptives for calibration variables -cal_descriptives <- get_descriptives(df, cal_vars) 
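# Editor's sketch (not part of the deleted file): the reshape inside
# get_descriptives() splits names such as "eohi_pref_mean" at the LAST
# underscore -- the lookahead "_(?=[^_]+$)" -- so variable names that
# themselves contain underscores survive intact. Minimal illustration
# with a hypothetical one-column tibble:
library(dplyr)
library(tidyr)
tibble(eohi_pref = c(1, 2, 3)) %>%
  summarise(across(everything(), list(mean = ~mean(.), sd = ~sd(.)))) %>%
  pivot_longer(everything(), names_to = "variable", values_to = "value") %>%
  separate(variable, into = c("var", "stat"), sep = "_(?=[^_]+$)") %>%
  pivot_wider(names_from = stat, values_from = value)
# yields one row: var = "eohi_pref", mean = 2, sd = 1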
-cat("\n=== Calibration Variables Descriptives ===\n") -print(cal_descriptives) - -####==== PEARSON CORRELATIONS ==== - -# Compute correlation matrix with p-values -cor_results_pearson <- rcorr(as.matrix(df_complete), type = "pearson") - -# Extract correlation coefficients -cor_pearson <- cor_results_pearson$r - -# Extract p-values -p_matrix_pearson <- cor_results_pearson$P - -# Function to add significance stars -corstars <- function(cor_mat, p_mat) { - stars <- ifelse(p_mat < 0.001, "***", - ifelse(p_mat < 0.01, "**", - ifelse(p_mat < 0.05, "*", ""))) - - # Combine correlation values with stars, rounded to 5 decimal places - cor_with_stars <- matrix(paste0(format(round(cor_mat, 5), nsmall = 5), stars), - nrow = nrow(cor_mat)) - - # Set row and column names - rownames(cor_with_stars) <- rownames(cor_mat) - colnames(cor_with_stars) <- colnames(cor_mat) - - return(cor_with_stars) -} - -# Apply the function -cor_table_pearson <- corstars(cor_pearson, p_matrix_pearson) - -cat("\n=== PEARSON CORRELATIONS ===\n") -print(cor_table_pearson, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations <- cor_pearson[eohi_vars, cal_vars] -eohi_cal_pvalues <- p_matrix_pearson[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Pearson Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations)) { - for(j in 1:ncol(eohi_cal_correlations)) { - cor_val <- eohi_cal_correlations[i, j] - p_val <- eohi_cal_pvalues[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: r = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations)[i], - colnames(eohi_cal_correlations)[j], - cor_val, star, p_val)) - } -} - -####==== SPEARMAN CORRELATIONS ==== - -# Compute Spearman correlation matrix with p-values -cor_results_spearman <- rcorr(as.matrix(df_complete), type = "spearman") - -# Extract correlation coefficients -cor_spearman <- cor_results_spearman$r - -# Extract p-values -p_matrix_spearman <- cor_results_spearman$P - -# Apply the function -cor_table_spearman <- corstars(cor_spearman, p_matrix_spearman) - -cat("\n=== SPEARMAN CORRELATIONS ===\n") -print(cor_table_spearman, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations_spearman <- cor_spearman[eohi_vars, cal_vars] -eohi_cal_pvalues_spearman <- p_matrix_spearman[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Spearman Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations_spearman)) { - for(j in 1:ncol(eohi_cal_correlations_spearman)) { - cor_val <- eohi_cal_correlations_spearman[i, j] - p_val <- eohi_cal_pvalues_spearman[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: rho = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations_spearman)[i], - colnames(eohi_cal_correlations_spearman)[j], - cor_val, star, p_val)) - } -} - -####==== BOOTSTRAPPED 95% CONFIDENCE INTERVALS ==== - -# Function to compute correlation with bootstrap CI -bootstrap_correlation <- function(data, var1, var2, method = "pearson", R = 1000) { - # Remove missing values - complete_data <- data[complete.cases(data[, c(var1, var2)]), ] - - if(nrow(complete_data) < 3) { - return(data.frame( - correlation = NA, - ci_lower = NA, - ci_upper = NA, - n = nrow(complete_data) - )) - } - - # Bootstrap function - boot_fun <- function(data, indices) { - cor(data[indices, var1], data[indices, var2], method = method, use = 
"complete.obs") - } - - # Perform bootstrap - set.seed(123) # for reproducibility - boot_results <- boot(complete_data, boot_fun, R = R) - - # Calculate confidence interval - ci <- boot.ci(boot_results, type = "perc") - - return(data.frame( - correlation = boot_results$t0, - ci_lower = ci$perc[4], - ci_upper = ci$perc[5], - n = nrow(complete_data) - )) -} - -# Compute bootstrap CIs for all EOHI x Calibration correlations -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (PEARSON) ===\n") -bootstrap_results_pearson <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "pearson", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "pearson" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_pearson) - -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (SPEARMAN) ===\n") -bootstrap_results_spearman <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "spearman", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "spearman" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_spearman) - -####==== SUMMARY TABLE ==== - -# Create comprehensive summary table -summary_table <- bootstrap_results_pearson %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper, n) %>% - rename(pearson_r = correlation, pearson_ci_lower = ci_lower, pearson_ci_upper = ci_upper) %>% - left_join( - bootstrap_results_spearman %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper) %>% - rename(spearman_rho = correlation, spearman_ci_lower = ci_lower, spearman_ci_upper = ci_upper), - by = c("eohi_var", "cal_var") - ) %>% - # Add p-values - left_join( - expand.grid(eohi_var = eohi_vars, cal_var = cal_vars, stringsAsFactors = FALSE) %>% - pmap_dfr(function(eohi_var, cal_var) { - pearson_p <- p_matrix_pearson[eohi_var, cal_var] - spearman_p <- p_matrix_spearman[eohi_var, cal_var] - data.frame( - eohi_var = eohi_var, - cal_var = cal_var, - pearson_p = pearson_p, - spearman_p = spearman_p - ) - }), - by = c("eohi_var", "cal_var") - ) %>% - mutate( - pearson_p = round(pearson_p, 5), - spearman_p = round(spearman_p, 5) - ) - -cat("\n=== COMPREHENSIVE SUMMARY TABLE ===\n") -print(summary_table) - -# Save results to CSV -# write.csv(summary_table, "eohi_calibration_correlations_summary.csv", row.names = FALSE) -# cat("\nResults saved to: eohi_calibration_correlations_summary.csv\n") diff --git a/.history/eohi1/correlations - eohi x calibration_20250916092909.r b/.history/eohi1/correlations - eohi x calibration_20250916092909.r deleted file mode 100644 index ee0dfa3..0000000 --- a/.history/eohi1/correlations - eohi x calibration_20250916092909.r +++ /dev/null @@ -1,307 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) -library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -# Load data -df1 <- read.csv("exp1.csv") - -# Remove columns with all NA values -df1 <- df1 %>% 
select(where(~ !all(is.na(.)))) - -# Select variables of interest -eohi_vars <- c("eohi_pref", "eohi_pers", "eohi_val", "eohi_life", "eohi_mean", - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean") -cal_vars <- c("cal_selfActual", "cal_global", "cal_15", "cal_35", "cal_55", "cal_75", "cal_true", "cal_false") - -# Create dataset with selected variables -df <- df1[, c(eohi_vars, cal_vars)] - -# Ensure all selected variables are numeric -df <- df %>% - mutate(across(everything(), as.numeric)) - -# Remove rows with any missing values for correlation analysis -df_complete <- df[complete.cases(df), ] - -cat("Sample size for correlation analysis:", nrow(df_complete), "\n") -cat("Total sample size:", nrow(df), "\n") - -str(df) -summary(df) -####==== DESCRIPTIVE STATISTICS ==== - -# Function to compute descriptive statistics -get_descriptives <- function(data, vars) { - desc_stats <- data %>% - select(all_of(vars)) %>% - summarise(across(everything(), list( - n = ~sum(!is.na(.)), - mean = ~mean(., na.rm = TRUE), - sd = ~sd(., na.rm = TRUE), - min = ~min(., na.rm = TRUE), - max = ~max(., na.rm = TRUE), - median = ~median(., na.rm = TRUE), - q25 = ~quantile(., 0.25, na.rm = TRUE), - q75 = ~quantile(., 0.75, na.rm = TRUE) - ))) %>% - pivot_longer(everything(), names_to = "variable", values_to = "value") %>% - separate(variable, into = c("var", "stat"), sep = "_(?=[^_]+$)") %>% - pivot_wider(names_from = stat, values_from = value) %>% - mutate(across(c(mean, sd, min, max, median, q25, q75), ~round(., 5))) - - return(desc_stats) -} - -# Get descriptives for EOHI variables -eohi_descriptives <- get_descriptives(df, eohi_vars) -cat("\n=== EOHI Variables Descriptives ===\n") -print(eohi_descriptives) - -# Get descriptives for calibration variables -cal_descriptives <- get_descriptives(df, cal_vars) -cat("\n=== Calibration Variables Descriptives ===\n") -print(cal_descriptives) - -# Debug mode - check distributions and outliers -browser() # This will pause execution and enter debug mode - -# Check for bimodal or unusual distributions -hist(df$eohi_pref, main = "EOHI Preference Distribution", xlab = "EOHI Preference") -hist(df$eohi_pers, main = "EOHI Personal Distribution", xlab = "EOHI Personal") -hist(df$eohi_val, main = "EOHI Value Distribution", xlab = "EOHI Value") -hist(df$eohi_life, main = "EOHI Life Distribution", xlab = "EOHI Life") - -# Check for extreme values -boxplot(df$eohi_pref, main = "EOHI Preference Boxplot") -boxplot(df$eohi_pers, main = "EOHI Personal Boxplot") -boxplot(df$eohi_val, main = "EOHI Value Boxplot") -boxplot(df$eohi_life, main = "EOHI Life Boxplot") - -# Check calibration variables -hist(df$cal_selfActual, main = "Calibration Self-Actual Distribution", xlab = "Calibration Self-Actual") -boxplot(df$cal_selfActual, main = "Calibration Self-Actual Boxplot") - -# Check for bimodal or unusual distributions -hist(df$eohi_pref) -hist(df$cal_selfActual) - -# Look for extreme values -boxplot(df$eohi_pref) -boxplot(df$cal_selfActual) - -####==== PEARSON CORRELATIONS ==== - -# Compute correlation matrix with p-values -cor_results_pearson <- rcorr(as.matrix(df_complete), type = "pearson") - -# Extract correlation coefficients -cor_pearson <- cor_results_pearson$r - -# Extract p-values -p_matrix_pearson <- cor_results_pearson$P - -# Function to add significance stars -corstars <- function(cor_mat, p_mat) { - stars <- ifelse(p_mat < 0.001, "***", - ifelse(p_mat < 0.01, "**", - ifelse(p_mat < 0.05, "*", ""))) - - # Combine correlation values with stars, 
rounded to 5 decimal places - cor_with_stars <- matrix(paste0(format(round(cor_mat, 5), nsmall = 5), stars), - nrow = nrow(cor_mat)) - - # Set row and column names - rownames(cor_with_stars) <- rownames(cor_mat) - colnames(cor_with_stars) <- colnames(cor_mat) - - return(cor_with_stars) -} - -# Apply the function -cor_table_pearson <- corstars(cor_pearson, p_matrix_pearson) - -cat("\n=== PEARSON CORRELATIONS ===\n") -print(cor_table_pearson, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations <- cor_pearson[eohi_vars, cal_vars] -eohi_cal_pvalues <- p_matrix_pearson[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Pearson Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations)) { - for(j in 1:ncol(eohi_cal_correlations)) { - cor_val <- eohi_cal_correlations[i, j] - p_val <- eohi_cal_pvalues[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: r = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations)[i], - colnames(eohi_cal_correlations)[j], - cor_val, star, p_val)) - } -} - -####==== SPEARMAN CORRELATIONS ==== - -# Compute Spearman correlation matrix with p-values -cor_results_spearman <- rcorr(as.matrix(df_complete), type = "spearman") - -# Extract correlation coefficients -cor_spearman <- cor_results_spearman$r - -# Extract p-values -p_matrix_spearman <- cor_results_spearman$P - -# Apply the function -cor_table_spearman <- corstars(cor_spearman, p_matrix_spearman) - -cat("\n=== SPEARMAN CORRELATIONS ===\n") -print(cor_table_spearman, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations_spearman <- cor_spearman[eohi_vars, cal_vars] -eohi_cal_pvalues_spearman <- p_matrix_spearman[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Spearman Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations_spearman)) { - for(j in 1:ncol(eohi_cal_correlations_spearman)) { - cor_val <- eohi_cal_correlations_spearman[i, j] - p_val <- eohi_cal_pvalues_spearman[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: rho = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations_spearman)[i], - colnames(eohi_cal_correlations_spearman)[j], - cor_val, star, p_val)) - } -} - -####==== BOOTSTRAPPED 95% CONFIDENCE INTERVALS ==== - -# Function to compute correlation with bootstrap CI -bootstrap_correlation <- function(data, var1, var2, method = "pearson", R = 1000) { - # Remove missing values - complete_data <- data[complete.cases(data[, c(var1, var2)]), ] - - if(nrow(complete_data) < 3) { - return(data.frame( - correlation = NA, - ci_lower = NA, - ci_upper = NA, - n = nrow(complete_data) - )) - } - - # Bootstrap function - boot_fun <- function(data, indices) { - cor(data[indices, var1], data[indices, var2], method = method, use = "complete.obs") - } - - # Perform bootstrap - set.seed(123) # for reproducibility - boot_results <- boot(complete_data, boot_fun, R = R) - - # Calculate confidence interval - ci <- boot.ci(boot_results, type = "perc") - - return(data.frame( - correlation = boot_results$t0, - ci_lower = ci$perc[4], - ci_upper = ci$perc[5], - n = nrow(complete_data) - )) -} - -# Compute bootstrap CIs for all EOHI x Calibration correlations -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (PEARSON) ===\n") -bootstrap_results_pearson <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) 
%>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "pearson", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "pearson" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_pearson) - -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (SPEARMAN) ===\n") -bootstrap_results_spearman <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "spearman", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "spearman" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_spearman) - -####==== SUMMARY TABLE ==== - -# Create comprehensive summary table -summary_table <- bootstrap_results_pearson %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper, n) %>% - rename(pearson_r = correlation, pearson_ci_lower = ci_lower, pearson_ci_upper = ci_upper) %>% - left_join( - bootstrap_results_spearman %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper) %>% - rename(spearman_rho = correlation, spearman_ci_lower = ci_lower, spearman_ci_upper = ci_upper), - by = c("eohi_var", "cal_var") - ) %>% - # Add p-values - left_join( - expand.grid(eohi_var = eohi_vars, cal_var = cal_vars, stringsAsFactors = FALSE) %>% - pmap_dfr(function(eohi_var, cal_var) { - pearson_p <- p_matrix_pearson[eohi_var, cal_var] - spearman_p <- p_matrix_spearman[eohi_var, cal_var] - data.frame( - eohi_var = eohi_var, - cal_var = cal_var, - pearson_p = pearson_p, - spearman_p = spearman_p - ) - }), - by = c("eohi_var", "cal_var") - ) %>% - mutate( - pearson_p = round(pearson_p, 5), - spearman_p = round(spearman_p, 5) - ) - -cat("\n=== COMPREHENSIVE SUMMARY TABLE ===\n") -print(summary_table) - -# Save results to CSV -# write.csv(summary_table, "eohi_calibration_correlations_summary.csv", row.names = FALSE) -# cat("\nResults saved to: eohi_calibration_correlations_summary.csv\n") diff --git a/.history/eohi1/correlations - eohi x calibration_20250916092913.r b/.history/eohi1/correlations - eohi x calibration_20250916092913.r deleted file mode 100644 index ff153e7..0000000 --- a/.history/eohi1/correlations - eohi x calibration_20250916092913.r +++ /dev/null @@ -1,303 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) -library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -# Load data -df1 <- read.csv("exp1.csv") - -# Remove columns with all NA values -df1 <- df1 %>% select(where(~ !all(is.na(.)))) - -# Select variables of interest -eohi_vars <- c("eohi_pref", "eohi_pers", "eohi_val", "eohi_life", "eohi_mean", - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean") -cal_vars <- c("cal_selfActual", "cal_global", "cal_15", "cal_35", "cal_55", "cal_75", "cal_true", "cal_false") - -# Create dataset with selected variables -df <- df1[, c(eohi_vars, cal_vars)] - -# Ensure all selected variables are numeric -df <- df %>% - mutate(across(everything(), as.numeric)) - -# Remove rows with any missing values for correlation analysis -df_complete <- 
df[complete.cases(df), ] - -cat("Sample size for correlation analysis:", nrow(df_complete), "\n") -cat("Total sample size:", nrow(df), "\n") - -str(df) -summary(df) -####==== DESCRIPTIVE STATISTICS ==== - -# Function to compute descriptive statistics -get_descriptives <- function(data, vars) { - desc_stats <- data %>% - select(all_of(vars)) %>% - summarise(across(everything(), list( - n = ~sum(!is.na(.)), - mean = ~mean(., na.rm = TRUE), - sd = ~sd(., na.rm = TRUE), - min = ~min(., na.rm = TRUE), - max = ~max(., na.rm = TRUE), - median = ~median(., na.rm = TRUE), - q25 = ~quantile(., 0.25, na.rm = TRUE), - q75 = ~quantile(., 0.75, na.rm = TRUE) - ))) %>% - pivot_longer(everything(), names_to = "variable", values_to = "value") %>% - separate(variable, into = c("var", "stat"), sep = "_(?=[^_]+$)") %>% - pivot_wider(names_from = stat, values_from = value) %>% - mutate(across(c(mean, sd, min, max, median, q25, q75), ~round(., 5))) - - return(desc_stats) -} - -# Get descriptives for EOHI variables -eohi_descriptives <- get_descriptives(df, eohi_vars) -cat("\n=== EOHI Variables Descriptives ===\n") -print(eohi_descriptives) - -# Get descriptives for calibration variables -cal_descriptives <- get_descriptives(df, cal_vars) -cat("\n=== Calibration Variables Descriptives ===\n") -print(cal_descriptives) - -# Debug mode - check distributions and outliers -browser() # This will pause execution and enter debug mode - -# Check for bimodal or unusual distributions -hist(df$eohi_pref, main = "EOHI Preference Distribution", xlab = "EOHI Preference") -hist(df$eohi_pers, main = "EOHI Personal Distribution", xlab = "EOHI Personal") -hist(df$eohi_val, main = "EOHI Value Distribution", xlab = "EOHI Value") -hist(df$eohi_life, main = "EOHI Life Distribution", xlab = "EOHI Life") - -# Check for extreme values -boxplot(df$eohi_pref, main = "EOHI Preference Boxplot") -boxplot(df$eohi_pers, main = "EOHI Personal Boxplot") -boxplot(df$eohi_val, main = "EOHI Value Boxplot") -boxplot(df$eohi_life, main = "EOHI Life Boxplot") - -# Check calibration variables -hist(df$cal_selfActual, main = "Calibration Self-Actual Distribution", xlab = "Calibration Self-Actual") -boxplot(df$cal_selfActual, main = "Calibration Self-Actual Boxplot") - -# Look for extreme values -boxplot(df$eohi_pref) -boxplot(df$cal_selfActual) - -####==== PEARSON CORRELATIONS ==== - -# Compute correlation matrix with p-values -cor_results_pearson <- rcorr(as.matrix(df_complete), type = "pearson") - -# Extract correlation coefficients -cor_pearson <- cor_results_pearson$r - -# Extract p-values -p_matrix_pearson <- cor_results_pearson$P - -# Function to add significance stars -corstars <- function(cor_mat, p_mat) { - stars <- ifelse(p_mat < 0.001, "***", - ifelse(p_mat < 0.01, "**", - ifelse(p_mat < 0.05, "*", ""))) - - # Combine correlation values with stars, rounded to 5 decimal places - cor_with_stars <- matrix(paste0(format(round(cor_mat, 5), nsmall = 5), stars), - nrow = nrow(cor_mat)) - - # Set row and column names - rownames(cor_with_stars) <- rownames(cor_mat) - colnames(cor_with_stars) <- colnames(cor_mat) - - return(cor_with_stars) -} - -# Apply the function -cor_table_pearson <- corstars(cor_pearson, p_matrix_pearson) - -cat("\n=== PEARSON CORRELATIONS ===\n") -print(cor_table_pearson, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations <- cor_pearson[eohi_vars, cal_vars] -eohi_cal_pvalues <- p_matrix_pearson[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Pearson Correlations 
===\n") -for(i in 1:nrow(eohi_cal_correlations)) { - for(j in 1:ncol(eohi_cal_correlations)) { - cor_val <- eohi_cal_correlations[i, j] - p_val <- eohi_cal_pvalues[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: r = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations)[i], - colnames(eohi_cal_correlations)[j], - cor_val, star, p_val)) - } -} - -####==== SPEARMAN CORRELATIONS ==== - -# Compute Spearman correlation matrix with p-values -cor_results_spearman <- rcorr(as.matrix(df_complete), type = "spearman") - -# Extract correlation coefficients -cor_spearman <- cor_results_spearman$r - -# Extract p-values -p_matrix_spearman <- cor_results_spearman$P - -# Apply the function -cor_table_spearman <- corstars(cor_spearman, p_matrix_spearman) - -cat("\n=== SPEARMAN CORRELATIONS ===\n") -print(cor_table_spearman, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations_spearman <- cor_spearman[eohi_vars, cal_vars] -eohi_cal_pvalues_spearman <- p_matrix_spearman[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Spearman Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations_spearman)) { - for(j in 1:ncol(eohi_cal_correlations_spearman)) { - cor_val <- eohi_cal_correlations_spearman[i, j] - p_val <- eohi_cal_pvalues_spearman[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: rho = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations_spearman)[i], - colnames(eohi_cal_correlations_spearman)[j], - cor_val, star, p_val)) - } -} - -####==== BOOTSTRAPPED 95% CONFIDENCE INTERVALS ==== - -# Function to compute correlation with bootstrap CI -bootstrap_correlation <- function(data, var1, var2, method = "pearson", R = 1000) { - # Remove missing values - complete_data <- data[complete.cases(data[, c(var1, var2)]), ] - - if(nrow(complete_data) < 3) { - return(data.frame( - correlation = NA, - ci_lower = NA, - ci_upper = NA, - n = nrow(complete_data) - )) - } - - # Bootstrap function - boot_fun <- function(data, indices) { - cor(data[indices, var1], data[indices, var2], method = method, use = "complete.obs") - } - - # Perform bootstrap - set.seed(123) # for reproducibility - boot_results <- boot(complete_data, boot_fun, R = R) - - # Calculate confidence interval - ci <- boot.ci(boot_results, type = "perc") - - return(data.frame( - correlation = boot_results$t0, - ci_lower = ci$perc[4], - ci_upper = ci$perc[5], - n = nrow(complete_data) - )) -} - -# Compute bootstrap CIs for all EOHI x Calibration correlations -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (PEARSON) ===\n") -bootstrap_results_pearson <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "pearson", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "pearson" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_pearson) - -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (SPEARMAN) ===\n") -bootstrap_results_spearman <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = 
"spearman", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "spearman" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_spearman) - -####==== SUMMARY TABLE ==== - -# Create comprehensive summary table -summary_table <- bootstrap_results_pearson %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper, n) %>% - rename(pearson_r = correlation, pearson_ci_lower = ci_lower, pearson_ci_upper = ci_upper) %>% - left_join( - bootstrap_results_spearman %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper) %>% - rename(spearman_rho = correlation, spearman_ci_lower = ci_lower, spearman_ci_upper = ci_upper), - by = c("eohi_var", "cal_var") - ) %>% - # Add p-values - left_join( - expand.grid(eohi_var = eohi_vars, cal_var = cal_vars, stringsAsFactors = FALSE) %>% - pmap_dfr(function(eohi_var, cal_var) { - pearson_p <- p_matrix_pearson[eohi_var, cal_var] - spearman_p <- p_matrix_spearman[eohi_var, cal_var] - data.frame( - eohi_var = eohi_var, - cal_var = cal_var, - pearson_p = pearson_p, - spearman_p = spearman_p - ) - }), - by = c("eohi_var", "cal_var") - ) %>% - mutate( - pearson_p = round(pearson_p, 5), - spearman_p = round(spearman_p, 5) - ) - -cat("\n=== COMPREHENSIVE SUMMARY TABLE ===\n") -print(summary_table) - -# Save results to CSV -# write.csv(summary_table, "eohi_calibration_correlations_summary.csv", row.names = FALSE) -# cat("\nResults saved to: eohi_calibration_correlations_summary.csv\n") diff --git a/.history/eohi1/correlations - eohi x calibration_20250916092918.r b/.history/eohi1/correlations - eohi x calibration_20250916092918.r deleted file mode 100644 index c15d5da..0000000 --- a/.history/eohi1/correlations - eohi x calibration_20250916092918.r +++ /dev/null @@ -1,299 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) -library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -# Load data -df1 <- read.csv("exp1.csv") - -# Remove columns with all NA values -df1 <- df1 %>% select(where(~ !all(is.na(.)))) - -# Select variables of interest -eohi_vars <- c("eohi_pref", "eohi_pers", "eohi_val", "eohi_life", "eohi_mean", - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean") -cal_vars <- c("cal_selfActual", "cal_global", "cal_15", "cal_35", "cal_55", "cal_75", "cal_true", "cal_false") - -# Create dataset with selected variables -df <- df1[, c(eohi_vars, cal_vars)] - -# Ensure all selected variables are numeric -df <- df %>% - mutate(across(everything(), as.numeric)) - -# Remove rows with any missing values for correlation analysis -df_complete <- df[complete.cases(df), ] - -cat("Sample size for correlation analysis:", nrow(df_complete), "\n") -cat("Total sample size:", nrow(df), "\n") - -str(df) -summary(df) -####==== DESCRIPTIVE STATISTICS ==== - -# Function to compute descriptive statistics -get_descriptives <- function(data, vars) { - desc_stats <- data %>% - select(all_of(vars)) %>% - summarise(across(everything(), list( - n = ~sum(!is.na(.)), - mean = ~mean(., na.rm = TRUE), - sd = ~sd(., na.rm = TRUE), - min = ~min(., na.rm = TRUE), - max = ~max(., na.rm = TRUE), - median = ~median(., na.rm = TRUE), - q25 = ~quantile(., 0.25, na.rm = TRUE), - q75 = ~quantile(., 0.75, na.rm = TRUE) - ))) %>% - pivot_longer(everything(), names_to = 
"variable", values_to = "value") %>% - separate(variable, into = c("var", "stat"), sep = "_(?=[^_]+$)") %>% - pivot_wider(names_from = stat, values_from = value) %>% - mutate(across(c(mean, sd, min, max, median, q25, q75), ~round(., 5))) - - return(desc_stats) -} - -# Get descriptives for EOHI variables -eohi_descriptives <- get_descriptives(df, eohi_vars) -cat("\n=== EOHI Variables Descriptives ===\n") -print(eohi_descriptives) - -# Get descriptives for calibration variables -cal_descriptives <- get_descriptives(df, cal_vars) -cat("\n=== Calibration Variables Descriptives ===\n") -print(cal_descriptives) - -# Debug mode - check distributions and outliers -browser() # This will pause execution and enter debug mode - -# Check for bimodal or unusual distributions -hist(df$eohi_pref, main = "EOHI Preference Distribution", xlab = "EOHI Preference") -hist(df$eohi_pers, main = "EOHI Personal Distribution", xlab = "EOHI Personal") -hist(df$eohi_val, main = "EOHI Value Distribution", xlab = "EOHI Value") -hist(df$eohi_life, main = "EOHI Life Distribution", xlab = "EOHI Life") - -# Check for extreme values -boxplot(df$eohi_pref, main = "EOHI Preference Boxplot") -boxplot(df$eohi_pers, main = "EOHI Personal Boxplot") -boxplot(df$eohi_val, main = "EOHI Value Boxplot") -boxplot(df$eohi_life, main = "EOHI Life Boxplot") - -# Check calibration variables -hist(df$cal_selfActual, main = "Calibration Self-Actual Distribution", xlab = "Calibration Self-Actual") -boxplot(df$cal_selfActual, main = "Calibration Self-Actual Boxplot") - -####==== PEARSON CORRELATIONS ==== - -# Compute correlation matrix with p-values -cor_results_pearson <- rcorr(as.matrix(df_complete), type = "pearson") - -# Extract correlation coefficients -cor_pearson <- cor_results_pearson$r - -# Extract p-values -p_matrix_pearson <- cor_results_pearson$P - -# Function to add significance stars -corstars <- function(cor_mat, p_mat) { - stars <- ifelse(p_mat < 0.001, "***", - ifelse(p_mat < 0.01, "**", - ifelse(p_mat < 0.05, "*", ""))) - - # Combine correlation values with stars, rounded to 5 decimal places - cor_with_stars <- matrix(paste0(format(round(cor_mat, 5), nsmall = 5), stars), - nrow = nrow(cor_mat)) - - # Set row and column names - rownames(cor_with_stars) <- rownames(cor_mat) - colnames(cor_with_stars) <- colnames(cor_mat) - - return(cor_with_stars) -} - -# Apply the function -cor_table_pearson <- corstars(cor_pearson, p_matrix_pearson) - -cat("\n=== PEARSON CORRELATIONS ===\n") -print(cor_table_pearson, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations <- cor_pearson[eohi_vars, cal_vars] -eohi_cal_pvalues <- p_matrix_pearson[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Pearson Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations)) { - for(j in 1:ncol(eohi_cal_correlations)) { - cor_val <- eohi_cal_correlations[i, j] - p_val <- eohi_cal_pvalues[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: r = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations)[i], - colnames(eohi_cal_correlations)[j], - cor_val, star, p_val)) - } -} - -####==== SPEARMAN CORRELATIONS ==== - -# Compute Spearman correlation matrix with p-values -cor_results_spearman <- rcorr(as.matrix(df_complete), type = "spearman") - -# Extract correlation coefficients -cor_spearman <- cor_results_spearman$r - -# Extract p-values -p_matrix_spearman <- cor_results_spearman$P - -# Apply the function -cor_table_spearman 
<- corstars(cor_spearman, p_matrix_spearman) - -cat("\n=== SPEARMAN CORRELATIONS ===\n") -print(cor_table_spearman, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations_spearman <- cor_spearman[eohi_vars, cal_vars] -eohi_cal_pvalues_spearman <- p_matrix_spearman[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Spearman Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations_spearman)) { - for(j in 1:ncol(eohi_cal_correlations_spearman)) { - cor_val <- eohi_cal_correlations_spearman[i, j] - p_val <- eohi_cal_pvalues_spearman[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: rho = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations_spearman)[i], - colnames(eohi_cal_correlations_spearman)[j], - cor_val, star, p_val)) - } -} - -####==== BOOTSTRAPPED 95% CONFIDENCE INTERVALS ==== - -# Function to compute correlation with bootstrap CI -bootstrap_correlation <- function(data, var1, var2, method = "pearson", R = 1000) { - # Remove missing values - complete_data <- data[complete.cases(data[, c(var1, var2)]), ] - - if(nrow(complete_data) < 3) { - return(data.frame( - correlation = NA, - ci_lower = NA, - ci_upper = NA, - n = nrow(complete_data) - )) - } - - # Bootstrap function - boot_fun <- function(data, indices) { - cor(data[indices, var1], data[indices, var2], method = method, use = "complete.obs") - } - - # Perform bootstrap - set.seed(123) # for reproducibility - boot_results <- boot(complete_data, boot_fun, R = R) - - # Calculate confidence interval - ci <- boot.ci(boot_results, type = "perc") - - return(data.frame( - correlation = boot_results$t0, - ci_lower = ci$perc[4], - ci_upper = ci$perc[5], - n = nrow(complete_data) - )) -} - -# Compute bootstrap CIs for all EOHI x Calibration correlations -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (PEARSON) ===\n") -bootstrap_results_pearson <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "pearson", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "pearson" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_pearson) - -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (SPEARMAN) ===\n") -bootstrap_results_spearman <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "spearman", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "spearman" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_spearman) - -####==== SUMMARY TABLE ==== - -# Create comprehensive summary table -summary_table <- bootstrap_results_pearson %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper, n) %>% - rename(pearson_r = correlation, pearson_ci_lower = ci_lower, pearson_ci_upper = ci_upper) %>% - left_join( - bootstrap_results_spearman %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper) %>% - rename(spearman_rho = correlation, spearman_ci_lower = ci_lower, spearman_ci_upper = ci_upper), - by = 
c("eohi_var", "cal_var") - ) %>% - # Add p-values - left_join( - expand.grid(eohi_var = eohi_vars, cal_var = cal_vars, stringsAsFactors = FALSE) %>% - pmap_dfr(function(eohi_var, cal_var) { - pearson_p <- p_matrix_pearson[eohi_var, cal_var] - spearman_p <- p_matrix_spearman[eohi_var, cal_var] - data.frame( - eohi_var = eohi_var, - cal_var = cal_var, - pearson_p = pearson_p, - spearman_p = spearman_p - ) - }), - by = c("eohi_var", "cal_var") - ) %>% - mutate( - pearson_p = round(pearson_p, 5), - spearman_p = round(spearman_p, 5) - ) - -cat("\n=== COMPREHENSIVE SUMMARY TABLE ===\n") -print(summary_table) - -# Save results to CSV -# write.csv(summary_table, "eohi_calibration_correlations_summary.csv", row.names = FALSE) -# cat("\nResults saved to: eohi_calibration_correlations_summary.csv\n") diff --git a/.history/eohi1/correlations - eohi x calibration_20250916092925.r b/.history/eohi1/correlations - eohi x calibration_20250916092925.r deleted file mode 100644 index c15d5da..0000000 --- a/.history/eohi1/correlations - eohi x calibration_20250916092925.r +++ /dev/null @@ -1,299 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) -library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -# Load data -df1 <- read.csv("exp1.csv") - -# Remove columns with all NA values -df1 <- df1 %>% select(where(~ !all(is.na(.)))) - -# Select variables of interest -eohi_vars <- c("eohi_pref", "eohi_pers", "eohi_val", "eohi_life", "eohi_mean", - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean") -cal_vars <- c("cal_selfActual", "cal_global", "cal_15", "cal_35", "cal_55", "cal_75", "cal_true", "cal_false") - -# Create dataset with selected variables -df <- df1[, c(eohi_vars, cal_vars)] - -# Ensure all selected variables are numeric -df <- df %>% - mutate(across(everything(), as.numeric)) - -# Remove rows with any missing values for correlation analysis -df_complete <- df[complete.cases(df), ] - -cat("Sample size for correlation analysis:", nrow(df_complete), "\n") -cat("Total sample size:", nrow(df), "\n") - -str(df) -summary(df) -####==== DESCRIPTIVE STATISTICS ==== - -# Function to compute descriptive statistics -get_descriptives <- function(data, vars) { - desc_stats <- data %>% - select(all_of(vars)) %>% - summarise(across(everything(), list( - n = ~sum(!is.na(.)), - mean = ~mean(., na.rm = TRUE), - sd = ~sd(., na.rm = TRUE), - min = ~min(., na.rm = TRUE), - max = ~max(., na.rm = TRUE), - median = ~median(., na.rm = TRUE), - q25 = ~quantile(., 0.25, na.rm = TRUE), - q75 = ~quantile(., 0.75, na.rm = TRUE) - ))) %>% - pivot_longer(everything(), names_to = "variable", values_to = "value") %>% - separate(variable, into = c("var", "stat"), sep = "_(?=[^_]+$)") %>% - pivot_wider(names_from = stat, values_from = value) %>% - mutate(across(c(mean, sd, min, max, median, q25, q75), ~round(., 5))) - - return(desc_stats) -} - -# Get descriptives for EOHI variables -eohi_descriptives <- get_descriptives(df, eohi_vars) -cat("\n=== EOHI Variables Descriptives ===\n") -print(eohi_descriptives) - -# Get descriptives for calibration variables -cal_descriptives <- get_descriptives(df, cal_vars) -cat("\n=== Calibration Variables Descriptives ===\n") -print(cal_descriptives) - -# Debug mode - check distributions and outliers -browser() # This will pause execution and enter debug mode - -# Check for bimodal or unusual distributions -hist(df$eohi_pref, 
main = "EOHI Preference Distribution", xlab = "EOHI Preference") -hist(df$eohi_pers, main = "EOHI Personal Distribution", xlab = "EOHI Personal") -hist(df$eohi_val, main = "EOHI Value Distribution", xlab = "EOHI Value") -hist(df$eohi_life, main = "EOHI Life Distribution", xlab = "EOHI Life") - -# Check for extreme values -boxplot(df$eohi_pref, main = "EOHI Preference Boxplot") -boxplot(df$eohi_pers, main = "EOHI Personal Boxplot") -boxplot(df$eohi_val, main = "EOHI Value Boxplot") -boxplot(df$eohi_life, main = "EOHI Life Boxplot") - -# Check calibration variables -hist(df$cal_selfActual, main = "Calibration Self-Actual Distribution", xlab = "Calibration Self-Actual") -boxplot(df$cal_selfActual, main = "Calibration Self-Actual Boxplot") - -####==== PEARSON CORRELATIONS ==== - -# Compute correlation matrix with p-values -cor_results_pearson <- rcorr(as.matrix(df_complete), type = "pearson") - -# Extract correlation coefficients -cor_pearson <- cor_results_pearson$r - -# Extract p-values -p_matrix_pearson <- cor_results_pearson$P - -# Function to add significance stars -corstars <- function(cor_mat, p_mat) { - stars <- ifelse(p_mat < 0.001, "***", - ifelse(p_mat < 0.01, "**", - ifelse(p_mat < 0.05, "*", ""))) - - # Combine correlation values with stars, rounded to 5 decimal places - cor_with_stars <- matrix(paste0(format(round(cor_mat, 5), nsmall = 5), stars), - nrow = nrow(cor_mat)) - - # Set row and column names - rownames(cor_with_stars) <- rownames(cor_mat) - colnames(cor_with_stars) <- colnames(cor_mat) - - return(cor_with_stars) -} - -# Apply the function -cor_table_pearson <- corstars(cor_pearson, p_matrix_pearson) - -cat("\n=== PEARSON CORRELATIONS ===\n") -print(cor_table_pearson, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations <- cor_pearson[eohi_vars, cal_vars] -eohi_cal_pvalues <- p_matrix_pearson[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Pearson Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations)) { - for(j in 1:ncol(eohi_cal_correlations)) { - cor_val <- eohi_cal_correlations[i, j] - p_val <- eohi_cal_pvalues[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: r = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations)[i], - colnames(eohi_cal_correlations)[j], - cor_val, star, p_val)) - } -} - -####==== SPEARMAN CORRELATIONS ==== - -# Compute Spearman correlation matrix with p-values -cor_results_spearman <- rcorr(as.matrix(df_complete), type = "spearman") - -# Extract correlation coefficients -cor_spearman <- cor_results_spearman$r - -# Extract p-values -p_matrix_spearman <- cor_results_spearman$P - -# Apply the function -cor_table_spearman <- corstars(cor_spearman, p_matrix_spearman) - -cat("\n=== SPEARMAN CORRELATIONS ===\n") -print(cor_table_spearman, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations_spearman <- cor_spearman[eohi_vars, cal_vars] -eohi_cal_pvalues_spearman <- p_matrix_spearman[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Spearman Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations_spearman)) { - for(j in 1:ncol(eohi_cal_correlations_spearman)) { - cor_val <- eohi_cal_correlations_spearman[i, j] - p_val <- eohi_cal_pvalues_spearman[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: rho = %.5f%s, p = %.5f\n", - 
rownames(eohi_cal_correlations_spearman)[i], - colnames(eohi_cal_correlations_spearman)[j], - cor_val, star, p_val)) - } -} - -####==== BOOTSTRAPPED 95% CONFIDENCE INTERVALS ==== - -# Function to compute correlation with bootstrap CI -bootstrap_correlation <- function(data, var1, var2, method = "pearson", R = 1000) { - # Remove missing values - complete_data <- data[complete.cases(data[, c(var1, var2)]), ] - - if(nrow(complete_data) < 3) { - return(data.frame( - correlation = NA, - ci_lower = NA, - ci_upper = NA, - n = nrow(complete_data) - )) - } - - # Bootstrap function - boot_fun <- function(data, indices) { - cor(data[indices, var1], data[indices, var2], method = method, use = "complete.obs") - } - - # Perform bootstrap - set.seed(123) # for reproducibility - boot_results <- boot(complete_data, boot_fun, R = R) - - # Calculate confidence interval - ci <- boot.ci(boot_results, type = "perc") - - return(data.frame( - correlation = boot_results$t0, - ci_lower = ci$perc[4], - ci_upper = ci$perc[5], - n = nrow(complete_data) - )) -} - -# Compute bootstrap CIs for all EOHI x Calibration correlations -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (PEARSON) ===\n") -bootstrap_results_pearson <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "pearson", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "pearson" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_pearson) - -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (SPEARMAN) ===\n") -bootstrap_results_spearman <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "spearman", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "spearman" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_spearman) - -####==== SUMMARY TABLE ==== - -# Create comprehensive summary table -summary_table <- bootstrap_results_pearson %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper, n) %>% - rename(pearson_r = correlation, pearson_ci_lower = ci_lower, pearson_ci_upper = ci_upper) %>% - left_join( - bootstrap_results_spearman %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper) %>% - rename(spearman_rho = correlation, spearman_ci_lower = ci_lower, spearman_ci_upper = ci_upper), - by = c("eohi_var", "cal_var") - ) %>% - # Add p-values - left_join( - expand.grid(eohi_var = eohi_vars, cal_var = cal_vars, stringsAsFactors = FALSE) %>% - pmap_dfr(function(eohi_var, cal_var) { - pearson_p <- p_matrix_pearson[eohi_var, cal_var] - spearman_p <- p_matrix_spearman[eohi_var, cal_var] - data.frame( - eohi_var = eohi_var, - cal_var = cal_var, - pearson_p = pearson_p, - spearman_p = spearman_p - ) - }), - by = c("eohi_var", "cal_var") - ) %>% - mutate( - pearson_p = round(pearson_p, 5), - spearman_p = round(spearman_p, 5) - ) - -cat("\n=== COMPREHENSIVE SUMMARY TABLE ===\n") -print(summary_table) - -# Save results to CSV -# write.csv(summary_table, "eohi_calibration_correlations_summary.csv", row.names = FALSE) -# cat("\nResults saved to: 
eohi_calibration_correlations_summary.csv\n") diff --git a/.history/eohi1/correlations - eohi x calibration_20250916092959.r b/.history/eohi1/correlations - eohi x calibration_20250916092959.r deleted file mode 100644 index 0aa75f4..0000000 --- a/.history/eohi1/correlations - eohi x calibration_20250916092959.r +++ /dev/null @@ -1,288 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) -library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -# Load data -df1 <- read.csv("exp1.csv") - -# Remove columns with all NA values -df1 <- df1 %>% select(where(~ !all(is.na(.)))) - -# Select variables of interest -eohi_vars <- c("eohi_pref", "eohi_pers", "eohi_val", "eohi_life", "eohi_mean", - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean") -cal_vars <- c("cal_selfActual", "cal_global", "cal_15", "cal_35", "cal_55", "cal_75", "cal_true", "cal_false") - -# Create dataset with selected variables -df <- df1[, c(eohi_vars, cal_vars)] - -# Ensure all selected variables are numeric -df <- df %>% - mutate(across(everything(), as.numeric)) - -# Remove rows with any missing values for correlation analysis -df_complete <- df[complete.cases(df), ] - -cat("Sample size for correlation analysis:", nrow(df_complete), "\n") -cat("Total sample size:", nrow(df), "\n") - -str(df) -summary(df) -####==== DESCRIPTIVE STATISTICS ==== - -# Function to compute descriptive statistics -get_descriptives <- function(data, vars) { - desc_stats <- data %>% - select(all_of(vars)) %>% - summarise(across(everything(), list( - n = ~sum(!is.na(.)), - mean = ~mean(., na.rm = TRUE), - sd = ~sd(., na.rm = TRUE), - min = ~min(., na.rm = TRUE), - max = ~max(., na.rm = TRUE), - median = ~median(., na.rm = TRUE), - q25 = ~quantile(., 0.25, na.rm = TRUE), - q75 = ~quantile(., 0.75, na.rm = TRUE) - ))) %>% - pivot_longer(everything(), names_to = "variable", values_to = "value") %>% - separate(variable, into = c("var", "stat"), sep = "_(?=[^_]+$)") %>% - pivot_wider(names_from = stat, values_from = value) %>% - mutate(across(c(mean, sd, min, max, median, q25, q75), ~round(., 5))) - - return(desc_stats) -} - -# Get descriptives for EOHI variables -eohi_descriptives <- get_descriptives(df, eohi_vars) -cat("\n=== EOHI Variables Descriptives ===\n") -print(eohi_descriptives) - -# Get descriptives for calibration variables -cal_descriptives <- get_descriptives(df, cal_vars) -cat("\n=== Calibration Variables Descriptives ===\n") -print(cal_descriptives) - -# Check for bimodal or unusual distributions -hist(df$eohi_pref) -hist(df$cal_selfActual) - -# Look for extreme values -boxplot(df$eohi_pref) -boxplot(df$cal_selfActual) - -####==== PEARSON CORRELATIONS ==== - -# Compute correlation matrix with p-values -cor_results_pearson <- rcorr(as.matrix(df_complete), type = "pearson") - -# Extract correlation coefficients -cor_pearson <- cor_results_pearson$r - -# Extract p-values -p_matrix_pearson <- cor_results_pearson$P - -# Function to add significance stars -corstars <- function(cor_mat, p_mat) { - stars <- ifelse(p_mat < 0.001, "***", - ifelse(p_mat < 0.01, "**", - ifelse(p_mat < 0.05, "*", ""))) - - # Combine correlation values with stars, rounded to 5 decimal places - cor_with_stars <- matrix(paste0(format(round(cor_mat, 5), nsmall = 5), stars), - nrow = nrow(cor_mat)) - - # Set row and column names - rownames(cor_with_stars) <- rownames(cor_mat) - colnames(cor_with_stars) 
<- colnames(cor_mat) - - return(cor_with_stars) -} - -# Apply the function -cor_table_pearson <- corstars(cor_pearson, p_matrix_pearson) - -cat("\n=== PEARSON CORRELATIONS ===\n") -print(cor_table_pearson, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations <- cor_pearson[eohi_vars, cal_vars] -eohi_cal_pvalues <- p_matrix_pearson[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Pearson Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations)) { - for(j in 1:ncol(eohi_cal_correlations)) { - cor_val <- eohi_cal_correlations[i, j] - p_val <- eohi_cal_pvalues[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: r = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations)[i], - colnames(eohi_cal_correlations)[j], - cor_val, star, p_val)) - } -} - -####==== SPEARMAN CORRELATIONS ==== - -# Compute Spearman correlation matrix with p-values -cor_results_spearman <- rcorr(as.matrix(df_complete), type = "spearman") - -# Extract correlation coefficients -cor_spearman <- cor_results_spearman$r - -# Extract p-values -p_matrix_spearman <- cor_results_spearman$P - -# Apply the function -cor_table_spearman <- corstars(cor_spearman, p_matrix_spearman) - -cat("\n=== SPEARMAN CORRELATIONS ===\n") -print(cor_table_spearman, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations_spearman <- cor_spearman[eohi_vars, cal_vars] -eohi_cal_pvalues_spearman <- p_matrix_spearman[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Spearman Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations_spearman)) { - for(j in 1:ncol(eohi_cal_correlations_spearman)) { - cor_val <- eohi_cal_correlations_spearman[i, j] - p_val <- eohi_cal_pvalues_spearman[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: rho = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations_spearman)[i], - colnames(eohi_cal_correlations_spearman)[j], - cor_val, star, p_val)) - } -} - -####==== BOOTSTRAPPED 95% CONFIDENCE INTERVALS ==== - -# Function to compute correlation with bootstrap CI -bootstrap_correlation <- function(data, var1, var2, method = "pearson", R = 1000) { - # Remove missing values - complete_data <- data[complete.cases(data[, c(var1, var2)]), ] - - if(nrow(complete_data) < 3) { - return(data.frame( - correlation = NA, - ci_lower = NA, - ci_upper = NA, - n = nrow(complete_data) - )) - } - - # Bootstrap function - boot_fun <- function(data, indices) { - cor(data[indices, var1], data[indices, var2], method = method, use = "complete.obs") - } - - # Perform bootstrap - set.seed(123) # for reproducibility - boot_results <- boot(complete_data, boot_fun, R = R) - - # Calculate confidence interval - ci <- boot.ci(boot_results, type = "perc") - - return(data.frame( - correlation = boot_results$t0, - ci_lower = ci$perc[4], - ci_upper = ci$perc[5], - n = nrow(complete_data) - )) -} - -# Compute bootstrap CIs for all EOHI x Calibration correlations -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (PEARSON) ===\n") -bootstrap_results_pearson <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "pearson", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "pearson" - return(result) - }) 
%>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_pearson) - -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (SPEARMAN) ===\n") -bootstrap_results_spearman <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "spearman", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "spearman" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_spearman) - -####==== SUMMARY TABLE ==== - -# Create comprehensive summary table -summary_table <- bootstrap_results_pearson %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper, n) %>% - rename(pearson_r = correlation, pearson_ci_lower = ci_lower, pearson_ci_upper = ci_upper) %>% - left_join( - bootstrap_results_spearman %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper) %>% - rename(spearman_rho = correlation, spearman_ci_lower = ci_lower, spearman_ci_upper = ci_upper), - by = c("eohi_var", "cal_var") - ) %>% - # Add p-values - left_join( - expand.grid(eohi_var = eohi_vars, cal_var = cal_vars, stringsAsFactors = FALSE) %>% - pmap_dfr(function(eohi_var, cal_var) { - pearson_p <- p_matrix_pearson[eohi_var, cal_var] - spearman_p <- p_matrix_spearman[eohi_var, cal_var] - data.frame( - eohi_var = eohi_var, - cal_var = cal_var, - pearson_p = pearson_p, - spearman_p = spearman_p - ) - }), - by = c("eohi_var", "cal_var") - ) %>% - mutate( - pearson_p = round(pearson_p, 5), - spearman_p = round(spearman_p, 5) - ) - -cat("\n=== COMPREHENSIVE SUMMARY TABLE ===\n") -print(summary_table) - -# Save results to CSV -# write.csv(summary_table, "eohi_calibration_correlations_summary.csv", row.names = FALSE) -# cat("\nResults saved to: eohi_calibration_correlations_summary.csv\n") diff --git a/.history/eohi1/correlations - eohi x calibration_20250916093530.r b/.history/eohi1/correlations - eohi x calibration_20250916093530.r deleted file mode 100644 index 095291b..0000000 --- a/.history/eohi1/correlations - eohi x calibration_20250916093530.r +++ /dev/null @@ -1,288 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) -library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -# Load data -df1 <- read.csv("exp1.csv") - -# Remove columns with all NA values -df1 <- df1 %>% select(where(~ !all(is.na(.)))) - -# Select variables of interest -eohi_vars <- c("eohi_pref", "eohi_pers", "eohi_val", "eohi_life", "eohi_mean", - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean") -cal_vars <- c("cal_selfActual", "cal_global", "cal_15", "cal_35", "cal_55", "cal_75", "cal_true", "cal_false") - -# Create dataset with selected variables -df <- df1[, c(eohi_vars, cal_vars)] - -# Ensure all selected variables are numeric -df <- df %>% - mutate(across(everything(), as.numeric)) - -# Remove rows with any missing values for correlation analysis -df_complete <- df[complete.cases(df), ] - -cat("Sample size for correlation analysis:", nrow(df_complete), "\n") -cat("Total sample size:", nrow(df), "\n") - -str(df) -summary(df) -####==== DESCRIPTIVE STATISTICS ==== - -# Function to compute descriptive 
statistics -get_descriptives <- function(data, vars) { - desc_stats <- data %>% - select(all_of(vars)) %>% - summarise(across(everything(), list( - n = ~sum(!is.na(.)), - mean = ~mean(., na.rm = TRUE), - sd = ~sd(., na.rm = TRUE), - min = ~min(., na.rm = TRUE), - max = ~max(., na.rm = TRUE), - median = ~median(., na.rm = TRUE), - q25 = ~quantile(., 0.25, na.rm = TRUE), - q75 = ~quantile(., 0.75, na.rm = TRUE) - ))) %>% - pivot_longer(everything(), names_to = "variable", values_to = "value") %>% - separate(variable, into = c("var", "stat"), sep = "_(?=[^_]+$)") %>% - pivot_wider(names_from = stat, values_from = value) %>% - mutate(across(c(mean, sd, min, max, median, q25, q75), ~round(., 5))) - - return(desc_stats) -} - -# Get descriptives for EOHI variables -eohi_descriptives <- get_descriptives(df, eohi_vars) -cat("\n=== EOHI Variables Descriptives ===\n") -print(eohi_descriptives) - -# Get descriptives for calibration variables -cal_descriptives <- get_descriptives(df, cal_vars) -cat("\n=== Calibration Variables Descriptives ===\n") -print(cal_descriptives) - -# Check for bimodal or unusual distributions -hist(df$eohi_pref) -hist(df$cal_selfActual) - -# Look for extreme values -# boxplot(df$eohi_pref) -# boxplot(df$cal_selfActual) - -####==== PEARSON CORRELATIONS ==== - -# Compute correlation matrix with p-values -cor_results_pearson <- rcorr(as.matrix(df_complete), type = "pearson") - -# Extract correlation coefficients -cor_pearson <- cor_results_pearson$r - -# Extract p-values -p_matrix_pearson <- cor_results_pearson$P - -# Function to add significance stars -corstars <- function(cor_mat, p_mat) { - stars <- ifelse(p_mat < 0.001, "***", - ifelse(p_mat < 0.01, "**", - ifelse(p_mat < 0.05, "*", ""))) - - # Combine correlation values with stars, rounded to 5 decimal places - cor_with_stars <- matrix(paste0(format(round(cor_mat, 5), nsmall = 5), stars), - nrow = nrow(cor_mat)) - - # Set row and column names - rownames(cor_with_stars) <- rownames(cor_mat) - colnames(cor_with_stars) <- colnames(cor_mat) - - return(cor_with_stars) -} - -# Apply the function -cor_table_pearson <- corstars(cor_pearson, p_matrix_pearson) - -cat("\n=== PEARSON CORRELATIONS ===\n") -print(cor_table_pearson, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations <- cor_pearson[eohi_vars, cal_vars] -eohi_cal_pvalues <- p_matrix_pearson[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Pearson Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations)) { - for(j in 1:ncol(eohi_cal_correlations)) { - cor_val <- eohi_cal_correlations[i, j] - p_val <- eohi_cal_pvalues[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: r = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations)[i], - colnames(eohi_cal_correlations)[j], - cor_val, star, p_val)) - } -} - -####==== SPEARMAN CORRELATIONS ==== - -# Compute Spearman correlation matrix with p-values -cor_results_spearman <- rcorr(as.matrix(df_complete), type = "spearman") - -# Extract correlation coefficients -cor_spearman <- cor_results_spearman$r - -# Extract p-values -p_matrix_spearman <- cor_results_spearman$P - -# Apply the function -cor_table_spearman <- corstars(cor_spearman, p_matrix_spearman) - -cat("\n=== SPEARMAN CORRELATIONS ===\n") -print(cor_table_spearman, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations_spearman <- cor_spearman[eohi_vars, cal_vars] 
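[Editor's note — every snapshot deleted in this patch implements the same bootstrap percentile CI for a correlation via boot()/boot.ci(). For reference, here is a minimal, self-contained R sketch of that technique; the data are synthetic and the names (toy, cor_stat, b, ci) are illustrative only, not part of any deleted file. One nuance worth recording: the snapshots index ci$perc[4] and ci$perc[5], which works only because R's $ operator partially matches "perc" against the "percent" component of the "bootci" object; the exact component name is ci$percent.]

library(boot)

set.seed(123)                              # reproducible resamples
toy <- data.frame(x = rnorm(100))          # synthetic data, illustrative only
toy$y <- 0.5 * toy$x + rnorm(100)

# Statistic for boot(): correlation computed on the resampled rows
cor_stat <- function(d, idx) cor(d$x[idx], d$y[idx], method = "pearson")

b  <- boot(toy, cor_stat, R = 1000)        # 1000 bootstrap replicates
ci <- boot.ci(b, type = "perc")            # percentile 95% CI

# ci$percent is a 1 x 5 matrix: conf level, two order-statistic indices,
# then the lower and upper CI endpoints, hence the [4] and [5] indexing.
c(r = b$t0, lower = ci$percent[4], upper = ci$percent[5])

[End of editor's note; the deleted snapshot resumes below.]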
-eohi_cal_pvalues_spearman <- p_matrix_spearman[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Spearman Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations_spearman)) { - for(j in 1:ncol(eohi_cal_correlations_spearman)) { - cor_val <- eohi_cal_correlations_spearman[i, j] - p_val <- eohi_cal_pvalues_spearman[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: rho = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations_spearman)[i], - colnames(eohi_cal_correlations_spearman)[j], - cor_val, star, p_val)) - } -} - -####==== BOOTSTRAPPED 95% CONFIDENCE INTERVALS ==== - -# Function to compute correlation with bootstrap CI -bootstrap_correlation <- function(data, var1, var2, method = "pearson", R = 1000) { - # Remove missing values - complete_data <- data[complete.cases(data[, c(var1, var2)]), ] - - if(nrow(complete_data) < 3) { - return(data.frame( - correlation = NA, - ci_lower = NA, - ci_upper = NA, - n = nrow(complete_data) - )) - } - - # Bootstrap function - boot_fun <- function(data, indices) { - cor(data[indices, var1], data[indices, var2], method = method, use = "complete.obs") - } - - # Perform bootstrap - set.seed(123) # for reproducibility - boot_results <- boot(complete_data, boot_fun, R = R) - - # Calculate confidence interval - ci <- boot.ci(boot_results, type = "perc") - - return(data.frame( - correlation = boot_results$t0, - ci_lower = ci$perc[4], - ci_upper = ci$perc[5], - n = nrow(complete_data) - )) -} - -# Compute bootstrap CIs for all EOHI x Calibration correlations -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (PEARSON) ===\n") -bootstrap_results_pearson <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "pearson", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "pearson" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_pearson) - -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (SPEARMAN) ===\n") -bootstrap_results_spearman <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "spearman", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "spearman" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_spearman) - -####==== SUMMARY TABLE ==== - -# Create comprehensive summary table -summary_table <- bootstrap_results_pearson %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper, n) %>% - rename(pearson_r = correlation, pearson_ci_lower = ci_lower, pearson_ci_upper = ci_upper) %>% - left_join( - bootstrap_results_spearman %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper) %>% - rename(spearman_rho = correlation, spearman_ci_lower = ci_lower, spearman_ci_upper = ci_upper), - by = c("eohi_var", "cal_var") - ) %>% - # Add p-values - left_join( - expand.grid(eohi_var = eohi_vars, cal_var = cal_vars, stringsAsFactors = FALSE) %>% - pmap_dfr(function(eohi_var, cal_var) { - pearson_p <- p_matrix_pearson[eohi_var, cal_var] - spearman_p <- 
p_matrix_spearman[eohi_var, cal_var] - data.frame( - eohi_var = eohi_var, - cal_var = cal_var, - pearson_p = pearson_p, - spearman_p = spearman_p - ) - }), - by = c("eohi_var", "cal_var") - ) %>% - mutate( - pearson_p = round(pearson_p, 5), - spearman_p = round(spearman_p, 5) - ) - -cat("\n=== COMPREHENSIVE SUMMARY TABLE ===\n") -print(summary_table) - -# Save results to CSV -# write.csv(summary_table, "eohi_calibration_correlations_summary.csv", row.names = FALSE) -# cat("\nResults saved to: eohi_calibration_correlations_summary.csv\n") diff --git a/.history/eohi1/correlations - eohi x calibration_20250916095303.r b/.history/eohi1/correlations - eohi x calibration_20250916095303.r deleted file mode 100644 index 095291b..0000000 --- a/.history/eohi1/correlations - eohi x calibration_20250916095303.r +++ /dev/null @@ -1,288 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) -library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -# Load data -df1 <- read.csv("exp1.csv") - -# Remove columns with all NA values -df1 <- df1 %>% select(where(~ !all(is.na(.)))) - -# Select variables of interest -eohi_vars <- c("eohi_pref", "eohi_pers", "eohi_val", "eohi_life", "eohi_mean", - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean") -cal_vars <- c("cal_selfActual", "cal_global", "cal_15", "cal_35", "cal_55", "cal_75", "cal_true", "cal_false") - -# Create dataset with selected variables -df <- df1[, c(eohi_vars, cal_vars)] - -# Ensure all selected variables are numeric -df <- df %>% - mutate(across(everything(), as.numeric)) - -# Remove rows with any missing values for correlation analysis -df_complete <- df[complete.cases(df), ] - -cat("Sample size for correlation analysis:", nrow(df_complete), "\n") -cat("Total sample size:", nrow(df), "\n") - -str(df) -summary(df) -####==== DESCRIPTIVE STATISTICS ==== - -# Function to compute descriptive statistics -get_descriptives <- function(data, vars) { - desc_stats <- data %>% - select(all_of(vars)) %>% - summarise(across(everything(), list( - n = ~sum(!is.na(.)), - mean = ~mean(., na.rm = TRUE), - sd = ~sd(., na.rm = TRUE), - min = ~min(., na.rm = TRUE), - max = ~max(., na.rm = TRUE), - median = ~median(., na.rm = TRUE), - q25 = ~quantile(., 0.25, na.rm = TRUE), - q75 = ~quantile(., 0.75, na.rm = TRUE) - ))) %>% - pivot_longer(everything(), names_to = "variable", values_to = "value") %>% - separate(variable, into = c("var", "stat"), sep = "_(?=[^_]+$)") %>% - pivot_wider(names_from = stat, values_from = value) %>% - mutate(across(c(mean, sd, min, max, median, q25, q75), ~round(., 5))) - - return(desc_stats) -} - -# Get descriptives for EOHI variables -eohi_descriptives <- get_descriptives(df, eohi_vars) -cat("\n=== EOHI Variables Descriptives ===\n") -print(eohi_descriptives) - -# Get descriptives for calibration variables -cal_descriptives <- get_descriptives(df, cal_vars) -cat("\n=== Calibration Variables Descriptives ===\n") -print(cal_descriptives) - -# Check for bimodal or unusual distributions -hist(df$eohi_pref) -hist(df$cal_selfActual) - -# Look for extreme values -# boxplot(df$eohi_pref) -# boxplot(df$cal_selfActual) - -####==== PEARSON CORRELATIONS ==== - -# Compute correlation matrix with p-values -cor_results_pearson <- rcorr(as.matrix(df_complete), type = "pearson") - -# Extract correlation coefficients -cor_pearson <- cor_results_pearson$r - -# Extract p-values 
-p_matrix_pearson <- cor_results_pearson$P - -# Function to add significance stars -corstars <- function(cor_mat, p_mat) { - stars <- ifelse(p_mat < 0.001, "***", - ifelse(p_mat < 0.01, "**", - ifelse(p_mat < 0.05, "*", ""))) - - # Combine correlation values with stars, rounded to 5 decimal places - cor_with_stars <- matrix(paste0(format(round(cor_mat, 5), nsmall = 5), stars), - nrow = nrow(cor_mat)) - - # Set row and column names - rownames(cor_with_stars) <- rownames(cor_mat) - colnames(cor_with_stars) <- colnames(cor_mat) - - return(cor_with_stars) -} - -# Apply the function -cor_table_pearson <- corstars(cor_pearson, p_matrix_pearson) - -cat("\n=== PEARSON CORRELATIONS ===\n") -print(cor_table_pearson, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations <- cor_pearson[eohi_vars, cal_vars] -eohi_cal_pvalues <- p_matrix_pearson[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Pearson Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations)) { - for(j in 1:ncol(eohi_cal_correlations)) { - cor_val <- eohi_cal_correlations[i, j] - p_val <- eohi_cal_pvalues[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: r = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations)[i], - colnames(eohi_cal_correlations)[j], - cor_val, star, p_val)) - } -} - -####==== SPEARMAN CORRELATIONS ==== - -# Compute Spearman correlation matrix with p-values -cor_results_spearman <- rcorr(as.matrix(df_complete), type = "spearman") - -# Extract correlation coefficients -cor_spearman <- cor_results_spearman$r - -# Extract p-values -p_matrix_spearman <- cor_results_spearman$P - -# Apply the function -cor_table_spearman <- corstars(cor_spearman, p_matrix_spearman) - -cat("\n=== SPEARMAN CORRELATIONS ===\n") -print(cor_table_spearman, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations_spearman <- cor_spearman[eohi_vars, cal_vars] -eohi_cal_pvalues_spearman <- p_matrix_spearman[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Spearman Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations_spearman)) { - for(j in 1:ncol(eohi_cal_correlations_spearman)) { - cor_val <- eohi_cal_correlations_spearman[i, j] - p_val <- eohi_cal_pvalues_spearman[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: rho = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations_spearman)[i], - colnames(eohi_cal_correlations_spearman)[j], - cor_val, star, p_val)) - } -} - -####==== BOOTSTRAPPED 95% CONFIDENCE INTERVALS ==== - -# Function to compute correlation with bootstrap CI -bootstrap_correlation <- function(data, var1, var2, method = "pearson", R = 1000) { - # Remove missing values - complete_data <- data[complete.cases(data[, c(var1, var2)]), ] - - if(nrow(complete_data) < 3) { - return(data.frame( - correlation = NA, - ci_lower = NA, - ci_upper = NA, - n = nrow(complete_data) - )) - } - - # Bootstrap function - boot_fun <- function(data, indices) { - cor(data[indices, var1], data[indices, var2], method = method, use = "complete.obs") - } - - # Perform bootstrap - set.seed(123) # for reproducibility - boot_results <- boot(complete_data, boot_fun, R = R) - - # Calculate confidence interval - ci <- boot.ci(boot_results, type = "perc") - - return(data.frame( - correlation = boot_results$t0, - ci_lower = ci$perc[4], - ci_upper = ci$perc[5], - n = 
nrow(complete_data) - )) -} - -# Compute bootstrap CIs for all EOHI x Calibration correlations -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (PEARSON) ===\n") -bootstrap_results_pearson <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "pearson", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "pearson" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_pearson) - -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (SPEARMAN) ===\n") -bootstrap_results_spearman <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "spearman", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "spearman" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_spearman) - -####==== SUMMARY TABLE ==== - -# Create comprehensive summary table -summary_table <- bootstrap_results_pearson %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper, n) %>% - rename(pearson_r = correlation, pearson_ci_lower = ci_lower, pearson_ci_upper = ci_upper) %>% - left_join( - bootstrap_results_spearman %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper) %>% - rename(spearman_rho = correlation, spearman_ci_lower = ci_lower, spearman_ci_upper = ci_upper), - by = c("eohi_var", "cal_var") - ) %>% - # Add p-values - left_join( - expand.grid(eohi_var = eohi_vars, cal_var = cal_vars, stringsAsFactors = FALSE) %>% - pmap_dfr(function(eohi_var, cal_var) { - pearson_p <- p_matrix_pearson[eohi_var, cal_var] - spearman_p <- p_matrix_spearman[eohi_var, cal_var] - data.frame( - eohi_var = eohi_var, - cal_var = cal_var, - pearson_p = pearson_p, - spearman_p = spearman_p - ) - }), - by = c("eohi_var", "cal_var") - ) %>% - mutate( - pearson_p = round(pearson_p, 5), - spearman_p = round(spearman_p, 5) - ) - -cat("\n=== COMPREHENSIVE SUMMARY TABLE ===\n") -print(summary_table) - -# Save results to CSV -# write.csv(summary_table, "eohi_calibration_correlations_summary.csv", row.names = FALSE) -# cat("\nResults saved to: eohi_calibration_correlations_summary.csv\n") diff --git a/.history/eohi1/correlations - eohi x calibration_20250916112614.r b/.history/eohi1/correlations - eohi x calibration_20250916112614.r deleted file mode 100644 index 1cf4f42..0000000 --- a/.history/eohi1/correlations - eohi x calibration_20250916112614.r +++ /dev/null @@ -1,303 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) -library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -# Load data -df1 <- read.csv("exp1.csv") - -# Remove columns with all NA values -df1 <- df1 %>% select(where(~ !all(is.na(.)))) - -# Select variables of interest -eohi_vars <- c("eohi_pref", "eohi_pers", "eohi_val", "eohi_life", "eohi_mean", - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean") -cal_vars <- c("cal_selfActual", "cal_global", "cal_15", "cal_35", "cal_55", "cal_75", "cal_true", 
"cal_false") - -# Create dataset with selected variables -df <- df1[, c(eohi_vars, cal_vars)] - -# Ensure all selected variables are numeric -df <- df %>% - mutate(across(everything(), as.numeric)) - -# Remove rows with any missing values for correlation analysis -df_complete <- df[complete.cases(df), ] - -cat("Sample size for correlation analysis:", nrow(df_complete), "\n") -cat("Total sample size:", nrow(df), "\n") - -str(df) -summary(df) -####==== DESCRIPTIVE STATISTICS ==== - -# Function to compute descriptive statistics -get_descriptives <- function(data, vars) { - desc_stats <- data %>% - select(all_of(vars)) %>% - summarise(across(everything(), list( - n = ~sum(!is.na(.)), - mean = ~mean(., na.rm = TRUE), - sd = ~sd(., na.rm = TRUE), - min = ~min(., na.rm = TRUE), - max = ~max(., na.rm = TRUE), - median = ~median(., na.rm = TRUE), - q25 = ~quantile(., 0.25, na.rm = TRUE), - q75 = ~quantile(., 0.75, na.rm = TRUE) - ))) %>% - pivot_longer(everything(), names_to = "variable", values_to = "value") %>% - separate(variable, into = c("var", "stat"), sep = "_(?=[^_]+$)") %>% - pivot_wider(names_from = stat, values_from = value) %>% - mutate(across(c(mean, sd, min, max, median, q25, q75), ~round(., 5))) - - return(desc_stats) -} - -# Get descriptives for EOHI variables -eohi_descriptives <- get_descriptives(df, eohi_vars) -cat("\n=== EOHI Variables Descriptives ===\n") -print(eohi_descriptives) - -# Get descriptives for calibration variables -cal_descriptives <- get_descriptives(df, cal_vars) -cat("\n=== Calibration Variables Descriptives ===\n") -print(cal_descriptives) - -# Check for bimodal or unusual distributions -hist(df$eohi_pref) -hist(df$cal_selfActual) - -# Look for extreme values -# boxplot(df$eohi_pref) -# boxplot(df$cal_selfActual) - -# Test normality for each variable -library(nortest) - -# Test EOHI variables -for(var in eohi_vars) { - cat("\n", var, "normality test:\n") - print(shapiro.test(df_complete[[var]])) -} - -# Test calibration variables -for(var in cal_vars) { - cat("\n", var, "normality test:\n") - print(shapiro.test(df_complete[[var]])) -} - -####==== PEARSON CORRELATIONS ==== - -# Compute correlation matrix with p-values -cor_results_pearson <- rcorr(as.matrix(df_complete), type = "pearson") - -# Extract correlation coefficients -cor_pearson <- cor_results_pearson$r - -# Extract p-values -p_matrix_pearson <- cor_results_pearson$P - -# Function to add significance stars -corstars <- function(cor_mat, p_mat) { - stars <- ifelse(p_mat < 0.001, "***", - ifelse(p_mat < 0.01, "**", - ifelse(p_mat < 0.05, "*", ""))) - - # Combine correlation values with stars, rounded to 5 decimal places - cor_with_stars <- matrix(paste0(format(round(cor_mat, 5), nsmall = 5), stars), - nrow = nrow(cor_mat)) - - # Set row and column names - rownames(cor_with_stars) <- rownames(cor_mat) - colnames(cor_with_stars) <- colnames(cor_mat) - - return(cor_with_stars) -} - -# Apply the function -cor_table_pearson <- corstars(cor_pearson, p_matrix_pearson) - -cat("\n=== PEARSON CORRELATIONS ===\n") -print(cor_table_pearson, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations <- cor_pearson[eohi_vars, cal_vars] -eohi_cal_pvalues <- p_matrix_pearson[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Pearson Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations)) { - for(j in 1:ncol(eohi_cal_correlations)) { - cor_val <- eohi_cal_correlations[i, j] - p_val <- eohi_cal_pvalues[i, j] - star <- ifelse(p_val < 0.001, "***", - 
ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: r = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations)[i], - colnames(eohi_cal_correlations)[j], - cor_val, star, p_val)) - } -} - -####==== SPEARMAN CORRELATIONS ==== - -# Compute Spearman correlation matrix with p-values -cor_results_spearman <- rcorr(as.matrix(df_complete), type = "spearman") - -# Extract correlation coefficients -cor_spearman <- cor_results_spearman$r - -# Extract p-values -p_matrix_spearman <- cor_results_spearman$P - -# Apply the function -cor_table_spearman <- corstars(cor_spearman, p_matrix_spearman) - -cat("\n=== SPEARMAN CORRELATIONS ===\n") -print(cor_table_spearman, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations_spearman <- cor_spearman[eohi_vars, cal_vars] -eohi_cal_pvalues_spearman <- p_matrix_spearman[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Spearman Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations_spearman)) { - for(j in 1:ncol(eohi_cal_correlations_spearman)) { - cor_val <- eohi_cal_correlations_spearman[i, j] - p_val <- eohi_cal_pvalues_spearman[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: rho = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations_spearman)[i], - colnames(eohi_cal_correlations_spearman)[j], - cor_val, star, p_val)) - } -} - -####==== BOOTSTRAPPED 95% CONFIDENCE INTERVALS ==== - -# Function to compute correlation with bootstrap CI -bootstrap_correlation <- function(data, var1, var2, method = "pearson", R = 1000) { - # Remove missing values - complete_data <- data[complete.cases(data[, c(var1, var2)]), ] - - if(nrow(complete_data) < 3) { - return(data.frame( - correlation = NA, - ci_lower = NA, - ci_upper = NA, - n = nrow(complete_data) - )) - } - - # Bootstrap function - boot_fun <- function(data, indices) { - cor(data[indices, var1], data[indices, var2], method = method, use = "complete.obs") - } - - # Perform bootstrap - set.seed(123) # for reproducibility - boot_results <- boot(complete_data, boot_fun, R = R) - - # Calculate confidence interval - ci <- boot.ci(boot_results, type = "perc") - - return(data.frame( - correlation = boot_results$t0, - ci_lower = ci$perc[4], - ci_upper = ci$perc[5], - n = nrow(complete_data) - )) -} - -# Compute bootstrap CIs for all EOHI x Calibration correlations -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (PEARSON) ===\n") -bootstrap_results_pearson <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "pearson", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "pearson" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_pearson) - -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (SPEARMAN) ===\n") -bootstrap_results_spearman <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "spearman", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "spearman" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), 
- ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_spearman) - -####==== SUMMARY TABLE ==== - -# Create comprehensive summary table -summary_table <- bootstrap_results_pearson %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper, n) %>% - rename(pearson_r = correlation, pearson_ci_lower = ci_lower, pearson_ci_upper = ci_upper) %>% - left_join( - bootstrap_results_spearman %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper) %>% - rename(spearman_rho = correlation, spearman_ci_lower = ci_lower, spearman_ci_upper = ci_upper), - by = c("eohi_var", "cal_var") - ) %>% - # Add p-values - left_join( - expand.grid(eohi_var = eohi_vars, cal_var = cal_vars, stringsAsFactors = FALSE) %>% - pmap_dfr(function(eohi_var, cal_var) { - pearson_p <- p_matrix_pearson[eohi_var, cal_var] - spearman_p <- p_matrix_spearman[eohi_var, cal_var] - data.frame( - eohi_var = eohi_var, - cal_var = cal_var, - pearson_p = pearson_p, - spearman_p = spearman_p - ) - }), - by = c("eohi_var", "cal_var") - ) %>% - mutate( - pearson_p = round(pearson_p, 5), - spearman_p = round(spearman_p, 5) - ) - -cat("\n=== COMPREHENSIVE SUMMARY TABLE ===\n") -print(summary_table) - -# Save results to CSV -# write.csv(summary_table, "eohi_calibration_correlations_summary.csv", row.names = FALSE) -# cat("\nResults saved to: eohi_calibration_correlations_summary.csv\n") diff --git a/.history/eohi1/correlations - eohi x calibration_20250916112923.r b/.history/eohi1/correlations - eohi x calibration_20250916112923.r deleted file mode 100644 index 1cf4f42..0000000 --- a/.history/eohi1/correlations - eohi x calibration_20250916112923.r +++ /dev/null @@ -1,303 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) -library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -# Load data -df1 <- read.csv("exp1.csv") - -# Remove columns with all NA values -df1 <- df1 %>% select(where(~ !all(is.na(.)))) - -# Select variables of interest -eohi_vars <- c("eohi_pref", "eohi_pers", "eohi_val", "eohi_life", "eohi_mean", - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean") -cal_vars <- c("cal_selfActual", "cal_global", "cal_15", "cal_35", "cal_55", "cal_75", "cal_true", "cal_false") - -# Create dataset with selected variables -df <- df1[, c(eohi_vars, cal_vars)] - -# Ensure all selected variables are numeric -df <- df %>% - mutate(across(everything(), as.numeric)) - -# Remove rows with any missing values for correlation analysis -df_complete <- df[complete.cases(df), ] - -cat("Sample size for correlation analysis:", nrow(df_complete), "\n") -cat("Total sample size:", nrow(df), "\n") - -str(df) -summary(df) -####==== DESCRIPTIVE STATISTICS ==== - -# Function to compute descriptive statistics -get_descriptives <- function(data, vars) { - desc_stats <- data %>% - select(all_of(vars)) %>% - summarise(across(everything(), list( - n = ~sum(!is.na(.)), - mean = ~mean(., na.rm = TRUE), - sd = ~sd(., na.rm = TRUE), - min = ~min(., na.rm = TRUE), - max = ~max(., na.rm = TRUE), - median = ~median(., na.rm = TRUE), - q25 = ~quantile(., 0.25, na.rm = TRUE), - q75 = ~quantile(., 0.75, na.rm = TRUE) - ))) %>% - pivot_longer(everything(), names_to = "variable", values_to = "value") %>% - separate(variable, into = c("var", "stat"), sep = "_(?=[^_]+$)") %>% - pivot_wider(names_from = stat, values_from = value) %>% - mutate(across(c(mean, sd, min, max, median, q25, q75), 
~round(., 5))) - - return(desc_stats) -} - -# Get descriptives for EOHI variables -eohi_descriptives <- get_descriptives(df, eohi_vars) -cat("\n=== EOHI Variables Descriptives ===\n") -print(eohi_descriptives) - -# Get descriptives for calibration variables -cal_descriptives <- get_descriptives(df, cal_vars) -cat("\n=== Calibration Variables Descriptives ===\n") -print(cal_descriptives) - -# Check for bimodal or unusual distributions -hist(df$eohi_pref) -hist(df$cal_selfActual) - -# Look for extreme values -# boxplot(df$eohi_pref) -# boxplot(df$cal_selfActual) - -# Test normality for each variable -library(nortest) - -# Test EOHI variables -for(var in eohi_vars) { - cat("\n", var, "normality test:\n") - print(shapiro.test(df_complete[[var]])) -} - -# Test calibration variables -for(var in cal_vars) { - cat("\n", var, "normality test:\n") - print(shapiro.test(df_complete[[var]])) -} - -####==== PEARSON CORRELATIONS ==== - -# Compute correlation matrix with p-values -cor_results_pearson <- rcorr(as.matrix(df_complete), type = "pearson") - -# Extract correlation coefficients -cor_pearson <- cor_results_pearson$r - -# Extract p-values -p_matrix_pearson <- cor_results_pearson$P - -# Function to add significance stars -corstars <- function(cor_mat, p_mat) { - stars <- ifelse(p_mat < 0.001, "***", - ifelse(p_mat < 0.01, "**", - ifelse(p_mat < 0.05, "*", ""))) - - # Combine correlation values with stars, rounded to 5 decimal places - cor_with_stars <- matrix(paste0(format(round(cor_mat, 5), nsmall = 5), stars), - nrow = nrow(cor_mat)) - - # Set row and column names - rownames(cor_with_stars) <- rownames(cor_mat) - colnames(cor_with_stars) <- colnames(cor_mat) - - return(cor_with_stars) -} - -# Apply the function -cor_table_pearson <- corstars(cor_pearson, p_matrix_pearson) - -cat("\n=== PEARSON CORRELATIONS ===\n") -print(cor_table_pearson, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations <- cor_pearson[eohi_vars, cal_vars] -eohi_cal_pvalues <- p_matrix_pearson[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Pearson Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations)) { - for(j in 1:ncol(eohi_cal_correlations)) { - cor_val <- eohi_cal_correlations[i, j] - p_val <- eohi_cal_pvalues[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: r = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations)[i], - colnames(eohi_cal_correlations)[j], - cor_val, star, p_val)) - } -} - -####==== SPEARMAN CORRELATIONS ==== - -# Compute Spearman correlation matrix with p-values -cor_results_spearman <- rcorr(as.matrix(df_complete), type = "spearman") - -# Extract correlation coefficients -cor_spearman <- cor_results_spearman$r - -# Extract p-values -p_matrix_spearman <- cor_results_spearman$P - -# Apply the function -cor_table_spearman <- corstars(cor_spearman, p_matrix_spearman) - -cat("\n=== SPEARMAN CORRELATIONS ===\n") -print(cor_table_spearman, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations_spearman <- cor_spearman[eohi_vars, cal_vars] -eohi_cal_pvalues_spearman <- p_matrix_spearman[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Spearman Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations_spearman)) { - for(j in 1:ncol(eohi_cal_correlations_spearman)) { - cor_val <- eohi_cal_correlations_spearman[i, j] - p_val <- eohi_cal_pvalues_spearman[i, j] - star <- ifelse(p_val < 0.001, "***", 
- ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: rho = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations_spearman)[i], - colnames(eohi_cal_correlations_spearman)[j], - cor_val, star, p_val)) - } -} - -####==== BOOTSTRAPPED 95% CONFIDENCE INTERVALS ==== - -# Function to compute correlation with bootstrap CI -bootstrap_correlation <- function(data, var1, var2, method = "pearson", R = 1000) { - # Remove missing values - complete_data <- data[complete.cases(data[, c(var1, var2)]), ] - - if(nrow(complete_data) < 3) { - return(data.frame( - correlation = NA, - ci_lower = NA, - ci_upper = NA, - n = nrow(complete_data) - )) - } - - # Bootstrap function - boot_fun <- function(data, indices) { - cor(data[indices, var1], data[indices, var2], method = method, use = "complete.obs") - } - - # Perform bootstrap - set.seed(123) # for reproducibility - boot_results <- boot(complete_data, boot_fun, R = R) - - # Calculate confidence interval - ci <- boot.ci(boot_results, type = "perc") - - return(data.frame( - correlation = boot_results$t0, - ci_lower = ci$perc[4], - ci_upper = ci$perc[5], - n = nrow(complete_data) - )) -} - -# Compute bootstrap CIs for all EOHI x Calibration correlations -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (PEARSON) ===\n") -bootstrap_results_pearson <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "pearson", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "pearson" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_pearson) - -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (SPEARMAN) ===\n") -bootstrap_results_spearman <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "spearman", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "spearman" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_spearman) - -####==== SUMMARY TABLE ==== - -# Create comprehensive summary table -summary_table <- bootstrap_results_pearson %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper, n) %>% - rename(pearson_r = correlation, pearson_ci_lower = ci_lower, pearson_ci_upper = ci_upper) %>% - left_join( - bootstrap_results_spearman %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper) %>% - rename(spearman_rho = correlation, spearman_ci_lower = ci_lower, spearman_ci_upper = ci_upper), - by = c("eohi_var", "cal_var") - ) %>% - # Add p-values - left_join( - expand.grid(eohi_var = eohi_vars, cal_var = cal_vars, stringsAsFactors = FALSE) %>% - pmap_dfr(function(eohi_var, cal_var) { - pearson_p <- p_matrix_pearson[eohi_var, cal_var] - spearman_p <- p_matrix_spearman[eohi_var, cal_var] - data.frame( - eohi_var = eohi_var, - cal_var = cal_var, - pearson_p = pearson_p, - spearman_p = spearman_p - ) - }), - by = c("eohi_var", "cal_var") - ) %>% - mutate( - pearson_p = round(pearson_p, 5), - spearman_p = round(spearman_p, 5) - ) - -cat("\n=== COMPREHENSIVE SUMMARY TABLE ===\n") -print(summary_table) - -# Save results to CSV -# 
write.csv(summary_table, "eohi_calibration_correlations_summary.csv", row.names = FALSE) -# cat("\nResults saved to: eohi_calibration_correlations_summary.csv\n") diff --git a/.history/eohi1/correlations - eohi x calibration_20250916113008.r b/.history/eohi1/correlations - eohi x calibration_20250916113008.r deleted file mode 100644 index a99d62f..0000000 --- a/.history/eohi1/correlations - eohi x calibration_20250916113008.r +++ /dev/null @@ -1,303 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) -library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -# Load data -df1 <- read.csv("exp1.csv") - -# Remove columns with all NA values -df1 <- df1 %>% select(where(~ !all(is.na(.)))) - -# Select variables of interest -eohi_vars <- c("eohi_pref", "eohi_pers", "eohi_val", "eohi_life", "eohi_mean", - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean") -cal_vars <- c("cal_selfActual", "cal_global", "cal_15", "cal_35", "cal_55", "cal_75", "cal_true", "cal_false") - -# Create dataset with selected variables -df <- df1[, c(eohi_vars, cal_vars)] - -# Ensure all selected variables are numeric -df <- df %>% - mutate(across(everything(), as.numeric)) - -# Remove rows with any missing values for correlation analysis -df_complete <- df[complete.cases(df), ] - -cat("Sample size for correlation analysis:", nrow(df_complete), "\n") -cat("Total sample size:", nrow(df), "\n") - -str(df) -summary(df) -####==== DESCRIPTIVE STATISTICS ==== - -# Function to compute descriptive statistics -get_descriptives <- function(data, vars) { - desc_stats <- data %>% - select(all_of(vars)) %>% - summarise(across(everything(), list( - n = ~sum(!is.na(.)), - mean = ~mean(., na.rm = TRUE), - sd = ~sd(., na.rm = TRUE), - min = ~min(., na.rm = TRUE), - max = ~max(., na.rm = TRUE), - median = ~median(., na.rm = TRUE), - q25 = ~quantile(., 0.25, na.rm = TRUE), - q75 = ~quantile(., 0.75, na.rm = TRUE) - ))) %>% - pivot_longer(everything(), names_to = "variable", values_to = "value") %>% - separate(variable, into = c("var", "stat"), sep = "_(?=[^_]+$)") %>% - pivot_wider(names_from = stat, values_from = value) %>% - mutate(across(c(mean, sd, min, max, median, q25, q75), ~round(., 5))) - - return(desc_stats) -} - -# Get descriptives for EOHI variables -eohi_descriptives <- get_descriptives(df, eohi_vars) -cat("\n=== EOHI Variables Descriptives ===\n") -print(eohi_descriptives) - -# Get descriptives for calibration variables -cal_descriptives <- get_descriptives(df, cal_vars) -cat("\n=== Calibration Variables Descriptives ===\n") -print(cal_descriptives) - -# Check for bimodal or unusual distributions -hist(df$eohi_pref) -hist(df$cal_selfActual) - -# Look for extreme values -# boxplot(df$eohi_pref) -# boxplot(df$cal_selfActual) - -# Test normality for each variable - probably unnecessary -library(nortest) - -# Test EOHI variables -for(var in eohi_vars) { - cat("\n", var, "normality test:\n") - print(shapiro.test(df_complete[[var]])) -} - -# Test calibration variables -for(var in cal_vars) { - cat("\n", var, "normality test:\n") - print(shapiro.test(df_complete[[var]])) -} - -####==== PEARSON CORRELATIONS ==== - -# Compute correlation matrix with p-values -cor_results_pearson <- rcorr(as.matrix(df_complete), type = "pearson") - -# Extract correlation coefficients -cor_pearson <- cor_results_pearson$r - -# Extract p-values -p_matrix_pearson <- 
cor_results_pearson$P - -# Function to add significance stars -corstars <- function(cor_mat, p_mat) { - stars <- ifelse(p_mat < 0.001, "***", - ifelse(p_mat < 0.01, "**", - ifelse(p_mat < 0.05, "*", ""))) - - # Combine correlation values with stars, rounded to 5 decimal places - cor_with_stars <- matrix(paste0(format(round(cor_mat, 5), nsmall = 5), stars), - nrow = nrow(cor_mat)) - - # Set row and column names - rownames(cor_with_stars) <- rownames(cor_mat) - colnames(cor_with_stars) <- colnames(cor_mat) - - return(cor_with_stars) -} - -# Apply the function -cor_table_pearson <- corstars(cor_pearson, p_matrix_pearson) - -cat("\n=== PEARSON CORRELATIONS ===\n") -print(cor_table_pearson, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations <- cor_pearson[eohi_vars, cal_vars] -eohi_cal_pvalues <- p_matrix_pearson[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Pearson Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations)) { - for(j in 1:ncol(eohi_cal_correlations)) { - cor_val <- eohi_cal_correlations[i, j] - p_val <- eohi_cal_pvalues[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: r = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations)[i], - colnames(eohi_cal_correlations)[j], - cor_val, star, p_val)) - } -} - -####==== SPEARMAN CORRELATIONS ==== - -# Compute Spearman correlation matrix with p-values -cor_results_spearman <- rcorr(as.matrix(df_complete), type = "spearman") - -# Extract correlation coefficients -cor_spearman <- cor_results_spearman$r - -# Extract p-values -p_matrix_spearman <- cor_results_spearman$P - -# Apply the function -cor_table_spearman <- corstars(cor_spearman, p_matrix_spearman) - -cat("\n=== SPEARMAN CORRELATIONS ===\n") -print(cor_table_spearman, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations_spearman <- cor_spearman[eohi_vars, cal_vars] -eohi_cal_pvalues_spearman <- p_matrix_spearman[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Spearman Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations_spearman)) { - for(j in 1:ncol(eohi_cal_correlations_spearman)) { - cor_val <- eohi_cal_correlations_spearman[i, j] - p_val <- eohi_cal_pvalues_spearman[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: rho = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations_spearman)[i], - colnames(eohi_cal_correlations_spearman)[j], - cor_val, star, p_val)) - } -} - -####==== BOOTSTRAPPED 95% CONFIDENCE INTERVALS ==== - -# Function to compute correlation with bootstrap CI -bootstrap_correlation <- function(data, var1, var2, method = "pearson", R = 1000) { - # Remove missing values - complete_data <- data[complete.cases(data[, c(var1, var2)]), ] - - if(nrow(complete_data) < 3) { - return(data.frame( - correlation = NA, - ci_lower = NA, - ci_upper = NA, - n = nrow(complete_data) - )) - } - - # Bootstrap function - boot_fun <- function(data, indices) { - cor(data[indices, var1], data[indices, var2], method = method, use = "complete.obs") - } - - # Perform bootstrap - set.seed(123) # for reproducibility - boot_results <- boot(complete_data, boot_fun, R = R) - - # Calculate confidence interval - ci <- boot.ci(boot_results, type = "perc") - - return(data.frame( - correlation = boot_results$t0, - ci_lower = ci$perc[4], - ci_upper = ci$perc[5], - n = nrow(complete_data) - )) -} - -# 
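# Aside (illustrative sketch, not part of the original snapshot): boot.ci()
# stores the percentile limits in its $percent component, a 1 x 5 matrix
# whose last two entries are the lower and upper bounds, so the ci$perc
# lookup above resolves through R's partial matching of list names. A toy
# call to the helper defined above, using two columns present in df here
# (R = 200 is chosen only to keep the example fast):
bootstrap_correlation(df, "eohi_pref", "cal_global", method = "pearson", R = 200)
# returns a one-row data.frame with correlation, ci_lower, ci_upper, and n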
Compute bootstrap CIs for all EOHI x Calibration correlations -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (PEARSON) ===\n") -bootstrap_results_pearson <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "pearson", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "pearson" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_pearson) - -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (SPEARMAN) ===\n") -bootstrap_results_spearman <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "spearman", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "spearman" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_spearman) - -####==== SUMMARY TABLE ==== - -# Create comprehensive summary table -summary_table <- bootstrap_results_pearson %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper, n) %>% - rename(pearson_r = correlation, pearson_ci_lower = ci_lower, pearson_ci_upper = ci_upper) %>% - left_join( - bootstrap_results_spearman %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper) %>% - rename(spearman_rho = correlation, spearman_ci_lower = ci_lower, spearman_ci_upper = ci_upper), - by = c("eohi_var", "cal_var") - ) %>% - # Add p-values - left_join( - expand.grid(eohi_var = eohi_vars, cal_var = cal_vars, stringsAsFactors = FALSE) %>% - pmap_dfr(function(eohi_var, cal_var) { - pearson_p <- p_matrix_pearson[eohi_var, cal_var] - spearman_p <- p_matrix_spearman[eohi_var, cal_var] - data.frame( - eohi_var = eohi_var, - cal_var = cal_var, - pearson_p = pearson_p, - spearman_p = spearman_p - ) - }), - by = c("eohi_var", "cal_var") - ) %>% - mutate( - pearson_p = round(pearson_p, 5), - spearman_p = round(spearman_p, 5) - ) - -cat("\n=== COMPREHENSIVE SUMMARY TABLE ===\n") -print(summary_table) - -# Save results to CSV -# write.csv(summary_table, "eohi_calibration_correlations_summary.csv", row.names = FALSE) -# cat("\nResults saved to: eohi_calibration_correlations_summary.csv\n") diff --git a/.history/eohi1/correlations - eohi x calibration_20250929153154.r b/.history/eohi1/correlations - eohi x calibration_20250929153154.r deleted file mode 100644 index a99d62f..0000000 --- a/.history/eohi1/correlations - eohi x calibration_20250929153154.r +++ /dev/null @@ -1,303 +0,0 @@ -# Load required libraries -library(Hmisc) -library(knitr) -library(dplyr) -library(corrr) -library(broom) -library(purrr) -library(tidyr) -library(tibble) -library(boot) - -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -# Load data -df1 <- read.csv("exp1.csv") - -# Remove columns with all NA values -df1 <- df1 %>% select(where(~ !all(is.na(.)))) - -# Select variables of interest -eohi_vars <- c("eohi_pref", "eohi_pers", "eohi_val", "eohi_life", "eohi_mean", - "eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean") -cal_vars <- c("cal_selfActual", "cal_global", "cal_15", "cal_35", "cal_55", "cal_75", "cal_true", "cal_false") - -# Create dataset with 
selected variables -df <- df1[, c(eohi_vars, cal_vars)] - -# Ensure all selected variables are numeric -df <- df %>% - mutate(across(everything(), as.numeric)) - -# Remove rows with any missing values for correlation analysis -df_complete <- df[complete.cases(df), ] - -cat("Sample size for correlation analysis:", nrow(df_complete), "\n") -cat("Total sample size:", nrow(df), "\n") - -str(df) -summary(df) -####==== DESCRIPTIVE STATISTICS ==== - -# Function to compute descriptive statistics -get_descriptives <- function(data, vars) { - desc_stats <- data %>% - select(all_of(vars)) %>% - summarise(across(everything(), list( - n = ~sum(!is.na(.)), - mean = ~mean(., na.rm = TRUE), - sd = ~sd(., na.rm = TRUE), - min = ~min(., na.rm = TRUE), - max = ~max(., na.rm = TRUE), - median = ~median(., na.rm = TRUE), - q25 = ~quantile(., 0.25, na.rm = TRUE), - q75 = ~quantile(., 0.75, na.rm = TRUE) - ))) %>% - pivot_longer(everything(), names_to = "variable", values_to = "value") %>% - separate(variable, into = c("var", "stat"), sep = "_(?=[^_]+$)") %>% - pivot_wider(names_from = stat, values_from = value) %>% - mutate(across(c(mean, sd, min, max, median, q25, q75), ~round(., 5))) - - return(desc_stats) -} - -# Get descriptives for EOHI variables -eohi_descriptives <- get_descriptives(df, eohi_vars) -cat("\n=== EOHI Variables Descriptives ===\n") -print(eohi_descriptives) - -# Get descriptives for calibration variables -cal_descriptives <- get_descriptives(df, cal_vars) -cat("\n=== Calibration Variables Descriptives ===\n") -print(cal_descriptives) - -# Check for bimodal or unusual distributions -hist(df$eohi_pref) -hist(df$cal_selfActual) - -# Look for extreme values -# boxplot(df$eohi_pref) -# boxplot(df$cal_selfActual) - -# Test normality for each variable - probably unnecessary -library(nortest) - -# Test EOHI variables -for(var in eohi_vars) { - cat("\n", var, "normality test:\n") - print(shapiro.test(df_complete[[var]])) -} - -# Test calibration variables -for(var in cal_vars) { - cat("\n", var, "normality test:\n") - print(shapiro.test(df_complete[[var]])) -} - -####==== PEARSON CORRELATIONS ==== - -# Compute correlation matrix with p-values -cor_results_pearson <- rcorr(as.matrix(df_complete), type = "pearson") - -# Extract correlation coefficients -cor_pearson <- cor_results_pearson$r - -# Extract p-values -p_matrix_pearson <- cor_results_pearson$P - -# Function to add significance stars -corstars <- function(cor_mat, p_mat) { - stars <- ifelse(p_mat < 0.001, "***", - ifelse(p_mat < 0.01, "**", - ifelse(p_mat < 0.05, "*", ""))) - - # Combine correlation values with stars, rounded to 5 decimal places - cor_with_stars <- matrix(paste0(format(round(cor_mat, 5), nsmall = 5), stars), - nrow = nrow(cor_mat)) - - # Set row and column names - rownames(cor_with_stars) <- rownames(cor_mat) - colnames(cor_with_stars) <- colnames(cor_mat) - - return(cor_with_stars) -} - -# Apply the function -cor_table_pearson <- corstars(cor_pearson, p_matrix_pearson) - -cat("\n=== PEARSON CORRELATIONS ===\n") -print(cor_table_pearson, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations <- cor_pearson[eohi_vars, cal_vars] -eohi_cal_pvalues <- p_matrix_pearson[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Pearson Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations)) { - for(j in 1:ncol(eohi_cal_correlations)) { - cor_val <- eohi_cal_correlations[i, j] - p_val <- eohi_cal_pvalues[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", 
- ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: r = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations)[i], - colnames(eohi_cal_correlations)[j], - cor_val, star, p_val)) - } -} - -####==== SPEARMAN CORRELATIONS ==== - -# Compute Spearman correlation matrix with p-values -cor_results_spearman <- rcorr(as.matrix(df_complete), type = "spearman") - -# Extract correlation coefficients -cor_spearman <- cor_results_spearman$r - -# Extract p-values -p_matrix_spearman <- cor_results_spearman$P - -# Apply the function -cor_table_spearman <- corstars(cor_spearman, p_matrix_spearman) - -cat("\n=== SPEARMAN CORRELATIONS ===\n") -print(cor_table_spearman, quote = FALSE) - -# Extract specific correlations between EOHI and calibration variables -eohi_cal_correlations_spearman <- cor_spearman[eohi_vars, cal_vars] -eohi_cal_pvalues_spearman <- p_matrix_spearman[eohi_vars, cal_vars] - -cat("\n=== EOHI x Calibration Spearman Correlations ===\n") -for(i in 1:nrow(eohi_cal_correlations_spearman)) { - for(j in 1:ncol(eohi_cal_correlations_spearman)) { - cor_val <- eohi_cal_correlations_spearman[i, j] - p_val <- eohi_cal_pvalues_spearman[i, j] - star <- ifelse(p_val < 0.001, "***", - ifelse(p_val < 0.01, "**", - ifelse(p_val < 0.05, "*", ""))) - cat(sprintf("%s x %s: rho = %.5f%s, p = %.5f\n", - rownames(eohi_cal_correlations_spearman)[i], - colnames(eohi_cal_correlations_spearman)[j], - cor_val, star, p_val)) - } -} - -####==== BOOTSTRAPPED 95% CONFIDENCE INTERVALS ==== - -# Function to compute correlation with bootstrap CI -bootstrap_correlation <- function(data, var1, var2, method = "pearson", R = 1000) { - # Remove missing values - complete_data <- data[complete.cases(data[, c(var1, var2)]), ] - - if(nrow(complete_data) < 3) { - return(data.frame( - correlation = NA, - ci_lower = NA, - ci_upper = NA, - n = nrow(complete_data) - )) - } - - # Bootstrap function - boot_fun <- function(data, indices) { - cor(data[indices, var1], data[indices, var2], method = method, use = "complete.obs") - } - - # Perform bootstrap - set.seed(123) # for reproducibility - boot_results <- boot(complete_data, boot_fun, R = R) - - # Calculate confidence interval - ci <- boot.ci(boot_results, type = "perc") - - return(data.frame( - correlation = boot_results$t0, - ci_lower = ci$perc[4], - ci_upper = ci$perc[5], - n = nrow(complete_data) - )) -} - -# Compute bootstrap CIs for all EOHI x Calibration correlations -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (PEARSON) ===\n") -bootstrap_results_pearson <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "pearson", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "pearson" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = round(ci_upper, 5) - ) - -print(bootstrap_results_pearson) - -cat("\n=== BOOTSTRAPPED 95% CONFIDENCE INTERVALS (SPEARMAN) ===\n") -bootstrap_results_spearman <- expand.grid( - eohi_var = eohi_vars, - cal_var = cal_vars, - stringsAsFactors = FALSE -) %>% - pmap_dfr(function(eohi_var, cal_var) { - result <- bootstrap_correlation(df, eohi_var, cal_var, method = "spearman", R = 1000) - result$eohi_var <- eohi_var - result$cal_var <- cal_var - result$method <- "spearman" - return(result) - }) %>% - mutate( - correlation = round(correlation, 5), - ci_lower = round(ci_lower, 5), - ci_upper = 
round(ci_upper, 5) - ) - -print(bootstrap_results_spearman) - -####==== SUMMARY TABLE ==== - -# Create comprehensive summary table -summary_table <- bootstrap_results_pearson %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper, n) %>% - rename(pearson_r = correlation, pearson_ci_lower = ci_lower, pearson_ci_upper = ci_upper) %>% - left_join( - bootstrap_results_spearman %>% - select(eohi_var, cal_var, correlation, ci_lower, ci_upper) %>% - rename(spearman_rho = correlation, spearman_ci_lower = ci_lower, spearman_ci_upper = ci_upper), - by = c("eohi_var", "cal_var") - ) %>% - # Add p-values - left_join( - expand.grid(eohi_var = eohi_vars, cal_var = cal_vars, stringsAsFactors = FALSE) %>% - pmap_dfr(function(eohi_var, cal_var) { - pearson_p <- p_matrix_pearson[eohi_var, cal_var] - spearman_p <- p_matrix_spearman[eohi_var, cal_var] - data.frame( - eohi_var = eohi_var, - cal_var = cal_var, - pearson_p = pearson_p, - spearman_p = spearman_p - ) - }), - by = c("eohi_var", "cal_var") - ) %>% - mutate( - pearson_p = round(pearson_p, 5), - spearman_p = round(spearman_p, 5) - ) - -cat("\n=== COMPREHENSIVE SUMMARY TABLE ===\n") -print(summary_table) - -# Save results to CSV -# write.csv(summary_table, "eohi_calibration_correlations_summary.csv", row.names = FALSE) -# cat("\nResults saved to: eohi_calibration_correlations_summary.csv\n") diff --git a/.history/eohi1/correlations - scales_20251007231341.r b/.history/eohi1/correlations - scales_20251007231341.r deleted file mode 100644 index e69de29..0000000 diff --git a/.history/eohi1/correlations - scales_20251007232519.r b/.history/eohi1/correlations - scales_20251007232519.r deleted file mode 100644 index 4923146..0000000 --- a/.history/eohi1/correlations - scales_20251007232519.r +++ /dev/null @@ -1,4 +0,0 @@ - - -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251007232812.r b/.history/eohi1/correlations - scales_20251007232812.r deleted file mode 100644 index 08bcbc3..0000000 --- a/.history/eohi1/correlations - scales_20251007232812.r +++ /dev/null @@ -1,2 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251007233020.r b/.history/eohi1/correlations - scales_20251007233020.r deleted file mode 100644 index 2b573bf..0000000 --- a/.history/eohi1/correlations - scales_20251007233020.r +++ /dev/null @@ -1,74 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# Remove rows with any missing values -correlation_data <- na.omit(correlation_data) - -cat("Sample size for correlations:", nrow(correlation_data), "\n\n") - -# Calculate correlation matrix -cor_matrix <- cor(correlation_data, use = "complete.obs") - -# Print correlation matrix with 5 decimal places -cat("Correlation Matrix:\n") -print(round(cor_matrix, 5)) - -# Separate correlations between the two sets -set1_set2_cor <- cor_matrix[set1_vars, set2_vars] - -cat("\nCorrelations between Set 1 (EOHI/DGEN) and Set 2 
(Cognitive):\n") -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set -set1_within_cor <- cor_matrix[set1_vars, set1_vars] -set2_within_cor <- cor_matrix[set2_vars, set2_vars] - -cat("\nWithin Set 1 correlations (EOHI/DGEN):\n") -print(round(set1_within_cor, 5)) - -cat("\nWithin Set 2 correlations (Cognitive):\n") -print(round(set2_within_cor, 5)) - -# Statistical significance tests -cat("\nStatistical significance tests (p-values):\n") -cor_test_results <- rcorr(as.matrix(correlation_data)) - -cat("\nP-values for Set 1 vs Set 2 correlations:\n") -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plot -pdf("correlation_plot_scales.pdf", width = 10, height = 8) -corrplot(cor_matrix, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -cat("\nDescriptive Statistics:\n") -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -cat("\nAnalysis completed. Correlation plot saved as 'correlation_plot_scales.pdf'\n") \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251007233026.r b/.history/eohi1/correlations - scales_20251007233026.r deleted file mode 100644 index 2b573bf..0000000 --- a/.history/eohi1/correlations - scales_20251007233026.r +++ /dev/null @@ -1,74 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# Remove rows with any missing values -correlation_data <- na.omit(correlation_data) - -cat("Sample size for correlations:", nrow(correlation_data), "\n\n") - -# Calculate correlation matrix -cor_matrix <- cor(correlation_data, use = "complete.obs") - -# Print correlation matrix with 5 decimal places -cat("Correlation Matrix:\n") -print(round(cor_matrix, 5)) - -# Separate correlations between the two sets -set1_set2_cor <- cor_matrix[set1_vars, set2_vars] - -cat("\nCorrelations between Set 1 (EOHI/DGEN) and Set 2 (Cognitive):\n") -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set -set1_within_cor <- cor_matrix[set1_vars, set1_vars] -set2_within_cor <- cor_matrix[set2_vars, set2_vars] - -cat("\nWithin Set 1 correlations (EOHI/DGEN):\n") -print(round(set1_within_cor, 5)) - -cat("\nWithin Set 2 correlations (Cognitive):\n") -print(round(set2_within_cor, 5)) - -# Statistical significance tests -cat("\nStatistical significance tests (p-values):\n") -cor_test_results <- rcorr(as.matrix(correlation_data)) - -cat("\nP-values for Set 1 vs Set 2 correlations:\n") -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plot -pdf("correlation_plot_scales.pdf", width = 10, height = 8) -corrplot(cor_matrix, 
method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -cat("\nDescriptive Statistics:\n") -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -cat("\nAnalysis completed. Correlation plot saved as 'correlation_plot_scales.pdf'\n") \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251007233050.r b/.history/eohi1/correlations - scales_20251007233050.r deleted file mode 100644 index 213f481..0000000 --- a/.history/eohi1/correlations - scales_20251007233050.r +++ /dev/null @@ -1,71 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -cat("Sample size for correlations:", nrow(correlation_data), "\n\n") - -# Calculate correlation matrix -cor_matrix <- cor(correlation_data) - -# Print correlation matrix with 5 decimal places -cat("Correlation Matrix:\n") -print(round(cor_matrix, 5)) - -# Separate correlations between the two sets -set1_set2_cor <- cor_matrix[set1_vars, set2_vars] - -cat("\nCorrelations between Set 1 (EOHI/DGEN) and Set 2 (Cognitive):\n") -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set -set1_within_cor <- cor_matrix[set1_vars, set1_vars] -set2_within_cor <- cor_matrix[set2_vars, set2_vars] - -cat("\nWithin Set 1 correlations (EOHI/DGEN):\n") -print(round(set1_within_cor, 5)) - -cat("\nWithin Set 2 correlations (Cognitive):\n") -print(round(set2_within_cor, 5)) - -# Statistical significance tests -cat("\nStatistical significance tests (p-values):\n") -cor_test_results <- rcorr(as.matrix(correlation_data)) - -cat("\nP-values for Set 1 vs Set 2 correlations:\n") -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plot -pdf("correlation_plot_scales.pdf", width = 10, height = 8) -corrplot(cor_matrix, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -cat("\nDescriptive Statistics:\n") -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -cat("\nAnalysis completed. 
Correlation plot saved as 'correlation_plot_scales.pdf'\n") \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251007233059.r b/.history/eohi1/correlations - scales_20251007233059.r deleted file mode 100644 index 213f481..0000000 --- a/.history/eohi1/correlations - scales_20251007233059.r +++ /dev/null @@ -1,71 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -cat("Sample size for correlations:", nrow(correlation_data), "\n\n") - -# Calculate correlation matrix -cor_matrix <- cor(correlation_data) - -# Print correlation matrix with 5 decimal places -cat("Correlation Matrix:\n") -print(round(cor_matrix, 5)) - -# Separate correlations between the two sets -set1_set2_cor <- cor_matrix[set1_vars, set2_vars] - -cat("\nCorrelations between Set 1 (EOHI/DGEN) and Set 2 (Cognitive):\n") -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set -set1_within_cor <- cor_matrix[set1_vars, set1_vars] -set2_within_cor <- cor_matrix[set2_vars, set2_vars] - -cat("\nWithin Set 1 correlations (EOHI/DGEN):\n") -print(round(set1_within_cor, 5)) - -cat("\nWithin Set 2 correlations (Cognitive):\n") -print(round(set2_within_cor, 5)) - -# Statistical significance tests -cat("\nStatistical significance tests (p-values):\n") -cor_test_results <- rcorr(as.matrix(correlation_data)) - -cat("\nP-values for Set 1 vs Set 2 correlations:\n") -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plot -pdf("correlation_plot_scales.pdf", width = 10, height = 8) -corrplot(cor_matrix, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -cat("\nDescriptive Statistics:\n") -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -cat("\nAnalysis completed. 
Correlation plot saved as 'correlation_plot_scales.pdf'\n") \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251007233106.r b/.history/eohi1/correlations - scales_20251007233106.r deleted file mode 100644 index 213f481..0000000 --- a/.history/eohi1/correlations - scales_20251007233106.r +++ /dev/null @@ -1,71 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -cat("Sample size for correlations:", nrow(correlation_data), "\n\n") - -# Calculate correlation matrix -cor_matrix <- cor(correlation_data) - -# Print correlation matrix with 5 decimal places -cat("Correlation Matrix:\n") -print(round(cor_matrix, 5)) - -# Separate correlations between the two sets -set1_set2_cor <- cor_matrix[set1_vars, set2_vars] - -cat("\nCorrelations between Set 1 (EOHI/DGEN) and Set 2 (Cognitive):\n") -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set -set1_within_cor <- cor_matrix[set1_vars, set1_vars] -set2_within_cor <- cor_matrix[set2_vars, set2_vars] - -cat("\nWithin Set 1 correlations (EOHI/DGEN):\n") -print(round(set1_within_cor, 5)) - -cat("\nWithin Set 2 correlations (Cognitive):\n") -print(round(set2_within_cor, 5)) - -# Statistical significance tests -cat("\nStatistical significance tests (p-values):\n") -cor_test_results <- rcorr(as.matrix(correlation_data)) - -cat("\nP-values for Set 1 vs Set 2 correlations:\n") -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plot -pdf("correlation_plot_scales.pdf", width = 10, height = 8) -corrplot(cor_matrix, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -cat("\nDescriptive Statistics:\n") -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -cat("\nAnalysis completed. 
Correlation plot saved as 'correlation_plot_scales.pdf'\n") \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251007233151.r b/.history/eohi1/correlations - scales_20251007233151.r deleted file mode 100644 index 7691c31..0000000 --- a/.history/eohi1/correlations - scales_20251007233151.r +++ /dev/null @@ -1,72 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -cat("Sample size for correlations:", nrow(correlation_data), "\n\n") - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method (more robust for cognitive data) -cor_matrix <- cor_matrix_spearman - -# Print correlation matrix with 5 decimal places -cat("Correlation Matrix:\n") -print(round(cor_matrix, 5)) - -# Separate correlations between the two sets -set1_set2_cor <- cor_matrix[set1_vars, set2_vars] - -cat("\nCorrelations between Set 1 (EOHI/DGEN) and Set 2 (Cognitive):\n") -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set -set1_within_cor <- cor_matrix[set1_vars, set1_vars] -set2_within_cor <- cor_matrix[set2_vars, set2_vars] - -cat("\nWithin Set 1 correlations (EOHI/DGEN):\n") -print(round(set1_within_cor, 5)) - -cat("\nWithin Set 2 correlations (Cognitive):\n") -print(round(set2_within_cor, 5)) - -# Statistical significance tests -cat("\nStatistical significance tests (p-values):\n") -cor_test_results <- rcorr(as.matrix(correlation_data)) - -cat("\nP-values for Set 1 vs Set 2 correlations:\n") -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plot -pdf("correlation_plot_scales.pdf", width = 10, height = 8) -corrplot(cor_matrix, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -cat("\nDescriptive Statistics:\n") -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -cat("\nAnalysis completed. 
Correlation plot saved as 'correlation_plot_scales.pdf'\n") \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251007233159.r b/.history/eohi1/correlations - scales_20251007233159.r deleted file mode 100644 index 40a2b40..0000000 --- a/.history/eohi1/correlations - scales_20251007233159.r +++ /dev/null @@ -1,80 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -cat("Sample size for correlations:", nrow(correlation_data), "\n\n") - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method (more robust for cognitive data) -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -cat("=== SPEARMAN CORRELATIONS (Primary Analysis) ===\n") -cat("Correlation Matrix:\n") -print(round(cor_matrix_spearman, 5)) - -cat("\n=== PEARSON CORRELATIONS (Comparison) ===\n") -cat("Correlation Matrix:\n") -print(round(cor_matrix_pearson, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] - -cat("\n=== SPEARMAN: Correlations between Set 1 (EOHI/DGEN) and Set 2 (Cognitive) ===\n") -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -cat("\nWithin Set 1 correlations (EOHI/DGEN):\n") -print(round(set1_within_cor, 5)) - -cat("\nWithin Set 2 correlations (Cognitive):\n") -print(round(set2_within_cor, 5)) - -# Statistical significance tests -cat("\nStatistical significance tests (p-values):\n") -cor_test_results <- rcorr(as.matrix(correlation_data)) - -cat("\nP-values for Set 1 vs Set 2 correlations:\n") -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plot -pdf("correlation_plot_scales.pdf", width = 10, height = 8) -corrplot(cor_matrix, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -cat("\nDescriptive Statistics:\n") -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -cat("\nAnalysis completed. 
Correlation plot saved as 'correlation_plot_scales.pdf'\n") \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251007233210.r b/.history/eohi1/correlations - scales_20251007233210.r deleted file mode 100644 index 96e6a92..0000000 --- a/.history/eohi1/correlations - scales_20251007233210.r +++ /dev/null @@ -1,87 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -cat("Sample size for correlations:", nrow(correlation_data), "\n\n") - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method (more robust for cognitive data) -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -cat("=== SPEARMAN CORRELATIONS (Primary Analysis) ===\n") -cat("Correlation Matrix:\n") -print(round(cor_matrix_spearman, 5)) - -cat("\n=== PEARSON CORRELATIONS (Comparison) ===\n") -cat("Correlation Matrix:\n") -print(round(cor_matrix_pearson, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] - -cat("\n=== SPEARMAN: Correlations between Set 1 (EOHI/DGEN) and Set 2 (Cognitive) ===\n") -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -cat("\nWithin Set 1 correlations (EOHI/DGEN):\n") -print(round(set1_within_cor, 5)) - -cat("\nWithin Set 2 correlations (Cognitive):\n") -print(round(set2_within_cor, 5)) - -# Statistical significance tests (Spearman) -cat("\n=== SPEARMAN: Statistical significance tests (p-values) ===\n") -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -cat("\nP-values for Set 1 vs Set 2 correlations (Spearman):\n") -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_scales_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -cat("\nDescriptive Statistics:\n") -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -cat("\nAnalysis completed. 
Correlation plot saved as 'correlation_plot_scales.pdf'\n") \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251007233219.r b/.history/eohi1/correlations - scales_20251007233219.r deleted file mode 100644 index 2731c68..0000000 --- a/.history/eohi1/correlations - scales_20251007233219.r +++ /dev/null @@ -1,95 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -cat("Sample size for correlations:", nrow(correlation_data), "\n\n") - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method (more robust for cognitive data) -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -cat("=== SPEARMAN CORRELATIONS (Primary Analysis) ===\n") -cat("Correlation Matrix:\n") -print(round(cor_matrix_spearman, 5)) - -cat("\n=== PEARSON CORRELATIONS (Comparison) ===\n") -cat("Correlation Matrix:\n") -print(round(cor_matrix_pearson, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] - -cat("\n=== SPEARMAN: Correlations between Set 1 (EOHI/DGEN) and Set 2 (Cognitive) ===\n") -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -cat("\nWithin Set 1 correlations (EOHI/DGEN):\n") -print(round(set1_within_cor, 5)) - -cat("\nWithin Set 2 correlations (Cognitive):\n") -print(round(set2_within_cor, 5)) - -# Statistical significance tests (Spearman) -cat("\n=== SPEARMAN: Statistical significance tests (p-values) ===\n") -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -cat("\nP-values for Set 1 vs Set 2 correlations (Spearman):\n") -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_scales_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -cat("\nDescriptive Statistics:\n") -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -cat("\n=== ANALYSIS COMPLETED ===\n") -cat("Spearman correlations are recommended for this analysis due to:\n") -cat("- 
Non-parametric nature (no distribution assumptions)\n") -cat("- Robustness to outliers and non-linear relationships\n") -cat("- Better suitability for cognitive measures\n\n") -cat("Output files created:\n") -cat("- correlation_plot_scales_spearman.pdf (primary analysis)\n") -cat("- correlation_plot_scales_pearson.pdf (comparison)\n") -cat("\nSpearman correlations are reported as the primary analysis.\n") \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251007233224.r b/.history/eohi1/correlations - scales_20251007233224.r deleted file mode 100644 index 2731c68..0000000 --- a/.history/eohi1/correlations - scales_20251007233224.r +++ /dev/null @@ -1,95 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -cat("Sample size for correlations:", nrow(correlation_data), "\n\n") - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method (more robust for cognitive data) -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -cat("=== SPEARMAN CORRELATIONS (Primary Analysis) ===\n") -cat("Correlation Matrix:\n") -print(round(cor_matrix_spearman, 5)) - -cat("\n=== PEARSON CORRELATIONS (Comparison) ===\n") -cat("Correlation Matrix:\n") -print(round(cor_matrix_pearson, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] - -cat("\n=== SPEARMAN: Correlations between Set 1 (EOHI/DGEN) and Set 2 (Cognitive) ===\n") -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -cat("\nWithin Set 1 correlations (EOHI/DGEN):\n") -print(round(set1_within_cor, 5)) - -cat("\nWithin Set 2 correlations (Cognitive):\n") -print(round(set2_within_cor, 5)) - -# Statistical significance tests (Spearman) -cat("\n=== SPEARMAN: Statistical significance tests (p-values) ===\n") -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -cat("\nP-values for Set 1 vs Set 2 correlations (Spearman):\n") -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_scales_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", 
number.cex = 0.7, - title = "Pearson Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -cat("\nDescriptive Statistics:\n") -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -cat("\n=== ANALYSIS COMPLETED ===\n") -cat("Spearman correlations are recommended for this analysis due to:\n") -cat("- Non-parametric nature (no distribution assumptions)\n") -cat("- Robustness to outliers and non-linear relationships\n") -cat("- Better suitability for cognitive measures\n\n") -cat("Output files created:\n") -cat("- correlation_plot_scales_spearman.pdf (primary analysis)\n") -cat("- correlation_plot_scales_pearson.pdf (comparison)\n") -cat("\nSpearman correlations are reported as the primary analysis.\n") \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251007233307.r b/.history/eohi1/correlations - scales_20251007233307.r deleted file mode 100644 index 7c17c5e..0000000 --- a/.history/eohi1/correlations - scales_20251007233307.r +++ /dev/null @@ -1,77 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_scales_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -cat("\nDescriptive Statistics:\n") -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -cat("\n=== ANALYSIS COMPLETED ===\n") -cat("Spearman correlations are recommended for this analysis due to:\n") -cat("- Non-parametric 
nature (no distribution assumptions)\n") -cat("- Robustness to outliers and non-linear relationships\n") -cat("- Better suitability for cognitive measures\n\n") -cat("Output files created:\n") -cat("- correlation_plot_scales_spearman.pdf (primary analysis)\n") -cat("- correlation_plot_scales_pearson.pdf (comparison)\n") -cat("\nSpearman correlations are reported as the primary analysis.\n") \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251007233311.r b/.history/eohi1/correlations - scales_20251007233311.r deleted file mode 100644 index 42f68c2..0000000 --- a/.history/eohi1/correlations - scales_20251007233311.r +++ /dev/null @@ -1,66 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_scales_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251007233319.r b/.history/eohi1/correlations - scales_20251007233319.r deleted file mode 100644 index 42f68c2..0000000 --- a/.history/eohi1/correlations - scales_20251007233319.r +++ /dev/null @@ -1,66 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars 
<- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_scales_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251007233424.r b/.history/eohi1/correlations - scales_20251007233424.r deleted file mode 100644 index 42f68c2..0000000 --- a/.history/eohi1/correlations - scales_20251007233424.r +++ /dev/null @@ -1,66 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# 
Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_scales_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251007233541.r b/.history/eohi1/correlations - scales_20251007233541.r deleted file mode 100644 index c4f764e..0000000 --- a/.history/eohi1/correlations - scales_20251007233541.r +++ /dev/null @@ -1,171 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -cat("=== NORMALITY DIAGNOSTICS ===\n") -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } else { - cat(sprintf("%s: Sample too large for Shapiro-Wilk test\n", var)) - } -} - -# Kolmogorov-Smirnov test for normality -cat("\nKolmogorov-Smirnov tests:\n") -for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -cat("\n=== LINEARITY DIAGNOSTICS ===\n") -# Check linearity between variable pairs 
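# --- Editorial sketch (not part of the deleted snapshots) --------------------
# The Kolmogorov-Smirnov loop above tests each variable against a normal
# distribution whose mean and sd are estimated from the same sample; in that
# setting ks.test() is overly conservative (p-values come out too large, so
# non-normality is under-detected), and it warns when scale scores contain
# ties. A Lilliefors-corrected test handles the estimated-parameter case
# explicitly. Assumes the `nortest` package and the `correlation_data` frame
# defined in the snapshot above.
library(nortest)
for (var in names(correlation_data)) {
  lt <- lillie.test(na.omit(correlation_data[[var]]))
  cat(sprintf("%s: Lilliefors p = %.5f %s\n",
              var, lt$p.value,
              ifelse(lt$p.value < 0.05, "(NOT normal)", "(normal)")))
}
# -----------------------------------------------------------------------------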
-pdf("linearity_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 16, cex = 0.6) - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -cat("\nResidual Analysis (checking for non-linear patterns):\n") -pdf("residual_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6) - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - -cat("\n=== INTERPRETATION GUIDE ===\n") -cat("NORMALITY: p < 0.05 = NOT normal (use Spearman)\n") -cat("LINEARITY: R² < 0.7 = weak linear relationship (consider Spearman)\n") -cat("Check plots for: curved patterns, outliers, non-normal distributions\n") -cat("Files created: normality_plots.pdf, linearity_plots.pdf, residual_plots.pdf\n\n") - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_scales_pearson.pdf", width = 
10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251007233548.r b/.history/eohi1/correlations - scales_20251007233548.r deleted file mode 100644 index c4f764e..0000000 --- a/.history/eohi1/correlations - scales_20251007233548.r +++ /dev/null @@ -1,171 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -cat("=== NORMALITY DIAGNOSTICS ===\n") -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } else { - cat(sprintf("%s: Sample too large for Shapiro-Wilk test\n", var)) - } -} - -# Kolmogorov-Smirnov test for normality -cat("\nKolmogorov-Smirnov tests:\n") -for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -cat("\n=== LINEARITY DIAGNOSTICS ===\n") -# Check linearity between variable pairs -pdf("linearity_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 16, cex = 0.6) - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- 
predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -cat("\nResidual Analysis (checking for non-linear patterns):\n") -pdf("residual_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6) - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - -cat("\n=== INTERPRETATION GUIDE ===\n") -cat("NORMALITY: p < 0.05 = NOT normal (use Spearman)\n") -cat("LINEARITY: R² < 0.7 = weak linear relationship (consider Spearman)\n") -cat("Check plots for: curved patterns, outliers, non-normal distributions\n") -cat("Files created: normality_plots.pdf, linearity_plots.pdf, residual_plots.pdf\n\n") - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_scales_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251007233652.r b/.history/eohi1/correlations - scales_20251007233652.r deleted file mode 100644 index c4f764e..0000000 --- a/.history/eohi1/correlations - scales_20251007233652.r +++ /dev/null @@ -1,171 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) 
-library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -cat("=== NORMALITY DIAGNOSTICS ===\n") -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } else { - cat(sprintf("%s: Sample too large for Shapiro-Wilk test\n", var)) - } -} - -# Kolmogorov-Smirnov test for normality -cat("\nKolmogorov-Smirnov tests:\n") -for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -cat("\n=== LINEARITY DIAGNOSTICS ===\n") -# Check linearity between variable pairs -pdf("linearity_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 16, cex = 0.6) - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -cat("\nResidual Analysis (checking for non-linear patterns):\n") -pdf("residual_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = 
"Residuals", pch = 16, cex = 0.6) - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - -cat("\n=== INTERPRETATION GUIDE ===\n") -cat("NORMALITY: p < 0.05 = NOT normal (use Spearman)\n") -cat("LINEARITY: R² < 0.7 = weak linear relationship (consider Spearman)\n") -cat("Check plots for: curved patterns, outliers, non-normal distributions\n") -cat("Files created: normality_plots.pdf, linearity_plots.pdf, residual_plots.pdf\n\n") - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_scales_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251007233731.r b/.history/eohi1/correlations - scales_20251007233731.r deleted file mode 100644 index 6df4dcf..0000000 --- a/.history/eohi1/correlations - scales_20251007233731.r +++ /dev/null @@ -1,167 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } -} - -# 
Kolmogorov-Smirnov test for normality -for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -cat("\n=== LINEARITY DIAGNOSTICS ===\n") -# Check linearity between variable pairs -pdf("linearity_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 16, cex = 0.6) - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -cat("\nResidual Analysis (checking for non-linear patterns):\n") -pdf("residual_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6) - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - -cat("\n=== INTERPRETATION GUIDE ===\n") -cat("NORMALITY: p < 0.05 = NOT normal (use Spearman)\n") -cat("LINEARITY: R² < 0.7 = weak linear relationship (consider Spearman)\n") -cat("Check plots for: curved patterns, outliers, non-normal distributions\n") -cat("Files created: normality_plots.pdf, linearity_plots.pdf, residual_plots.pdf\n\n") - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the 
two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_scales_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251007233734.r b/.history/eohi1/correlations - scales_20251007233734.r deleted file mode 100644 index c8c34ea..0000000 --- a/.history/eohi1/correlations - scales_20251007233734.r +++ /dev/null @@ -1,166 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } -} - -# Kolmogorov-Smirnov test for normality -for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} 
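# --- Editorial sketch (not part of the deleted snapshots) --------------------
# The pairwise loops in these snapshots print raw p-values extracted from
# Hmisc::rcorr. With 4 x 3 = 12 Set1-vs-Set2 tests, a multiplicity correction
# is worth reporting alongside them; Holm's method via stats::p.adjust is one
# option. Assumes the objects defined in the snapshots above
# (cor_test_results_spearman, set1_vars, set2_vars).
raw_p <- cor_test_results_spearman$P[set1_vars, set2_vars]
adj_p <- matrix(p.adjust(raw_p, method = "holm"),
                nrow = length(set1_vars), dimnames = dimnames(raw_p))
cat("Holm-adjusted p-values (Set 1 vs Set 2):\n")
print(round(adj_p, 5))
# -----------------------------------------------------------------------------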
-dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 16, cex = 0.6) - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -cat("\nResidual Analysis (checking for non-linear patterns):\n") -pdf("residual_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6) - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - -cat("\n=== INTERPRETATION GUIDE ===\n") -cat("NORMALITY: p < 0.05 = NOT normal (use Spearman)\n") -cat("LINEARITY: R² < 0.7 = weak linear relationship (consider Spearman)\n") -cat("Check plots for: curved patterns, outliers, non-normal distributions\n") -cat("Files created: normality_plots.pdf, linearity_plots.pdf, residual_plots.pdf\n\n") - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs 
Cognitive Measures") -dev.off() - -pdf("correlation_plot_scales_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251007233736.r b/.history/eohi1/correlations - scales_20251007233736.r deleted file mode 100644 index c47abbd..0000000 --- a/.history/eohi1/correlations - scales_20251007233736.r +++ /dev/null @@ -1,165 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } -} - -# Kolmogorov-Smirnov test for normality -for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 16, cex = 0.6) - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for 
linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6) - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - -cat("\n=== INTERPRETATION GUIDE ===\n") -cat("NORMALITY: p < 0.05 = NOT normal (use Spearman)\n") -cat("LINEARITY: R² < 0.7 = weak linear relationship (consider Spearman)\n") -cat("Check plots for: curved patterns, outliers, non-normal distributions\n") -cat("Files created: normality_plots.pdf, linearity_plots.pdf, residual_plots.pdf\n\n") - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_scales_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251007233739.r b/.history/eohi1/correlations - scales_20251007233739.r deleted file mode 100644 index 7e178c8..0000000 --- a/.history/eohi1/correlations - scales_20251007233739.r +++ /dev/null @@ -1,160 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", 
"DGEN_fut_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } -} - -# Kolmogorov-Smirnov test for normality -for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 16, cex = 0.6) - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6) - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method -cor_matrix <- cor_matrix_spearman 
- -# Print correlation matrices with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_scales_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251007233744.r b/.history/eohi1/correlations - scales_20251007233744.r deleted file mode 100644 index 7e178c8..0000000 --- a/.history/eohi1/correlations - scales_20251007233744.r +++ /dev/null @@ -1,160 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } -} - -# Kolmogorov-Smirnov test for normality -for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q 
plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 16, cex = 0.6) - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6) - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_scales_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson 
Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251007233749.r b/.history/eohi1/correlations - scales_20251007233749.r deleted file mode 100644 index 7e178c8..0000000 --- a/.history/eohi1/correlations - scales_20251007233749.r +++ /dev/null @@ -1,160 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } -} - -# Kolmogorov-Smirnov test for normality -for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 16, cex = 0.6) - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - 
for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6) - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_scales_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251007234152.r b/.history/eohi1/correlations - scales_20251007234152.r deleted file mode 100644 index a158a21..0000000 --- a/.history/eohi1/correlations - scales_20251007234152.r +++ /dev/null @@ -1,194 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } -} - -# Kolmogorov-Smirnov test for normality 
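# Note on the test below: ks.test() against a normal distribution whose mean
# and sd are estimated from the same sample overstates p-values, and the tied
# values typical of scale scores trigger ties warnings. The Lilliefors
# correction addresses the estimated-parameter problem; a hedged alternative
# sketch, assuming the 'nortest' package (not used in the original script) is
# available:
library(nortest)
for (var in names(correlation_data)) {
  # lillie.test() is the Lilliefors-corrected KS test for normality
  lillie_result <- lillie.test(na.omit(correlation_data[[var]]))
  cat(sprintf("%s: Lilliefors p = %.5f\n", var, lillie_result$p.value))
}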
-for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 16, cex = 0.6) - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6) - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) 
{ - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_scales_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -# Save results to CSV files -write.csv(round(cor_matrix_spearman, 5), "spearman_correlations.csv") -write.csv(round(cor_matrix_pearson, 5), "pearson_correlations.csv") -write.csv(round(desc_stats, 5), "descriptive_statistics.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract significant correlations between sets -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlation_results_formatted.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251007234202.r b/.history/eohi1/correlations - scales_20251007234202.r deleted file mode 100644 index a158a21..0000000 --- a/.history/eohi1/correlations - scales_20251007234202.r +++ /dev/null @@ -1,194 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } -} - -# Kolmogorov-Smirnov test for normality -for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) 
-for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 16, cex = 0.6) - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6) - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 
0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_scales_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -# Save results to CSV files -write.csv(round(cor_matrix_spearman, 5), "spearman_correlations.csv") -write.csv(round(cor_matrix_pearson, 5), "pearson_correlations.csv") -write.csv(round(desc_stats, 5), "descriptive_statistics.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract significant correlations between sets -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlation_results_formatted.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251007234339.r b/.history/eohi1/correlations - scales_20251007234339.r deleted file mode 100644 index a158a21..0000000 --- a/.history/eohi1/correlations - scales_20251007234339.r +++ /dev/null @@ -1,194 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } -} - -# Kolmogorov-Smirnov test for normality -for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - 
qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 16, cex = 0.6) - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6) - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_scales_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# 
Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -# Save results to CSV files -write.csv(round(cor_matrix_spearman, 5), "spearman_correlations.csv") -write.csv(round(cor_matrix_pearson, 5), "pearson_correlations.csv") -write.csv(round(desc_stats, 5), "descriptive_statistics.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract significant correlations between sets -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlation_results_formatted.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251007234406.r b/.history/eohi1/correlations - scales_20251007234406.r deleted file mode 100644 index 5c44871..0000000 --- a/.history/eohi1/correlations - scales_20251007234406.r +++ /dev/null @@ -1,194 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } -} - -# Kolmogorov-Smirnov test for normality -for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - 
main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 16, cex = 0.6) - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6) - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_scales_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -# Save results to CSV files -write.csv(round(cor_matrix_spearman, 5), "spearman_correlations.csv") -write.csv(round(cor_matrix_pearson, 5), "pearson_correlations.csv") -write.csv(round(desc_stats, 5), "descriptive_statistics.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - 
Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract significant correlations between sets -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlation_exp1.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251008001047.r b/.history/eohi1/correlations - scales_20251008001047.r deleted file mode 100644 index d8606fe..0000000 --- a/.history/eohi1/correlations - scales_20251008001047.r +++ /dev/null @@ -1,194 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean", "domain_mean", "DGEN_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } -} - -# Kolmogorov-Smirnov test for normality -for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 16, cex = 0.6) - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - 
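# Note: this snapshot widens set1_vars to six variables, so crossing it with
# the three set2_vars defined just below yields 6 x 3 = 18 panel pairs, while
# the later par(mfrow = c(3, 5)) calls lay out only 15 panels per page; base
# graphics will silently start a second page in the PDF. A sketch (assuming a
# single page per device is wanted) that derives the grid from the pair count:
n_pairs <- length(set1_vars) * length(set2_vars)  # 18 with six set1 variables
n_cols  <- 5
par(mfrow = c(ceiling(n_pairs / n_cols), n_cols)) # 4 x 5 grid fits 18 panels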
max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6) - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_scales_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -# Save results to CSV files -write.csv(round(cor_matrix_spearman, 5), "spearman_correlations.csv") -write.csv(round(cor_matrix_pearson, 5), "pearson_correlations.csv") -write.csv(round(desc_stats, 5), "descriptive_statistics.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract significant correlations between sets -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - 
Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlation_exp1.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251008001054.r b/.history/eohi1/correlations - scales_20251008001054.r deleted file mode 100644 index d8606fe..0000000 --- a/.history/eohi1/correlations - scales_20251008001054.r +++ /dev/null @@ -1,194 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean", "domain_mean", "DGEN_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } -} - -# Kolmogorov-Smirnov test for normality -for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 16, cex = 0.6) - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots.pdf", width = 15, height = 10) 
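# Note on the loop below: `residuals <- residuals(lm_fit)` and
# `fitted <- fitted(lm_fit)` shadow the base R generics of the same names.
# The code still runs, but renaming keeps the generics callable later in the
# session; a minimal sketch of the safer spelling (res/fit are illustrative
# names, lm_fit is the model fitted inside the loop):
res <- residuals(lm_fit)
fit <- fitted(lm_fit)
plot(fit, res, xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6)
abline(h = 0, col = "red", lwd = 2)
lines(lowess(fit, res), col = "blue", lwd = 2)  # smooth through the residuals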
-par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6) - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_scales_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -# Save results to CSV files -write.csv(round(cor_matrix_spearman, 5), "spearman_correlations.csv") -write.csv(round(cor_matrix_pearson, 5), "pearson_correlations.csv") -write.csv(round(desc_stats, 5), "descriptive_statistics.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract significant correlations between sets -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlation_exp1.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251008001102.r b/.history/eohi1/correlations - scales_20251008001102.r deleted file mode 100644 index d8606fe..0000000 --- a/.history/eohi1/correlations 
- scales_20251008001102.r +++ /dev/null @@ -1,194 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean", "domain_mean", "DGEN_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } -} - -# Kolmogorov-Smirnov test for normality -for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 16, cex = 0.6) - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6) - abline(h = 
0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_scales_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -# Save results to CSV files -write.csv(round(cor_matrix_spearman, 5), "spearman_correlations.csv") -write.csv(round(cor_matrix_pearson, 5), "pearson_correlations.csv") -write.csv(round(desc_stats, 5), "descriptive_statistics.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract significant correlations between sets -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlation_exp1.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251008005438.r b/.history/eohi1/correlations - scales_20251008005438.r deleted file mode 100644 index d8606fe..0000000 --- a/.history/eohi1/correlations - scales_20251008005438.r +++ /dev/null @@ -1,194 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean", "domain_mean", "DGEN_mean") 
-set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } -} - -# Kolmogorov-Smirnov test for normality -for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 16, cex = 0.6) - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6) - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method -cor_matrix <- cor_matrix_spearman - -# Print 
correlation matrices with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_scales_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -# Save results to CSV files -write.csv(round(cor_matrix_spearman, 5), "spearman_correlations.csv") -write.csv(round(cor_matrix_pearson, 5), "pearson_correlations.csv") -write.csv(round(desc_stats, 5), "descriptive_statistics.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract significant correlations between sets -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlation_exp1.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251008154804.r b/.history/eohi1/correlations - scales_20251008154804.r deleted file mode 100644 index d8606fe..0000000 --- a/.history/eohi1/correlations - scales_20251008154804.r +++ /dev/null @@ -1,194 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_mean_total", "NFut_mean_total", "DGEN_past_mean", "DGEN_fut_mean", "domain_mean", "DGEN_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - 
cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } -} - -# Kolmogorov-Smirnov test for normality -for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 16, cex = 0.6) - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6) - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] 
- -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_scales_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -# Save results to CSV files -write.csv(round(cor_matrix_spearman, 5), "spearman_correlations.csv") -write.csv(round(cor_matrix_pearson, 5), "pearson_correlations.csv") -write.csv(round(desc_stats, 5), "descriptive_statistics.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract significant correlations between sets -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlation_exp1.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251008154934.r b/.history/eohi1/correlations - scales_20251008154934.r deleted file mode 100644 index e034766..0000000 --- a/.history/eohi1/correlations - scales_20251008154934.r +++ /dev/null @@ -1,195 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean", - "ehi_pref_mean", "ehi_pers_mean", "ehi_val_mean", "ehi_life_mean", "ehi_global_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } -} - -# Kolmogorov-Smirnov test for normality -for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd 
= sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 16, cex = 0.6) - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6) - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, 
p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_scales_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -# Save results to CSV files -write.csv(round(cor_matrix_spearman, 5), "spearman_correlations.csv") -write.csv(round(cor_matrix_pearson, 5), "pearson_correlations.csv") -write.csv(round(desc_stats, 5), "descriptive_statistics.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract significant correlations between sets -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlation_exp1.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251008154940.r b/.history/eohi1/correlations - scales_20251008154940.r deleted file mode 100644 index 96f47d2..0000000 --- a/.history/eohi1/correlations - scales_20251008154940.r +++ /dev/null @@ -1,191 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean", - "ehi_pref_mean", "ehi_pers_mean", "ehi_val_mean", "ehi_life_mean", "ehi_global_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } -} - -# Kolmogorov-Smirnov test for normality -for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal 
curve overlay
-  hist(correlation_data[[var]], main = paste("Histogram:", var),
-       xlab = var, freq = FALSE)
-  curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE),
-              sd = sd(correlation_data[[var]], na.rm = TRUE)),
-        add = TRUE, col = "red", lwd = 2)
-
-  # Q-Q plot
-  qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var))
-  qqline(correlation_data[[var]], col = "red", lwd = 2)
-}
-dev.off()
-
-# ===== LINEARITY CHECKS =====
-# Check linearity between variable pairs
-pdf("linearity_plots.pdf", width = 15, height = 10)
-par(mfrow = c(3, 5))
-for(i in 1:length(set1_vars)) {
-  for(j in 1:length(set2_vars)) {
-    var1 <- set1_vars[i]
-    var2 <- set2_vars[j]
-
-    # Scatter plot with regression line
-    plot(correlation_data[[var1]], correlation_data[[var2]],
-         main = paste(var1, "vs", var2),
-         xlab = var1, ylab = var2, pch = 16, cex = 0.6)
-
-    # Add linear regression line
-    lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]])
-    abline(lm_fit, col = "red", lwd = 2)
-
-    # Add LOESS smooth line for non-linear pattern detection
-    loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]])
-    x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE),
-                 max(correlation_data[[var1]], na.rm = TRUE), length = 100)
-    loess_pred <- predict(loess_fit, x_seq)
-    lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2)
-
-    # Calculate R-squared for linear fit
-    r_squared <- summary(lm_fit)$r.squared
-    cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared))
-  }
-}
-dev.off()
-
-# Residual analysis for linearity
-pdf("residual_plots.pdf", width = 15, height = 10)
-par(mfrow = c(3, 5))
-for(i in 1:length(set1_vars)) {
-  for(j in 1:length(set2_vars)) {
-    var1 <- set1_vars[i]
-    var2 <- set2_vars[j]
-
-    lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]])
-    residuals <- residuals(lm_fit)
-    fitted <- fitted(lm_fit)
-
-    plot(fitted, residuals,
-         main = paste("Residuals:", var1, "vs", var2),
-         xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6)
-    abline(h = 0, col = "red", lwd = 2)
-
-    # Add smooth line to residuals
-    lines(lowess(fitted, residuals), col = "blue", lwd = 2)
-  }
-}
-dev.off()
-
-
-# Calculate correlation matrix (Spearman only)
-cor_matrix_spearman <- cor(correlation_data, method = "spearman")
-
-# Print correlation matrix with 5 decimal places
-print(round(cor_matrix_spearman, 5))
-
-# Separate correlations between the two sets (Spearman)
-set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars]
-print(round(set1_set2_cor, 5))
-
-# Calculate correlations within each set (Spearman)
-set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars]
-set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars]
-
-# Statistical significance tests (Spearman)
-cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman")
-
-for(i in 1:length(set1_vars)) {
-  for(j in 1:length(set2_vars)) {
-    var1 <- set1_vars[i]
-    var2 <- set2_vars[j]
-    p_val <- cor_test_results_spearman$P[var1, var2]
-    cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val))
-  }
-}
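# (Editorial sketch, not part of the original snapshot.) The loop above
# prints 30 unadjusted p-values; with that many tests, a multiplicity
# correction is worth reporting alongside them, e.g. Holm-adjusted values:
p_raw <- cor_test_results_spearman$P[set1_vars, set2_vars]
p_holm <- matrix(p.adjust(p_raw, method = "holm"),
                 nrow = length(set1_vars), dimnames = dimnames(p_raw))
print(round(p_holm, 5))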
"upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -# Save results to CSV files -write.csv(round(cor_matrix_spearman, 5), "spearman_correlations.csv") -write.csv(round(cor_matrix_pearson, 5), "pearson_correlations.csv") -write.csv(round(desc_stats, 5), "descriptive_statistics.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract significant correlations between sets -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlation_exp1.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251008154947.r b/.history/eohi1/correlations - scales_20251008154947.r deleted file mode 100644 index 7678af9..0000000 --- a/.history/eohi1/correlations - scales_20251008154947.r +++ /dev/null @@ -1,184 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean", - "ehi_pref_mean", "ehi_pers_mean", "ehi_val_mean", "ehi_life_mean", "ehi_global_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } -} - -# Kolmogorov-Smirnov test for normality -for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs 
-pdf("linearity_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 16, cex = 0.6) - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6) - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# Calculate correlation matrix (Spearman only) -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Print correlation matrix with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plot (Spearman only) -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -# Save results to CSV files -write.csv(round(cor_matrix_spearman, 5), "spearman_correlations.csv") -write.csv(round(cor_matrix_pearson, 5), "pearson_correlations.csv") -write.csv(round(desc_stats, 5), "descriptive_statistics.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract significant correlations between sets -for(i in 
1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlation_exp1.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251008154952.r b/.history/eohi1/correlations - scales_20251008154952.r deleted file mode 100644 index 31a10e3..0000000 --- a/.history/eohi1/correlations - scales_20251008154952.r +++ /dev/null @@ -1,183 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean", - "ehi_pref_mean", "ehi_pers_mean", "ehi_val_mean", "ehi_life_mean", "ehi_global_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } -} - -# Kolmogorov-Smirnov test for normality -for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 16, cex = 0.6) - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, 
x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6) - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# Calculate correlation matrix (Spearman only) -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Print correlation matrix with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plot (Spearman only) -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -# Save results to CSV files -write.csv(round(cor_matrix_spearman, 5), "spearman_correlations.csv") -write.csv(round(desc_stats, 5), "descriptive_statistics.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract significant correlations between sets -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlation_exp1.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251008155000.r b/.history/eohi1/correlations - scales_20251008155000.r deleted file mode 100644 index 31a10e3..0000000 --- a/.history/eohi1/correlations - scales_20251008155000.r +++ /dev/null @@ -1,183 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# 
Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean", - "ehi_pref_mean", "ehi_pers_mean", "ehi_val_mean", "ehi_life_mean", "ehi_global_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } -} - -# Kolmogorov-Smirnov test for normality -for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 16, cex = 0.6) - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6) - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# 
Calculate correlation matrix (Spearman only) -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Print correlation matrix with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plot (Spearman only) -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -# Save results to CSV files -write.csv(round(cor_matrix_spearman, 5), "spearman_correlations.csv") -write.csv(round(desc_stats, 5), "descriptive_statistics.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract significant correlations between sets -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlation_exp1.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251008155103.r b/.history/eohi1/correlations - scales_20251008155103.r deleted file mode 100644 index 31a10e3..0000000 --- a/.history/eohi1/correlations - scales_20251008155103.r +++ /dev/null @@ -1,183 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the two sets of variables -set1_vars <- c("eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean", - "ehi_pref_mean", "ehi_pers_mean", "ehi_val_mean", "ehi_life_mean", "ehi_global_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } -} - -# Kolmogorov-Smirnov test for normality 
-for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 16, cex = 0.6) - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6) - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# Calculate correlation matrix (Spearman only) -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Print correlation matrix with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, 
p_val)) - } -} - -# Create correlation plot (Spearman only) -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -# Save results to CSV files -write.csv(round(cor_matrix_spearman, 5), "spearman_correlations.csv") -write.csv(round(desc_stats, 5), "descriptive_statistics.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract significant correlations between sets -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlation_exp1.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251008155139.r b/.history/eohi1/correlations - scales_20251008155139.r deleted file mode 100644 index 8a936df..0000000 --- a/.history/eohi1/correlations - scales_20251008155139.r +++ /dev/null @@ -1,183 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("ehi1.csv") - -# Define the two sets of variables -set1_vars <- c("eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean", - "ehi_pref_mean", "ehi_pers_mean", "ehi_val_mean", "ehi_life_mean", "ehi_global_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } -} - -# Kolmogorov-Smirnov test for normality -for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = 
"red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 16, cex = 0.6) - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6) - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# Calculate correlation matrix (Spearman only) -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Print correlation matrix with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plot (Spearman only) -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -# Save results to CSV files -write.csv(round(cor_matrix_spearman, 5), "spearman_correlations.csv") -write.csv(round(desc_stats, 5), "descriptive_statistics.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract significant correlations 
between sets -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlation_exp1.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/correlations - scales_20251008155218.r b/.history/eohi1/correlations - scales_20251008155218.r deleted file mode 100644 index 048bbb2..0000000 --- a/.history/eohi1/correlations - scales_20251008155218.r +++ /dev/null @@ -1,171 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("ehi1.csv") - -# Define the two sets of variables -set1_vars <- c("eohiDGEN_pref", "eohiDGEN_pers", "eohiDGEN_val", "eohiDGEN_life", "eohiDGEN_mean", - "ehi_pref_mean", "ehi_pers_mean", "ehi_val_mean", "ehi_life_mean", "ehi_global_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality -for(var in names(correlation_data)) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 16, cex = 0.6) - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- 
set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6) - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# Calculate correlation matrix (Spearman only) -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Print correlation matrix with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plot (Spearman only) -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -# Save results to CSV files -write.csv(round(cor_matrix_spearman, 5), "spearman_correlations.csv") -write.csv(round(desc_stats, 5), "descriptive_statistics.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract significant correlations between sets -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlation_exp1.csv", row.names = FALSE) \ No newline at end of file
diff --git a/.history/eohi1/correlations - scales_20251008155221.r b/.history/eohi1/correlations - scales_20251008155221.r deleted file mode 100644 index 048bbb2..0000000
diff --git a/.history/eohi1/correlations - scales_20251008155223.r b/.history/eohi1/correlations - scales_20251008155223.r deleted file mode 100644 index 048bbb2..0000000
diff --git a/.history/eohi1/correlations - scales_20251008155225.r b/.history/eohi1/correlations - scales_20251008155225.r deleted file mode 100644 index 048bbb2..0000000
diff --git a/.history/eohi1/correlations - scales_20251008171710.r b/.history/eohi1/correlations - scales_20251008171710.r deleted file mode 100644 index 048bbb2..0000000
diff --git a/.history/eohi1/dataP 02 - cor means average over time frames_20251008000048.r b/.history/eohi1/dataP 02 - cor means average over time frames_20251008000048.r deleted file mode 100644 index 490ab62..0000000 --- a/.history/eohi1/dataP 02 - cor means average over time frames_20251008000048.r +++ /dev/null @@ -1,45 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define the NPastDiff variables by domain -nPastDiff_pref <- c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel") -nPastDiff_pers <- c("NPastDiff_pers_extravert", "NPastDiff_pers_critical", "NPastDiff_pers_dependable", "NPastDiff_pers_anxious", "NPastDiff_pers_complex") -nPastDiff_val <- c("NPastDiff_val_obey", "NPastDiff_val_trad", "NPastDiff_val_opinion", "NPastDiff_val_performance", "NPastDiff_val_justice") -nPastDiff_life <- c("NPastDiff_life_ideal", "NPastDiff_life_excellent", "NPastDiff_life_satisfied", "NPastDiff_life_important", "NPastDiff_life_change") - -# Define the NFutDiff variables by domain -nFutDiff_pref <- c("NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel") -nFutDiff_pers <- c("NFutDiff_pers_extravert", "NFutDiff_pers_critical", "NFutDiff_pers_dependable", "NFutDiff_pers_anxious", "NFutDiff_pers_complex") -nFutDiff_val <- c("NFutDiff_val_obey", "NFutDiff_val_trad", "NFutDiff_val_opinion", "NFutDiff_val_performance", "NFutDiff_val_justice") -nFutDiff_life <- c("NFutDiff_life_ideal", "NFutDiff_life_excellent", "NFutDiff_life_satisfied", "NFutDiff_life_important", "NFutDiff_life_change") - -# Calculate domain means for NPastDiff -exp1_data$NPastDiff_pref_mean <- rowMeans(exp1_data[, nPastDiff_pref], na.rm = TRUE) -exp1_data$NPastDiff_pers_mean <- rowMeans(exp1_data[, nPastDiff_pers], na.rm = TRUE) -exp1_data$NPastDiff_val_mean <- rowMeans(exp1_data[, nPastDiff_val], na.rm = TRUE) -exp1_data$NPastDiff_life_mean <- rowMeans(exp1_data[, nPastDiff_life], na.rm = TRUE) - -# Calculate domain means for NFutDiff -exp1_data$NFutDiff_pref_mean <- rowMeans(exp1_data[, nFutDiff_pref], na.rm = TRUE) -exp1_data$NFutDiff_pers_mean <- rowMeans(exp1_data[, nFutDiff_pers], na.rm = TRUE) -exp1_data$NFutDiff_val_mean <- rowMeans(exp1_data[, nFutDiff_val], na.rm = TRUE) -exp1_data$NFutDiff_life_mean <- rowMeans(exp1_data[, nFutDiff_life], na.rm = TRUE) - -# Save the updated data -write.csv(exp1_data, "exp1.csv", row.names = FALSE) - -# Display summary of the calculated domain 
means -cat("NPastDiff domain means summary:\n") -summary(exp1_data[, c("NPastDiff_pref_mean", "NPastDiff_pers_mean", "NPastDiff_val_mean", "NPastDiff_life_mean")]) - -cat("\nNFutDiff domain means summary:\n") -summary(exp1_data[, c("NFutDiff_pref_mean", "NFutDiff_pers_mean", "NFutDiff_val_mean", "NFutDiff_life_mean")]) - -# Show first few rows to verify calculations -cat("\nFirst 5 rows of calculated domain means:\n") -domain_means_cols <- c("NPastDiff_pref_mean", "NPastDiff_pers_mean", "NPastDiff_val_mean", "NPastDiff_life_mean", - "NFutDiff_pref_mean", "NFutDiff_pers_mean", "NFutDiff_val_mean", "NFutDiff_life_mean") -print(exp1_data[1:5, domain_means_cols])
diff --git a/.history/eohi1/dataP 02 - cor means average over time frames_20251008000055.r b/.history/eohi1/dataP 02 - cor means average over time frames_20251008000055.r deleted file mode 100644 index 490ab62..0000000
diff --git a/.history/eohi1/dataP 02 - cor means average over time frames_20251008000150.r b/.history/eohi1/dataP 02 - cor means average over time frames_20251008000150.r deleted file mode 100644 index 4fb2e07..0000000 --- a/.history/eohi1/dataP 02 - cor means average over time frames_20251008000150.r +++ /dev/null @@ -1,36 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define all NPastDiff and NFutDiff variables -all_diff_vars <- c( - "NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel", - "NPastDiff_pers_extravert", 
"NPastDiff_pers_critical", "NPastDiff_pers_dependable", "NPastDiff_pers_anxious", "NPastDiff_pers_complex", - "NPastDiff_val_obey", "NPastDiff_val_trad", "NPastDiff_val_opinion", "NPastDiff_val_performance", "NPastDiff_val_justice", - "NPastDiff_life_ideal", "NPastDiff_life_excellent", "NPastDiff_life_satisfied", "NPastDiff_life_important", "NPastDiff_life_change", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel", - "NFutDiff_pers_extravert", "NFutDiff_pers_critical", "NFutDiff_pers_dependable", "NFutDiff_pers_anxious", "NFutDiff_pers_complex", - "NFutDiff_val_obey", "NFutDiff_val_trad", "NFutDiff_val_opinion", "NFutDiff_val_performance", "NFutDiff_val_justice", - "NFutDiff_life_ideal", "NFutDiff_life_excellent", "NFutDiff_life_satisfied", "NFutDiff_life_important", "NFutDiff_life_change" -) - -# Calculate domain_mean as average of all 40 variables -exp1_data$domain_mean <- rowMeans(exp1_data[, all_diff_vars], na.rm = TRUE) - -# Save the updated data -write.csv(exp1_data, "exp1.csv", row.names = FALSE) - -# Display summary of the calculated domain_mean -cat("Domain mean summary (average of all 40 NPastDiff and NFutDiff variables):\n") -summary(exp1_data$domain_mean) - -# Show first few rows to verify calculations -cat("\nFirst 5 rows of calculated domain_mean:\n") -print(exp1_data[1:5, "domain_mean"]) diff --git a/.history/eohi1/dataP 02 - cor means average over time frames_20251008000203.r b/.history/eohi1/dataP 02 - cor means average over time frames_20251008000203.r deleted file mode 100644 index e310370..0000000 --- a/.history/eohi1/dataP 02 - cor means average over time frames_20251008000203.r +++ /dev/null @@ -1,31 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define all NPastDiff and NFutDiff variables -all_diff_vars <- c( - "NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel", - "NPastDiff_pers_extravert", "NPastDiff_pers_critical", "NPastDiff_pers_dependable", "NPastDiff_pers_anxious", "NPastDiff_pers_complex", - "NPastDiff_val_obey", "NPastDiff_val_trad", "NPastDiff_val_opinion", "NPastDiff_val_performance", "NPastDiff_val_justice", - "NPastDiff_life_ideal", "NPastDiff_life_excellent", "NPastDiff_life_satisfied", "NPastDiff_life_important", "NPastDiff_life_change", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel", - "NFutDiff_pers_extravert", "NFutDiff_pers_critical", "NFutDiff_pers_dependable", "NFutDiff_pers_anxious", "NFutDiff_pers_complex", - "NFutDiff_val_obey", "NFutDiff_val_trad", "NFutDiff_val_opinion", "NFutDiff_val_performance", "NFutDiff_val_justice", - "NFutDiff_life_ideal", "NFutDiff_life_excellent", "NFutDiff_life_satisfied", "NFutDiff_life_important", "NFutDiff_life_change" -) - -# Calculate domain_mean as average of all 40 variables -exp1_data$domain_mean <- rowMeans(exp1_data[, all_diff_vars], na.rm = TRUE) - -# Save the updated data -write.csv(exp1_data, "exp1.csv", row.names = FALSE) - -# Display summary of the calculated domain_mean -cat("Domain mean summary (average of all 40 NPastDiff and NFutDiff variables):\n") -summary(exp1_data$domain_mean) - -# Show first few rows to verify calculations -cat("\nFirst 5 rows of calculated domain_mean:\n") -print(exp1_data[1:5, "domain_mean"]) diff --git a/.history/eohi1/dataP 02 - cor means average over time frames_20251008000212.r 
b/.history/eohi1/dataP 02 - cor means average over time frames_20251008000212.r deleted file mode 100644 index e310370..0000000 --- a/.history/eohi1/dataP 02 - cor means average over time frames_20251008000212.r +++ /dev/null @@ -1,31 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define all NPastDiff and NFutDiff variables -all_diff_vars <- c( - "NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel", - "NPastDiff_pers_extravert", "NPastDiff_pers_critical", "NPastDiff_pers_dependable", "NPastDiff_pers_anxious", "NPastDiff_pers_complex", - "NPastDiff_val_obey", "NPastDiff_val_trad", "NPastDiff_val_opinion", "NPastDiff_val_performance", "NPastDiff_val_justice", - "NPastDiff_life_ideal", "NPastDiff_life_excellent", "NPastDiff_life_satisfied", "NPastDiff_life_important", "NPastDiff_life_change", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel", - "NFutDiff_pers_extravert", "NFutDiff_pers_critical", "NFutDiff_pers_dependable", "NFutDiff_pers_anxious", "NFutDiff_pers_complex", - "NFutDiff_val_obey", "NFutDiff_val_trad", "NFutDiff_val_opinion", "NFutDiff_val_performance", "NFutDiff_val_justice", - "NFutDiff_life_ideal", "NFutDiff_life_excellent", "NFutDiff_life_satisfied", "NFutDiff_life_important", "NFutDiff_life_change" -) - -# Calculate domain_mean as average of all 40 variables -exp1_data$domain_mean <- rowMeans(exp1_data[, all_diff_vars], na.rm = TRUE) - -# Save the updated data -write.csv(exp1_data, "exp1.csv", row.names = FALSE) - -# Display summary of the calculated domain_mean -cat("Domain mean summary (average of all 40 NPastDiff and NFutDiff variables):\n") -summary(exp1_data$domain_mean) - -# Show first few rows to verify calculations -cat("\nFirst 5 rows of calculated domain_mean:\n") -print(exp1_data[1:5, "domain_mean"]) diff --git a/.history/eohi1/dataP 02 - cor means average over time frames_20251008000542.r b/.history/eohi1/dataP 02 - cor means average over time frames_20251008000542.r deleted file mode 100644 index 8049c0a..0000000 --- a/.history/eohi1/dataP 02 - cor means average over time frames_20251008000542.r +++ /dev/null @@ -1,38 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define all NPastDiff and NFutDiff variables -all_diff_vars <- c( - "NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel", - "NPastDiff_pers_extravert", "NPastDiff_pers_critical", "NPastDiff_pers_dependable", "NPastDiff_pers_anxious", "NPastDiff_pers_complex", - "NPastDiff_val_obey", "NPastDiff_val_trad", "NPastDiff_val_opinion", "NPastDiff_val_performance", "NPastDiff_val_justice", - "NPastDiff_life_ideal", "NPastDiff_life_excellent", "NPastDiff_life_satisfied", "NPastDiff_life_important", "NPastDiff_life_change", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel", - "NFutDiff_pers_extravert", "NFutDiff_pers_critical", "NFutDiff_pers_dependable", "NFutDiff_pers_anxious", "NFutDiff_pers_complex", - "NFutDiff_val_obey", "NFutDiff_val_trad", "NFutDiff_val_opinion", "NFutDiff_val_performance", "NFutDiff_val_justice", - "NFutDiff_life_ideal", "NFutDiff_life_excellent", "NFutDiff_life_satisfied", "NFutDiff_life_important", "NFutDiff_life_change" -) - -# Define DGEN variables to average 
-dgen_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -# Calculate domain_mean as average of all 40 variables -exp1_data$domain_mean <- rowMeans(exp1_data[, all_diff_vars], na.rm = TRUE) - -# Calculate DGEN_mean as average of all 8 DGEN variables -exp1_data$DGEN_mean <- rowMeans(exp1_data[, dgen_vars], na.rm = TRUE) - -# Save the updated data -write.csv(exp1_data, "exp1.csv", row.names = FALSE) - -# Display summary of the calculated domain_mean -cat("Domain mean summary (average of all 40 NPastDiff and NFutDiff variables):\n") -summary(exp1_data$domain_mean) - -# Show first few rows to verify calculations -cat("\nFirst 5 rows of calculated domain_mean:\n") -print(exp1_data[1:5, "domain_mean"]) diff --git a/.history/eohi1/dataP 02 - cor means average over time frames_20251008000547.r b/.history/eohi1/dataP 02 - cor means average over time frames_20251008000547.r deleted file mode 100644 index 5a65108..0000000 --- a/.history/eohi1/dataP 02 - cor means average over time frames_20251008000547.r +++ /dev/null @@ -1,45 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load the data -exp1_data <- read.csv("exp1.csv") - -# Define all NPastDiff and NFutDiff variables -all_diff_vars <- c( - "NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel", - "NPastDiff_pers_extravert", "NPastDiff_pers_critical", "NPastDiff_pers_dependable", "NPastDiff_pers_anxious", "NPastDiff_pers_complex", - "NPastDiff_val_obey", "NPastDiff_val_trad", "NPastDiff_val_opinion", "NPastDiff_val_performance", "NPastDiff_val_justice", - "NPastDiff_life_ideal", "NPastDiff_life_excellent", "NPastDiff_life_satisfied", "NPastDiff_life_important", "NPastDiff_life_change", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel", - "NFutDiff_pers_extravert", "NFutDiff_pers_critical", "NFutDiff_pers_dependable", "NFutDiff_pers_anxious", "NFutDiff_pers_complex", - "NFutDiff_val_obey", "NFutDiff_val_trad", "NFutDiff_val_opinion", "NFutDiff_val_performance", "NFutDiff_val_justice", - "NFutDiff_life_ideal", "NFutDiff_life_excellent", "NFutDiff_life_satisfied", "NFutDiff_life_important", "NFutDiff_life_change" -) - -# Define DGEN variables to average -dgen_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -# Calculate domain_mean as average of all 40 variables -exp1_data$domain_mean <- rowMeans(exp1_data[, all_diff_vars], na.rm = TRUE) - -# Calculate DGEN_mean as average of all 8 DGEN variables -exp1_data$DGEN_mean <- rowMeans(exp1_data[, dgen_vars], na.rm = TRUE) - -# Save the updated data -write.csv(exp1_data, "exp1.csv", row.names = FALSE) - -# Display summary of the calculated means -cat("Domain mean summary (average of all 40 NPastDiff and NFutDiff variables):\n") -summary(exp1_data$domain_mean) - -cat("\nDGEN mean summary (average of all 8 DGEN variables):\n") -summary(exp1_data$DGEN_mean) - -# Show first few rows to verify calculations -cat("\nFirst 5 rows of calculated means:\n") -print(exp1_data[1:5, c("domain_mean", "DGEN_mean")]) - -# Show the individual DGEN values for first 5 rows to verify math -cat("\nFirst 5 rows of individual DGEN values for verification:\n") -print(exp1_data[1:5, dgen_vars]) diff --git a/.history/eohi1/dataP 02 - cor means average over time frames_20251008000552.r 
b/.history/eohi1/dataP 02 - cor means average over time frames_20251008000552.r deleted file mode 100644 index 5a65108..0000000
diff --git a/.history/eohi1/dataP 02 - cor means average over time frames_20251008000600.r b/.history/eohi1/dataP 02 - cor means average over time frames_20251008000600.r deleted file mode 100644 index 5a65108..0000000
diff --git a/.history/eohi1/dataP 02 - cor means average over time frames_20251008000956.r b/.history/eohi1/dataP 02 - cor means average over time frames_20251008000956.r deleted file mode 100644 index 5a65108..0000000
diff --git a/.history/eohi1/datap 03 - CORRECT domain specific EHI vars_20251008152448.r b/.history/eohi1/datap 03 - CORRECT domain specific EHI vars_20251008152448.r deleted file mode 100644 index e69de29..0000000 diff --git a/.history/eohi1/datap 03 - CORRECT domain specific EHI vars_20251008152542.r b/.history/eohi1/datap 03 - CORRECT domain specific EHI vars_20251008152542.r deleted file mode 100644 index 08bcbc3..0000000 --- a/.history/eohi1/datap 03 - CORRECT domain specific EHI vars_20251008152542.r +++ /dev/null @@ -1,2 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") \ No newline at end of file diff --git a/.history/eohi1/datap 03 - CORRECT domain specific EHI vars_20251008153309.r b/.history/eohi1/datap 03 - CORRECT domain specific EHI vars_20251008153309.r deleted file mode 100644 index 8980819..0000000 --- a/.history/eohi1/datap 03 - CORRECT domain specific EHI vars_20251008153309.r +++ /dev/null @@ -1,104 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Load data -ehi1 <- read.csv("ehi1.csv") - -# Create EHI difference variables (NPast - NFut) -# Preferences -ehi1$ehi_pref_read <- ehi1$NPastDiff_pref_read - ehi1$NFutDiff_pref_read -ehi1$ehi_pref_music <- ehi1$NPastDiff_pref_music - ehi1$NFutDiff_pref_music -ehi1$ehi_pref_tv <- ehi1$NPastDiff_pref_tv - ehi1$NFutDiff_pref_tv -ehi1$ehi_pref_nap <- ehi1$NPastDiff_pref_nap - ehi1$NFutDiff_pref_nap -ehi1$ehi_pref_travel <- ehi1$NPastDiff_pref_travel - ehi1$NFutDiff_pref_travel - -# Personality -ehi1$ehi_pers_extravert <- ehi1$NPastDiff_pers_extravert - ehi1$NFutDiff_pers_extravert -ehi1$ehi_pers_critical <- ehi1$NPastDiff_pers_critical - ehi1$NFutDiff_pers_critical -ehi1$ehi_pers_dependable <- ehi1$NPastDiff_pers_dependable - ehi1$NFutDiff_pers_dependable -ehi1$ehi_pers_anxious <- ehi1$NPastDiff_pers_anxious - ehi1$NFutDiff_pers_anxious -ehi1$ehi_pers_complex <- ehi1$NPastDiff_pers_complex - ehi1$NFutDiff_pers_complex - -# Values -ehi1$ehi_val_obey <- ehi1$NPastDiff_val_obey - ehi1$NFutDiff_val_obey -ehi1$ehi_val_trad <- ehi1$NPastDiff_val_trad - ehi1$NFutDiff_val_trad -ehi1$ehi_val_opinion <- ehi1$NPastDiff_val_opinion - ehi1$NFutDiff_val_opinion -ehi1$ehi_val_performance <- ehi1$NPastDiff_val_performance - ehi1$NFutDiff_val_performance -ehi1$ehi_val_justice <- ehi1$NPastDiff_val_justice - ehi1$NFutDiff_val_justice - -# Life satisfaction -ehi1$ehi_life_ideal <- ehi1$NPastDiff_life_ideal - ehi1$NFutDiff_life_ideal -ehi1$ehi_life_excellent <- ehi1$NPastDiff_life_excellent - ehi1$NFutDiff_life_excellent -ehi1$ehi_life_satisfied <- ehi1$NPastDiff_life_satisfied - ehi1$NFutDiff_life_satisfied -ehi1$ehi_life_important <- ehi1$NPastDiff_life_important - ehi1$NFutDiff_life_important -ehi1$ehi_life_change <- 
-
-# QA: Verify calculations
-cat("\n=== QUALITY ASSURANCE CHECK ===\n")
-cat("Verifying EHI difference calculations (NPast - NFut)\n\n")
-
-qa_pairs <- list(
-  list(npast = "NPastDiff_pref_read", nfut = "NFutDiff_pref_read", target = "ehi_pref_read"),
-  list(npast = "NPastDiff_pref_music", nfut = "NFutDiff_pref_music", target = "ehi_pref_music"),
-  list(npast = "NPastDiff_pref_tv", nfut = "NFutDiff_pref_tv", target = "ehi_pref_tv"),
-  list(npast = "NPastDiff_pref_nap", nfut = "NFutDiff_pref_nap", target = "ehi_pref_nap"),
-  list(npast = "NPastDiff_pref_travel", nfut = "NFutDiff_pref_travel", target = "ehi_pref_travel"),
-  list(npast = "NPastDiff_pers_extravert", nfut = "NFutDiff_pers_extravert", target = "ehi_pers_extravert"),
-  list(npast = "NPastDiff_pers_critical", nfut = "NFutDiff_pers_critical", target = "ehi_pers_critical"),
-  list(npast = "NPastDiff_pers_dependable", nfut = "NFutDiff_pers_dependable", target = "ehi_pers_dependable"),
-  list(npast = "NPastDiff_pers_anxious", nfut = "NFutDiff_pers_anxious", target = "ehi_pers_anxious"),
-  list(npast = "NPastDiff_pers_complex", nfut = "NFutDiff_pers_complex", target = "ehi_pers_complex"),
-  list(npast = "NPastDiff_val_obey", nfut = "NFutDiff_val_obey", target = "ehi_val_obey"),
-  list(npast = "NPastDiff_val_trad", nfut = "NFutDiff_val_trad", target = "ehi_val_trad"),
-  list(npast = "NPastDiff_val_opinion", nfut = "NFutDiff_val_opinion", target = "ehi_val_opinion"),
-  list(npast = "NPastDiff_val_performance", nfut = "NFutDiff_val_performance", target = "ehi_val_performance"),
-  list(npast = "NPastDiff_val_justice", nfut = "NFutDiff_val_justice", target = "ehi_val_justice"),
-  list(npast = "NPastDiff_life_ideal", nfut = "NFutDiff_life_ideal", target = "ehi_life_ideal"),
-  list(npast = "NPastDiff_life_excellent", nfut = "NFutDiff_life_excellent", target = "ehi_life_excellent"),
-  list(npast = "NPastDiff_life_satisfied", nfut = "NFutDiff_life_satisfied", target = "ehi_life_satisfied"),
-  list(npast = "NPastDiff_life_important", nfut = "NFutDiff_life_important", target = "ehi_life_important"),
-  list(npast = "NPastDiff_life_change", nfut = "NFutDiff_life_change", target = "ehi_life_change")
-)
-
-all_checks_passed <- TRUE
-
-for (pair in qa_pairs) {
-  # Calculate expected difference
-  expected_diff <- ehi1[[pair$npast]] - ehi1[[pair$nfut]]
-
-  # Get actual value in target variable
-  actual_value <- ehi1[[pair$target]]
-
-  # Compare (allowing for floating point precision issues)
-  discrepancies <- which(abs(expected_diff - actual_value) > 1e-10)
-
-  if (length(discrepancies) > 0) {
-    cat(sprintf("FAIL: %s\n", pair$target))
-    cat(sprintf("  Found %d discrepancies in rows: %s\n",
-                length(discrepancies),
-                paste(head(discrepancies, 10), collapse = ", ")))
-
-    # Show first discrepancy details
-    row_num <- discrepancies[1]
-    cat(sprintf("  Example (row %d): %s (%g) - %s (%g) = %g, but %s = %g\n",
-                row_num,
-                pair$npast, ehi1[[pair$npast]][row_num],
-                pair$nfut, ehi1[[pair$nfut]][row_num],
-                expected_diff[row_num],
-                pair$target, actual_value[row_num]))
-    all_checks_passed <- FALSE
-  } else {
-    cat(sprintf("PASS: %s (n = %d)\n", pair$target, nrow(ehi1)))
-  }
-}
-
-cat("\n")
-if (all_checks_passed) {
-  cat("*** ALL QA CHECKS PASSED ***\n")
-} else {
-  cat("*** SOME QA CHECKS FAILED - REVIEW ABOVE ***\n")
-}
-
-# Save updated dataset
-write.csv(ehi1, "ehi1.csv", row.names = FALSE)
-cat("\nDataset saved to ehi1.csv\n")
\ No newline at end of file
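The twenty ehi_* assignments in the deleted script above follow one mechanical rule: each ehi_<suffix> column is NPastDiff_<suffix> minus NFutDiff_<suffix>. The same columns can be derived programmatically. A minimal sketch, not code from the repository; it assumes ehi1 is the data frame loaded by the script and that every NPastDiff_* column has a same-suffix NFutDiff_* partner:

# Sketch: derive all ehi_* difference columns from the shared suffixes.
# Assumption: each NPastDiff_<suffix> column has a matching NFutDiff_<suffix>.
past_cols <- grep("^NPastDiff_", names(ehi1), value = TRUE)
for (sfx in sub("^NPastDiff_", "", past_cols)) {
  ehi1[[paste0("ehi_", sfx)]] <-
    ehi1[[paste0("NPastDiff_", sfx)]] - ehi1[[paste0("NFutDiff_", sfx)]]
}

The QA loop in the script would apply to the generated columns unchanged.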
diff --git a/.history/eohi1/datap 03 - CORRECT domain specific EHI vars_20251008153323.r b/.history/eohi1/datap 03 - CORRECT domain specific EHI vars_20251008153323.r
deleted file mode 100644
index 8980819..0000000
diff --git a/.history/eohi1/datap 03 - CORRECT domain specific EHI vars_20251008153333.r b/.history/eohi1/datap 03 - CORRECT domain specific EHI vars_20251008153333.r
deleted file mode 100644
index 8980819..0000000
diff --git a/.history/eohi1/datap 03 - CORRECT domain specific EHI vars_20251008153816.r b/.history/eohi1/datap 03 - CORRECT domain specific EHI vars_20251008153816.r
deleted file mode 100644
index 8980819..0000000
diff --git a/.history/eohi1/datap 03 - CORRECT domain specific EHI vars_20251008162958.r b/.history/eohi1/datap 03 - CORRECT domain specific EHI vars_20251008162958.r
deleted file mode 100644
index 8980819..0000000
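The four snapshot deletions above (all blob 8980819, byte-identical to the 153309 version shown in full) carry the same tolerance check: recompute each difference and flag rows where the absolute gap exceeds 1e-10. Base R can express a per-column version of that comparison in one line; a sketch under the same assumptions, using the columns created in the first datap 03 script:

# Sketch: one-line tolerance check for a single column pair.
# all.equal() compares numeric vectors within a tolerance (relative by default),
# and isTRUE() turns its verdict into a single logical for stopifnot().
expected <- ehi1$NPastDiff_pref_read - ehi1$NFutDiff_pref_read
stopifnot(isTRUE(all.equal(expected, ehi1$ehi_pref_read, tolerance = 1e-10)))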
diff --git a/.history/eohi1/datap 04 - CORRECT ehi var means_20251008153939.txt b/.history/eohi1/datap 04 - CORRECT ehi var means_20251008153939.txt
deleted file mode 100644
index e69de29..0000000
diff --git a/.history/eohi1/datap 04 - CORRECT ehi var means_20251008153946.txt b/.history/eohi1/datap 04 - CORRECT ehi var means_20251008153946.txt
deleted file mode 100644
index 931ce4f..0000000
--- a/.history/eohi1/datap 04 - CORRECT ehi var means_20251008153946.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-options(scipen = 999)
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Load data
-ehi1 <- read.csv("ehi1.csv")
\ No newline at end of file
diff --git a/.history/eohi1/datap 04 - CORRECT ehi var means_20251008153958.r b/.history/eohi1/datap 04 - CORRECT ehi var means_20251008153958.r
deleted file mode 100644
index e69de29..0000000
diff --git a/.history/eohi1/datap 04 - CORRECT ehi var means_20251008153959.r b/.history/eohi1/datap 04 - CORRECT ehi var means_20251008153959.r
deleted file mode 100644
index 931ce4f..0000000
--- a/.history/eohi1/datap 04 - CORRECT ehi var means_20251008153959.r
+++ /dev/null
@@ -1,5 +0,0 @@
-options(scipen = 999)
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Load data
-ehi1 <- read.csv("ehi1.csv")
\ No newline at end of file
diff --git a/.history/eohi1/datap 04 - CORRECT ehi var means_20251008154513.r b/.history/eohi1/datap 04 - CORRECT ehi var means_20251008154513.r
deleted file mode 100644
index bcce600..0000000
--- a/.history/eohi1/datap 04 - CORRECT ehi var means_20251008154513.r
+++ /dev/null
@@ -1,167 +0,0 @@
-options(scipen = 999)
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Load data
-ehi1 <- read.csv("ehi1.csv")
-
-# Calculate mean scores for EHI variables
-
-# 1. Preferences mean
-ehi1$ehi_pref_mean <- rowMeans(ehi1[, c("ehi_pref_read", "ehi_pref_music",
-                                        "ehi_pref_tv", "ehi_pref_nap",
-                                        "ehi_pref_travel")], na.rm = TRUE)
-
-# 2. Personality mean
-ehi1$ehi_pers_mean <- rowMeans(ehi1[, c("ehi_pers_extravert", "ehi_pers_critical",
-                                        "ehi_pers_dependable", "ehi_pers_anxious",
-                                        "ehi_pers_complex")], na.rm = TRUE)
-
-# 3. Values mean
-ehi1$ehi_val_mean <- rowMeans(ehi1[, c("ehi_val_obey", "ehi_val_trad",
-                                       "ehi_val_opinion", "ehi_val_performance",
-                                       "ehi_val_justice")], na.rm = TRUE)
-
-# 4. Life satisfaction mean
-ehi1$ehi_life_mean <- rowMeans(ehi1[, c("ehi_life_ideal", "ehi_life_excellent",
-                                        "ehi_life_satisfied", "ehi_life_important",
-                                        "ehi_life_change")], na.rm = TRUE)
-
-# 5. Global mean (all 20 variables)
-ehi1$ehi_global_mean <- rowMeans(ehi1[, c("ehi_pref_read", "ehi_pref_music",
-                                          "ehi_pref_tv", "ehi_pref_nap",
-                                          "ehi_pref_travel",
-                                          "ehi_pers_extravert", "ehi_pers_critical",
-                                          "ehi_pers_dependable", "ehi_pers_anxious",
-                                          "ehi_pers_complex",
-                                          "ehi_val_obey", "ehi_val_trad",
-                                          "ehi_val_opinion", "ehi_val_performance",
-                                          "ehi_val_justice",
-                                          "ehi_life_ideal", "ehi_life_excellent",
-                                          "ehi_life_satisfied", "ehi_life_important",
-                                          "ehi_life_change")], na.rm = TRUE)
-
-# QA: Verify mean calculations
-cat("\n=== QUALITY ASSURANCE CHECK ===\n")
-cat("Verifying EHI mean calculations\n\n")
-
-cat("--- FIRST 5 ROWS: PREFERENCES MEAN ---\n")
-for (i in 1:5) {
-  vals <- c(ehi1$ehi_pref_read[i], ehi1$ehi_pref_music[i],
-            ehi1$ehi_pref_tv[i], ehi1$ehi_pref_nap[i],
-            ehi1$ehi_pref_travel[i])
-  calc_mean <- mean(vals, na.rm = TRUE)
-  actual_mean <- ehi1$ehi_pref_mean[i]
-  match <- abs(calc_mean - actual_mean) < 1e-10
-  cat(sprintf("Row %d: [%g, %g, %g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n",
-              i, vals[1], vals[2], vals[3], vals[4], vals[5],
-              calc_mean, actual_mean, ifelse(match, "✓", "✗")))
-}
-
-cat("\n--- FIRST 5 ROWS: PERSONALITY MEAN ---\n")
-for (i in 1:5) {
-  vals <- c(ehi1$ehi_pers_extravert[i], ehi1$ehi_pers_critical[i],
-            ehi1$ehi_pers_dependable[i], ehi1$ehi_pers_anxious[i],
-            ehi1$ehi_pers_complex[i])
-  calc_mean <- mean(vals, na.rm = TRUE)
-  actual_mean <- ehi1$ehi_pers_mean[i]
-  match <- abs(calc_mean - actual_mean) < 1e-10
-  cat(sprintf("Row %d: [%g, %g, %g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n",
-              i, vals[1], vals[2], vals[3], vals[4], vals[5],
-              calc_mean, actual_mean, ifelse(match, "✓", "✗")))
-}
-
-cat("\n--- FIRST 5 ROWS: VALUES MEAN ---\n")
-for (i in 1:5) {
-  vals <- c(ehi1$ehi_val_obey[i], ehi1$ehi_val_trad[i],
-            ehi1$ehi_val_opinion[i], ehi1$ehi_val_performance[i],
-            ehi1$ehi_val_justice[i])
-  calc_mean <- mean(vals, na.rm = TRUE)
-  actual_mean <- ehi1$ehi_val_mean[i]
-  match <- abs(calc_mean - actual_mean) < 1e-10
-  cat(sprintf("Row %d: [%g, %g, %g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n",
-              i, vals[1], vals[2], vals[3], vals[4], vals[5],
-              calc_mean, actual_mean, ifelse(match, "✓", "✗")))
-}
-
-cat("\n--- FIRST 5 ROWS: LIFE SATISFACTION MEAN ---\n")
-for (i in 1:5) {
-  vals <- c(ehi1$ehi_life_ideal[i], ehi1$ehi_life_excellent[i],
-            ehi1$ehi_life_satisfied[i], ehi1$ehi_life_important[i],
-            ehi1$ehi_life_change[i])
-  calc_mean <- mean(vals, na.rm = TRUE)
-  actual_mean <- ehi1$ehi_life_mean[i]
-  match <- abs(calc_mean - actual_mean) < 1e-10
-  cat(sprintf("Row %d: [%g, %g, %g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n",
-              i, vals[1], vals[2], vals[3], vals[4], vals[5],
-              calc_mean, actual_mean, ifelse(match, "✓", "✗")))
-}
-
-cat("\n--- FIRST 5 ROWS: GLOBAL MEAN (20 variables) ---\n")
-for (i in 1:5) {
-  vals <- c(ehi1$ehi_pref_read[i], ehi1$ehi_pref_music[i],
-            ehi1$ehi_pref_tv[i], ehi1$ehi_pref_nap[i],
-            ehi1$ehi_pref_travel[i],
-            ehi1$ehi_pers_extravert[i], ehi1$ehi_pers_critical[i],
-            ehi1$ehi_pers_dependable[i], ehi1$ehi_pers_anxious[i],
-            ehi1$ehi_pers_complex[i],
-            ehi1$ehi_val_obey[i], ehi1$ehi_val_trad[i],
-            ehi1$ehi_val_opinion[i], ehi1$ehi_val_performance[i],
-            ehi1$ehi_val_justice[i],
-            ehi1$ehi_life_ideal[i], ehi1$ehi_life_excellent[i],
-            ehi1$ehi_life_satisfied[i], ehi1$ehi_life_important[i],
-            ehi1$ehi_life_change[i])
-  calc_mean <- mean(vals, na.rm = TRUE)
-  actual_mean <- ehi1$ehi_global_mean[i]
-  match <- abs(calc_mean - actual_mean) < 1e-10
-  cat(sprintf("Row %d: 20 values → Calculated: %.5f | Actual: %.5f %s\n",
-              i, calc_mean, actual_mean, ifelse(match, "✓", "✗")))
-}
-
-# Overall QA check for all rows
-cat("\n--- OVERALL QA CHECK (ALL ROWS) ---\n")
-
-qa_checks <- list(
-  list(vars = c("ehi_pref_read", "ehi_pref_music", "ehi_pref_tv", "ehi_pref_nap", "ehi_pref_travel"),
-       target = "ehi_pref_mean", name = "Preferences"),
-  list(vars = c("ehi_pers_extravert", "ehi_pers_critical", "ehi_pers_dependable", "ehi_pers_anxious", "ehi_pers_complex"),
-       target = "ehi_pers_mean", name = "Personality"),
-  list(vars = c("ehi_val_obey", "ehi_val_trad", "ehi_val_opinion", "ehi_val_performance", "ehi_val_justice"),
-       target = "ehi_val_mean", name = "Values"),
-  list(vars = c("ehi_life_ideal", "ehi_life_excellent", "ehi_life_satisfied", "ehi_life_important", "ehi_life_change"),
-       target = "ehi_life_mean", name = "Life Satisfaction"),
-  list(vars = c("ehi_pref_read", "ehi_pref_music", "ehi_pref_tv", "ehi_pref_nap", "ehi_pref_travel",
-                "ehi_pers_extravert", "ehi_pers_critical", "ehi_pers_dependable", "ehi_pers_anxious", "ehi_pers_complex",
-                "ehi_val_obey", "ehi_val_trad", "ehi_val_opinion", "ehi_val_performance", "ehi_val_justice",
-                "ehi_life_ideal", "ehi_life_excellent", "ehi_life_satisfied", "ehi_life_important", "ehi_life_change"),
-       target = "ehi_global_mean", name = "Global")
-)
-
-all_checks_passed <- TRUE
-
-for (check in qa_checks) {
-  calc_mean <- rowMeans(ehi1[, check$vars], na.rm = TRUE)
-  actual_mean <- ehi1[[check$target]]
-  discrepancies <- which(abs(calc_mean - actual_mean) > 1e-10)
-
-  if (length(discrepancies) > 0) {
-    cat(sprintf("FAIL: %s mean (n_vars = %d)\n", check$name, length(check$vars)))
-    cat(sprintf("  Found %d discrepancies in rows: %s\n",
-                length(discrepancies),
-                paste(head(discrepancies, 10), collapse = ", ")))
-    all_checks_passed <- FALSE
-  } else {
-    cat(sprintf("PASS: %s mean (n_vars = %d, n_rows = %d)\n",
-                check$name, length(check$vars), nrow(ehi1)))
-  }
-}
-
-cat("\n")
-if (all_checks_passed) {
-  cat("*** ALL QA CHECKS PASSED ***\n")
-} else {
-  cat("*** SOME QA CHECKS FAILED - REVIEW ABOVE ***\n")
-}
-
-# Save updated dataset
-write.csv(ehi1, "ehi1.csv", row.names = FALSE)
-cat("\nDataset saved to ehi1.csv\n")
\ No newline at end of file
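The five rowMeans() calls in the datap 04 script above repeat one shape over different column sets. A sketch of the same computation with the subscales declared once; this is a hypothetical restructuring, not the project's code, and it assumes the ehi_* item columns created by the datap 03 script:

# Sketch: declare each subscale once, then compute all subscale means in a loop.
subscales <- list(
  ehi_pref_mean = c("ehi_pref_read", "ehi_pref_music", "ehi_pref_tv",
                    "ehi_pref_nap", "ehi_pref_travel"),
  ehi_pers_mean = c("ehi_pers_extravert", "ehi_pers_critical",
                    "ehi_pers_dependable", "ehi_pers_anxious", "ehi_pers_complex"),
  ehi_val_mean  = c("ehi_val_obey", "ehi_val_trad", "ehi_val_opinion",
                    "ehi_val_performance", "ehi_val_justice"),
  ehi_life_mean = c("ehi_life_ideal", "ehi_life_excellent", "ehi_life_satisfied",
                    "ehi_life_important", "ehi_life_change")
)
for (nm in names(subscales)) {
  ehi1[[nm]] <- rowMeans(ehi1[, subscales[[nm]]], na.rm = TRUE)
}
# The global mean averages all 20 items directly; with na.rm = TRUE this can
# differ from averaging the four subscale means when items are missing.
ehi1$ehi_global_mean <- rowMeans(ehi1[, unlist(subscales)], na.rm = TRUE)

Keeping the column sets in one list would also let the overall QA block in the same script reuse subscales instead of restating the twenty names.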
diff --git a/.history/eohi1/datap 04 - CORRECT ehi var means_20251008154517.r b/.history/eohi1/datap 04 - CORRECT ehi var means_20251008154517.r
deleted file mode 100644
index bcce600..0000000
diff --git a/.history/eohi1/datap 04 - CORRECT ehi var means_20251008154531.r b/.history/eohi1/datap 04 - CORRECT ehi var means_20251008154531.r
deleted file mode 100644
index bcce600..0000000
diff --git a/.history/eohi1/datap 04 - CORRECT ehi var means_20251008162957.r b/.history/eohi1/datap 04 - CORRECT ehi var means_20251008162957.r
deleted file mode 100644
index bcce600..0000000
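One behavior of rowMeans(..., na.rm = TRUE) worth noting for both mean scripts (the three snapshot deletions above are byte-identical copies, blob bcce600, of the 154513 version shown in full): a row whose items are all NA yields NaN rather than NA, and a row with some missing items is averaged over fewer values. A small standalone illustration with made-up numbers:

# Sketch: rowMeans() with na.rm = TRUE on partially and fully missing rows.
m <- rbind(c(1, 2, 3),     # complete row      -> mean 2
           c(1, NA, 3),    # partial row       -> mean of 2 values = 2
           c(NA, NA, NA))  # fully missing row -> NaN (0/0), not NA
rowMeans(m, na.rm = TRUE)  # 2 2 NaN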
diff --git a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113023.r b/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113023.r
deleted file mode 100644
index e69de29..0000000
diff --git a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113024.r b/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113024.r
deleted file mode 100644
index d3699f5..0000000
--- a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113024.r
+++ /dev/null
@@ -1,5 +0,0 @@
-options(scipen = 999)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-data <- read.csv("ehi1.csv")
\ No newline at end of file
diff --git a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113338.r b/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113338.r
deleted file mode 100644
index 8db7693..0000000
--- a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113338.r
+++ /dev/null
@@ -1,13 +0,0 @@
-options(scipen = 999)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-data <- read.csv("ehi1.csv")
-
-# Check the levels of the demo_edu variable
-print("Current levels of demo_edu variable:")
-print(levels(factor(data$demo_edu)))
-
-# Also show the unique values and their frequencies
-print("\nUnique values and frequencies:")
-print(table(data$demo_edu, useNA = "ifany"))
\ No newline at end of file
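A side note on the level check above: for a character column, which is what read.csv() returns for demo_edu under R 4.0 and later (stringsAsFactors defaults to FALSE), levels(factor(x)) is simply the sorted unique non-NA values. An illustration only, assuming the data frame from the script:

# Sketch: for a character column, factor levels equal the sorted unique values.
identical(levels(factor(data$demo_edu)),
          sort(unique(data$demo_edu)))  # TRUE (sort() drops NA by default)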
diff --git a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113347.r b/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113347.r
deleted file mode 100644
index 8db7693..0000000
diff --git a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113517.r b/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113517.r
deleted file mode 100644
index e9de5fa..0000000
--- a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113517.r
+++ /dev/null
@@ -1,12 +0,0 @@
-options(scipen = 999)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-data <- read.csv("ehi1.csv")
-
-# Check the levels of the demo_edu variable
-print(levels(factor(data$demo_edu)))
-
-# Also show the unique values and their frequencies
-print("\nUnique values and frequencies:")
-print(table(data$demo_edu, useNA = "ifany"))
\ No newline at end of file
diff --git a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113654.r b/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113654.r
deleted file mode 100644
index 5949cfd..0000000
--- a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113654.r
+++ /dev/null
@@ -1,37 +0,0 @@
-options(scipen = 999)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-data <- read.csv("ehi1.csv")
-
-# Check the levels of the demo_edu variable
-print(levels(factor(data$demo_edu)))
-
-# Also show the unique values and their frequencies
-print("\nUnique values and frequencies:")
-print(table(data$demo_edu, useNA = "ifany"))
-
-# Recode demo_edu into 3 ordinal levels
-data$demo_edu_3level <- NA
-
-# HS_TS: High School and Trade School
-data$demo_edu_3level[data$demo_edu %in% c("High School (or equivalent)", "Trade School (non-military)")] <- "HS_TS"
-
-# C_Ug: College and University - Undergraduate
-data$demo_edu_3level[data$demo_edu %in% c("College Diploma/Certificate", "University - Undergraduate")] <- "C_Ug"
-
-# grad_prof: University - Graduate, University - PhD, and Professional Degree
-data$demo_edu_3level[data$demo_edu %in% c("University - Graduate (Masters)", "University - PhD", "Professional Degree (ex. JD/MD)")] <- "grad_prof"
-
-# Convert to ordered factor
-data$demo_edu_3level <- factor(data$demo_edu_3level,
-                               levels = c("HS_TS", "C_Ug", "grad_prof"),
-                               ordered = TRUE)
-
-# Check the recoded variable
-print("\nRecoded education levels:")
-print(table(data$demo_edu_3level, useNA = "ifany"))
-
-# Verify the recoding
-print("\nCross-tabulation of original vs recoded:")
-print(table(data$demo_edu, data$demo_edu_3level, useNA = "ifany"))
\ No newline at end of file
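The three %in% assignments above are one common recode idiom. An equivalent that keeps the whole mapping in a single named lookup vector; a sketch only, assuming the demo_edu labels exactly as they appear in the script:

# Sketch: the same 3-level recode via a named lookup vector.
edu_map <- c("High School (or equivalent)"     = "HS_TS",
             "Trade School (non-military)"     = "HS_TS",
             "College Diploma/Certificate"     = "C_Ug",
             "University - Undergraduate"      = "C_Ug",
             "University - Graduate (Masters)" = "grad_prof",
             "University - PhD"                = "grad_prof",
             "Professional Degree (ex. JD/MD)" = "grad_prof")
data$demo_edu_3level <- unname(edu_map[data$demo_edu])

Labels missing from the map fall out as NA, which the table(..., useNA = "ifany") checks in the script would surface. One small wrinkle in the scripts themselves: print("\n...") displays the backslash-n literally, whereas cat() is the function that interprets the escape.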
JD/MD)")] <- "grad_prof" - -# Convert to ordered factor -data$demo_edu_3level <- factor(data$demo_edu_3level, - levels = c("HS_TS", "C_Ug", "grad_prof"), - ordered = TRUE) - -# Check the recoded variable -print("\nRecoded education levels:") -print(table(data$demo_edu_3level, useNA = "ifany")) - -# Verify the recoding -print("\nCross-tabulation of original vs recoded:") -print(table(data$demo_edu, data$demo_edu_3level, useNA = "ifany")) \ No newline at end of file diff --git a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113658.r b/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113658.r deleted file mode 100644 index 5949cfd..0000000 --- a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113658.r +++ /dev/null @@ -1,37 +0,0 @@ -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -data <- read.csv("ehi1.csv") - -# Check the levels of the demo_edu variable -print(levels(factor(data$demo_edu))) - -# Also show the unique values and their frequencies -print("\nUnique values and frequencies:") -print(table(data$demo_edu, useNA = "ifany")) - -# Recode demo_edu into 3 ordinal levels -data$demo_edu_3level <- NA - -# HS_TS: High School and Trade School -data$demo_edu_3level[data$demo_edu %in% c("High School (or equivalent)", "Trade School (non-military)")] <- "HS_TS" - -# C_Ug: College and University - Undergraduate -data$demo_edu_3level[data$demo_edu %in% c("College Diploma/Certificate", "University - Undergraduate")] <- "C_Ug" - -# grad_prof: University - Graduate, University - PhD, and Professional Degree -data$demo_edu_3level[data$demo_edu %in% c("University - Graduate (Masters)", "University - PhD", "Professional Degree (ex. JD/MD)")] <- "grad_prof" - -# Convert to ordered factor -data$demo_edu_3level <- factor(data$demo_edu_3level, - levels = c("HS_TS", "C_Ug", "grad_prof"), - ordered = TRUE) - -# Check the recoded variable -print("\nRecoded education levels:") -print(table(data$demo_edu_3level, useNA = "ifany")) - -# Verify the recoding -print("\nCross-tabulation of original vs recoded:") -print(table(data$demo_edu, data$demo_edu_3level, useNA = "ifany")) \ No newline at end of file diff --git a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113726.r b/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113726.r deleted file mode 100644 index 1ee26e7..0000000 --- a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113726.r +++ /dev/null @@ -1,37 +0,0 @@ -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -data <- read.csv("ehi1.csv") - -# Check the levels of the demo_edu variable -print(levels(factor(data$demo_edu))) - -# Also show the unique values and their frequencies -print("\nUnique values and frequencies:") -print(table(data$demo_edu, useNA = "ifany")) - -# Recode demo_edu into 3 ordinal levels -data$edu3 <- NA - -# HS_TS: High School and Trade School -data$edu3[data$demo_edu %in% c("High School (or equivalent)", "Trade School (non-military)")] <- "HS_TS" - -# C_Ug: College and University - Undergraduate -data$edu3[data$demo_edu %in% c("College Diploma/Certificate", "University - Undergraduate")] <- "C_Ug" - -# grad_prof: University - Graduate, University - PhD, and Professional Degree -data$edu3[data$demo_edu %in% c("University - Graduate (Masters)", "University - PhD", "Professional Degree (ex. 
JD/MD)")] <- "grad_prof" - -# Convert to ordered factor -data$edu3 <- factor(data$edu3, - levels = c("HS_TS", "C_Ug", "grad_prof"), - ordered = TRUE) - -# Check the recoded variable -print("\nRecoded education levels:") -print(table(data$edu3, useNA = "ifany")) - -# Verify the recoding -print("\nCross-tabulation of original vs recoded:") -print(table(data$demo_edu, data$edu3, useNA = "ifany")) \ No newline at end of file diff --git a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113730.r b/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113730.r deleted file mode 100644 index 1ee26e7..0000000 --- a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113730.r +++ /dev/null @@ -1,37 +0,0 @@ -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -data <- read.csv("ehi1.csv") - -# Check the levels of the demo_edu variable -print(levels(factor(data$demo_edu))) - -# Also show the unique values and their frequencies -print("\nUnique values and frequencies:") -print(table(data$demo_edu, useNA = "ifany")) - -# Recode demo_edu into 3 ordinal levels -data$edu3 <- NA - -# HS_TS: High School and Trade School -data$edu3[data$demo_edu %in% c("High School (or equivalent)", "Trade School (non-military)")] <- "HS_TS" - -# C_Ug: College and University - Undergraduate -data$edu3[data$demo_edu %in% c("College Diploma/Certificate", "University - Undergraduate")] <- "C_Ug" - -# grad_prof: University - Graduate, University - PhD, and Professional Degree -data$edu3[data$demo_edu %in% c("University - Graduate (Masters)", "University - PhD", "Professional Degree (ex. JD/MD)")] <- "grad_prof" - -# Convert to ordered factor -data$edu3 <- factor(data$edu3, - levels = c("HS_TS", "C_Ug", "grad_prof"), - ordered = TRUE) - -# Check the recoded variable -print("\nRecoded education levels:") -print(table(data$edu3, useNA = "ifany")) - -# Verify the recoding -print("\nCross-tabulation of original vs recoded:") -print(table(data$demo_edu, data$edu3, useNA = "ifany")) \ No newline at end of file diff --git a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113735.r b/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113735.r deleted file mode 100644 index 1ee26e7..0000000 --- a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113735.r +++ /dev/null @@ -1,37 +0,0 @@ -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -data <- read.csv("ehi1.csv") - -# Check the levels of the demo_edu variable -print(levels(factor(data$demo_edu))) - -# Also show the unique values and their frequencies -print("\nUnique values and frequencies:") -print(table(data$demo_edu, useNA = "ifany")) - -# Recode demo_edu into 3 ordinal levels -data$edu3 <- NA - -# HS_TS: High School and Trade School -data$edu3[data$demo_edu %in% c("High School (or equivalent)", "Trade School (non-military)")] <- "HS_TS" - -# C_Ug: College and University - Undergraduate -data$edu3[data$demo_edu %in% c("College Diploma/Certificate", "University - Undergraduate")] <- "C_Ug" - -# grad_prof: University - Graduate, University - PhD, and Professional Degree -data$edu3[data$demo_edu %in% c("University - Graduate (Masters)", "University - PhD", "Professional Degree (ex. 
JD/MD)")] <- "grad_prof" - -# Convert to ordered factor -data$edu3 <- factor(data$edu3, - levels = c("HS_TS", "C_Ug", "grad_prof"), - ordered = TRUE) - -# Check the recoded variable -print("\nRecoded education levels:") -print(table(data$edu3, useNA = "ifany")) - -# Verify the recoding -print("\nCross-tabulation of original vs recoded:") -print(table(data$demo_edu, data$edu3, useNA = "ifany")) \ No newline at end of file diff --git a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113937.r b/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113937.r deleted file mode 100644 index 4649b36..0000000 --- a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113937.r +++ /dev/null @@ -1,39 +0,0 @@ -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -data <- read.csv("ehi1.csv") - -# Check the levels of the demo_edu variable -print(levels(factor(data$demo_edu))) - -# Also show the unique values and their frequencies -print("\nUnique values and frequencies:") -print(table(data$demo_edu, useNA = "ifany")) - -# Recode demo_edu into 3 ordinal levels -data$edu3 <- NA - -# HS_TS: High School and Trade School -data$edu3[data$demo_edu %in% c("High School (or equivalent)", "Trade School (non-military)")] <- "HS_TS" - -# C_Ug: College and University - Undergraduate -data$edu3[data$demo_edu %in% c("College Diploma/Certificate", "University - Undergraduate")] <- "C_Ug" - -# grad_prof: University - Graduate, University - PhD, and Professional Degree -data$edu3[data$demo_edu %in% c("University - Graduate (Masters)", "University - PhD", "Professional Degree (ex. JD/MD)")] <- "grad_prof" - -# Convert to ordered factor -data$edu3 <- factor(data$edu3, - levels = c("HS_TS", "C_Ug", "grad_prof"), - ordered = TRUE) - -# Check the recoded variable -print(table(data$edu3, useNA = "ifany")) - -# Verify the recoding -print(table(data$demo_edu, data$edu3, useNA = "ifany")) - -# Save the updated dataset with the new edu3 variable -write.csv(data, "ehi1.csv", row.names = FALSE) -print("\nDataset saved to ehi1.csv with new edu3 variable") \ No newline at end of file diff --git a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113941.r b/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113941.r deleted file mode 100644 index 4649b36..0000000 --- a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113941.r +++ /dev/null @@ -1,39 +0,0 @@ -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -data <- read.csv("ehi1.csv") - -# Check the levels of the demo_edu variable -print(levels(factor(data$demo_edu))) - -# Also show the unique values and their frequencies -print("\nUnique values and frequencies:") -print(table(data$demo_edu, useNA = "ifany")) - -# Recode demo_edu into 3 ordinal levels -data$edu3 <- NA - -# HS_TS: High School and Trade School -data$edu3[data$demo_edu %in% c("High School (or equivalent)", "Trade School (non-military)")] <- "HS_TS" - -# C_Ug: College and University - Undergraduate -data$edu3[data$demo_edu %in% c("College Diploma/Certificate", "University - Undergraduate")] <- "C_Ug" - -# grad_prof: University - Graduate, University - PhD, and Professional Degree -data$edu3[data$demo_edu %in% c("University - Graduate (Masters)", "University - PhD", "Professional Degree (ex. 
JD/MD)")] <- "grad_prof" - -# Convert to ordered factor -data$edu3 <- factor(data$edu3, - levels = c("HS_TS", "C_Ug", "grad_prof"), - ordered = TRUE) - -# Check the recoded variable -print(table(data$edu3, useNA = "ifany")) - -# Verify the recoding -print(table(data$demo_edu, data$edu3, useNA = "ifany")) - -# Save the updated dataset with the new edu3 variable -write.csv(data, "ehi1.csv", row.names = FALSE) -print("\nDataset saved to ehi1.csv with new edu3 variable") \ No newline at end of file diff --git a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113947.r b/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113947.r deleted file mode 100644 index 7d647e3..0000000 --- a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027113947.r +++ /dev/null @@ -1,38 +0,0 @@ -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -data <- read.csv("ehi1.csv") - -# Check the levels of the demo_edu variable -print(levels(factor(data$demo_edu))) - -# Also show the unique values and their frequencies -print("\nUnique values and frequencies:") -print(table(data$demo_edu, useNA = "ifany")) - -# Recode demo_edu into 3 ordinal levels -data$edu3 <- NA - -# HS_TS: High School and Trade School -data$edu3[data$demo_edu %in% c("High School (or equivalent)", "Trade School (non-military)")] <- "HS_TS" - -# C_Ug: College and University - Undergraduate -data$edu3[data$demo_edu %in% c("College Diploma/Certificate", "University - Undergraduate")] <- "C_Ug" - -# grad_prof: University - Graduate, University - PhD, and Professional Degree -data$edu3[data$demo_edu %in% c("University - Graduate (Masters)", "University - PhD", "Professional Degree (ex. JD/MD)")] <- "grad_prof" - -# Convert to ordered factor -data$edu3 <- factor(data$edu3, - levels = c("HS_TS", "C_Ug", "grad_prof"), - ordered = TRUE) - -# Check the recoded variable -print(table(data$edu3, useNA = "ifany")) - -# Verify the recoding -print(table(data$demo_edu, data$edu3, useNA = "ifany")) - -# Save the updated dataset with the new edu3 variable -write.csv(data, "ehi1.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027114131.r b/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027114131.r deleted file mode 100644 index 7d647e3..0000000 --- a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027114131.r +++ /dev/null @@ -1,38 +0,0 @@ -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -data <- read.csv("ehi1.csv") - -# Check the levels of the demo_edu variable -print(levels(factor(data$demo_edu))) - -# Also show the unique values and their frequencies -print("\nUnique values and frequencies:") -print(table(data$demo_edu, useNA = "ifany")) - -# Recode demo_edu into 3 ordinal levels -data$edu3 <- NA - -# HS_TS: High School and Trade School -data$edu3[data$demo_edu %in% c("High School (or equivalent)", "Trade School (non-military)")] <- "HS_TS" - -# C_Ug: College and University - Undergraduate -data$edu3[data$demo_edu %in% c("College Diploma/Certificate", "University - Undergraduate")] <- "C_Ug" - -# grad_prof: University - Graduate, University - PhD, and Professional Degree -data$edu3[data$demo_edu %in% c("University - Graduate (Masters)", "University - PhD", "Professional Degree (ex. 
JD/MD)")] <- "grad_prof" - -# Convert to ordered factor -data$edu3 <- factor(data$edu3, - levels = c("HS_TS", "C_Ug", "grad_prof"), - ordered = TRUE) - -# Check the recoded variable -print(table(data$edu3, useNA = "ifany")) - -# Verify the recoding -print(table(data$demo_edu, data$edu3, useNA = "ifany")) - -# Save the updated dataset with the new edu3 variable -write.csv(data, "ehi1.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115643.r b/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115643.r deleted file mode 100644 index 22663f5..0000000 --- a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115643.r +++ /dev/null @@ -1,49 +0,0 @@ -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -data <- read.csv("ehi1.csv") - -# Check the levels of the demo_edu variable -print(levels(factor(data$demo_edu))) - -# Also show the unique values and their frequencies -print("\nUnique values and frequencies:") -print(table(data$demo_edu, useNA = "ifany")) - -# Recode demo_edu into 3 ordinal levels -data$edu3 <- NA - -# HS_TS: High School and Trade School -data$edu3[data$demo_edu %in% c("High School (or equivalent)", "Trade School (non-military)")] <- "HS_TS" - -# C_Ug: College and University - Undergraduate -data$edu3[data$demo_edu %in% c("College Diploma/Certificate", "University - Undergraduate")] <- "C_Ug" - -# grad_prof: University - Graduate, University - PhD, and Professional Degree -data$edu3[data$demo_edu %in% c("University - Graduate (Masters)", "University - PhD", "Professional Degree (ex. JD/MD)")] <- "grad_prof" - -# Convert to ordered factor -data$edu3 <- factor(data$edu3, - levels = c("HS_TS", "C_Ug", "grad_prof"), - ordered = TRUE) - -# Check the recoded variable -print(table(data$edu3, useNA = "ifany")) - -# Verify the recoding -print(table(data$demo_edu, data$edu3, useNA = "ifany")) - -# Convert edu3 to numeric factor for correlations (1, 2, 3) -data$edu3_numeric <- as.numeric(data$edu3) - -# Check the numeric conversion -print("\nNumeric education levels:") -print(table(data$edu3_numeric, useNA = "ifany")) - -# Verify the conversion -print("\nCross-tabulation of factor vs numeric:") -print(table(data$edu3, data$edu3_numeric, useNA = "ifany")) - -# Save the updated dataset with the new edu3 variable -write.csv(data, "ehi1.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115647.r b/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115647.r deleted file mode 100644 index 7f5a917..0000000 --- a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115647.r +++ /dev/null @@ -1,49 +0,0 @@ -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -data <- read.csv("ehi1.csv") - -# Check the levels of the demo_edu variable -print(levels(factor(data$demo_edu))) - -# Also show the unique values and their frequencies -print("\nUnique values and frequencies:") -print(table(data$demo_edu, useNA = "ifany")) - -# Recode demo_edu into 3 ordinal levels -data$edu3 <- NA - -# HS_TS: High School and Trade School -data$edu3[data$demo_edu %in% c("High School (or equivalent)", "Trade School (non-military)")] <- "HS_TS" - -# C_Ug: College and University - Undergraduate -data$edu3[data$demo_edu %in% c("College Diploma/Certificate", "University - Undergraduate")] <- "C_Ug" - -# grad_prof: University - Graduate, University - 
diff --git a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115649.r b/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115649.r
deleted file mode 100644
index 7f5a917..0000000
JD/MD)")] <- "grad_prof" - -# Convert to ordered factor -data$edu3 <- factor(data$edu3, - levels = c("HS_TS", "C_Ug", "grad_prof"), - ordered = TRUE) - -# Check the recoded variable -print(table(data$edu3, useNA = "ifany")) - -# Verify the recoding -print(table(data$demo_edu, data$edu3, useNA = "ifany")) - -# Convert edu3 to numeric factor for correlations (1, 2, 3) -data$edu3_numeric <- as.numeric(data$edu3) - -# Check the numeric conversion -print("\nNumeric education levels:") -print(table(data$edu3_numeric, useNA = "ifany")) - -# Verify the conversion -print("\nCross-tabulation of factor vs numeric:") -print(table(data$edu3, data$edu3_numeric, useNA = "ifany")) - -# Note: To save the dataset, close any programs that have ehi1.csv open, then run: -# write.csv(data, "ehi1.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115718.r b/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115718.r deleted file mode 100644 index 7d647e3..0000000 --- a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115718.r +++ /dev/null @@ -1,38 +0,0 @@ -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -data <- read.csv("ehi1.csv") - -# Check the levels of the demo_edu variable -print(levels(factor(data$demo_edu))) - -# Also show the unique values and their frequencies -print("\nUnique values and frequencies:") -print(table(data$demo_edu, useNA = "ifany")) - -# Recode demo_edu into 3 ordinal levels -data$edu3 <- NA - -# HS_TS: High School and Trade School -data$edu3[data$demo_edu %in% c("High School (or equivalent)", "Trade School (non-military)")] <- "HS_TS" - -# C_Ug: College and University - Undergraduate -data$edu3[data$demo_edu %in% c("College Diploma/Certificate", "University - Undergraduate")] <- "C_Ug" - -# grad_prof: University - Graduate, University - PhD, and Professional Degree -data$edu3[data$demo_edu %in% c("University - Graduate (Masters)", "University - PhD", "Professional Degree (ex. 
JD/MD)")] <- "grad_prof" - -# Convert to ordered factor -data$edu3 <- factor(data$edu3, - levels = c("HS_TS", "C_Ug", "grad_prof"), - ordered = TRUE) - -# Check the recoded variable -print(table(data$edu3, useNA = "ifany")) - -# Verify the recoding -print(table(data$demo_edu, data$edu3, useNA = "ifany")) - -# Save the updated dataset with the new edu3 variable -write.csv(data, "ehi1.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115829.r b/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115829.r deleted file mode 100644 index a826447..0000000 --- a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115829.r +++ /dev/null @@ -1,50 +0,0 @@ -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -data <- read.csv("ehi1.csv") - -# Check the levels of the demo_edu variable -print(levels(factor(data$demo_edu))) - -# Also show the unique values and their frequencies -print("\nUnique values and frequencies:") -print(table(data$demo_edu, useNA = "ifany")) - -# Recode demo_edu into 3 ordinal levels -data$edu3 <- NA - -# HS_TS: High School and Trade School -data$edu3[data$demo_edu %in% c("High School (or equivalent)", "Trade School (non-military)")] <- "HS_TS" - -# C_Ug: College and University - Undergraduate -data$edu3[data$demo_edu %in% c("College Diploma/Certificate", "University - Undergraduate")] <- "C_Ug" - -# grad_prof: University - Graduate, University - PhD, and Professional Degree -data$edu3[data$demo_edu %in% c("University - Graduate (Masters)", "University - PhD", "Professional Degree (ex. JD/MD)")] <- "grad_prof" - -# Convert to ordered factor -data$edu3 <- factor(data$edu3, - levels = c("HS_TS", "C_Ug", "grad_prof"), - ordered = TRUE) - -# Check the recoded variable -print(table(data$edu3, useNA = "ifany")) - -# Verify the recoding -print(table(data$demo_edu, data$edu3, useNA = "ifany")) - -# Convert edu3 to numeric factor for correlations (1, 2, 3) -# First ensure edu3 is a factor, then convert to numeric -data$edu3 <- factor(data$edu3, levels = c("HS_TS", "C_Ug", "grad_prof"), ordered = TRUE) -data$edu_num <- as.numeric(data$edu3) - -# Check the numeric conversion -print(table(data$edu_num, useNA = "ifany")) - -# Verify the conversion -print("\nCross-tabulation of factor vs numeric:") -print(table(data$edu3, data$edu_num, useNA = "ifany")) - -# Save the updated dataset with the new edu3 variable -write.csv(data, "ehi1.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115832.r b/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115832.r deleted file mode 100644 index f2938a7..0000000 --- a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115832.r +++ /dev/null @@ -1,50 +0,0 @@ -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -data <- read.csv("ehi1.csv") - -# Check the levels of the demo_edu variable -print(levels(factor(data$demo_edu))) - -# Also show the unique values and their frequencies -print("\nUnique values and frequencies:") -print(table(data$demo_edu, useNA = "ifany")) - -# Recode demo_edu into 3 ordinal levels -data$edu3 <- NA - -# HS_TS: High School and Trade School -data$edu3[data$demo_edu %in% c("High School (or equivalent)", "Trade School (non-military)")] <- "HS_TS" - -# C_Ug: College and University - Undergraduate -data$edu3[data$demo_edu %in% c("College 
Diploma/Certificate", "University - Undergraduate")] <- "C_Ug" - -# grad_prof: University - Graduate, University - PhD, and Professional Degree -data$edu3[data$demo_edu %in% c("University - Graduate (Masters)", "University - PhD", "Professional Degree (ex. JD/MD)")] <- "grad_prof" - -# Convert to ordered factor -data$edu3 <- factor(data$edu3, - levels = c("HS_TS", "C_Ug", "grad_prof"), - ordered = TRUE) - -# Check the recoded variable -print(table(data$edu3, useNA = "ifany")) - -# Verify the recoding -print(table(data$demo_edu, data$edu3, useNA = "ifany")) - -# Convert edu3 to numeric factor for correlations (1, 2, 3) -# First ensure edu3 is a factor, then convert to numeric -data$edu3 <- factor(data$edu3, levels = c("HS_TS", "C_Ug", "grad_prof"), ordered = TRUE) -data$edu_num <- as.numeric(data$edu3) - -# Check the numeric conversion -print(table(data$edu_num, useNA = "ifany")) - -# Verify the conversion -print("\nCross-tabulation of factor vs numeric:") -print(table(data$edu3, data$edu_num, useNA = "ifany")) - -# Note: To save the dataset, close any programs that have ehi1.csv open, then run: -# write.csv(data, "ehi1.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115834.r b/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115834.r deleted file mode 100644 index f2938a7..0000000 --- a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115834.r +++ /dev/null @@ -1,50 +0,0 @@ -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -data <- read.csv("ehi1.csv") - -# Check the levels of the demo_edu variable -print(levels(factor(data$demo_edu))) - -# Also show the unique values and their frequencies -print("\nUnique values and frequencies:") -print(table(data$demo_edu, useNA = "ifany")) - -# Recode demo_edu into 3 ordinal levels -data$edu3 <- NA - -# HS_TS: High School and Trade School -data$edu3[data$demo_edu %in% c("High School (or equivalent)", "Trade School (non-military)")] <- "HS_TS" - -# C_Ug: College and University - Undergraduate -data$edu3[data$demo_edu %in% c("College Diploma/Certificate", "University - Undergraduate")] <- "C_Ug" - -# grad_prof: University - Graduate, University - PhD, and Professional Degree -data$edu3[data$demo_edu %in% c("University - Graduate (Masters)", "University - PhD", "Professional Degree (ex. 
JD/MD)")] <- "grad_prof" - -# Convert to ordered factor -data$edu3 <- factor(data$edu3, - levels = c("HS_TS", "C_Ug", "grad_prof"), - ordered = TRUE) - -# Check the recoded variable -print(table(data$edu3, useNA = "ifany")) - -# Verify the recoding -print(table(data$demo_edu, data$edu3, useNA = "ifany")) - -# Convert edu3 to numeric factor for correlations (1, 2, 3) -# First ensure edu3 is a factor, then convert to numeric -data$edu3 <- factor(data$edu3, levels = c("HS_TS", "C_Ug", "grad_prof"), ordered = TRUE) -data$edu_num <- as.numeric(data$edu3) - -# Check the numeric conversion -print(table(data$edu_num, useNA = "ifany")) - -# Verify the conversion -print("\nCross-tabulation of factor vs numeric:") -print(table(data$edu3, data$edu_num, useNA = "ifany")) - -# Note: To save the dataset, close any programs that have ehi1.csv open, then run: -# write.csv(data, "ehi1.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115845.r b/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115845.r deleted file mode 100644 index 7d647e3..0000000 --- a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027115845.r +++ /dev/null @@ -1,38 +0,0 @@ -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -data <- read.csv("ehi1.csv") - -# Check the levels of the demo_edu variable -print(levels(factor(data$demo_edu))) - -# Also show the unique values and their frequencies -print("\nUnique values and frequencies:") -print(table(data$demo_edu, useNA = "ifany")) - -# Recode demo_edu into 3 ordinal levels -data$edu3 <- NA - -# HS_TS: High School and Trade School -data$edu3[data$demo_edu %in% c("High School (or equivalent)", "Trade School (non-military)")] <- "HS_TS" - -# C_Ug: College and University - Undergraduate -data$edu3[data$demo_edu %in% c("College Diploma/Certificate", "University - Undergraduate")] <- "C_Ug" - -# grad_prof: University - Graduate, University - PhD, and Professional Degree -data$edu3[data$demo_edu %in% c("University - Graduate (Masters)", "University - PhD", "Professional Degree (ex. 
JD/MD)")] <- "grad_prof" - -# Convert to ordered factor -data$edu3 <- factor(data$edu3, - levels = c("HS_TS", "C_Ug", "grad_prof"), - ordered = TRUE) - -# Check the recoded variable -print(table(data$edu3, useNA = "ifany")) - -# Verify the recoding -print(table(data$demo_edu, data$edu3, useNA = "ifany")) - -# Save the updated dataset with the new edu3 variable -write.csv(data, "ehi1.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027134607.r b/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027134607.r deleted file mode 100644 index 7d647e3..0000000 --- a/.history/eohi1/datap 15 - education recoded 3 ordinal levels_20251027134607.r +++ /dev/null @@ -1,38 +0,0 @@ -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -data <- read.csv("ehi1.csv") - -# Check the levels of the demo_edu variable -print(levels(factor(data$demo_edu))) - -# Also show the unique values and their frequencies -print("\nUnique values and frequencies:") -print(table(data$demo_edu, useNA = "ifany")) - -# Recode demo_edu into 3 ordinal levels -data$edu3 <- NA - -# HS_TS: High School and Trade School -data$edu3[data$demo_edu %in% c("High School (or equivalent)", "Trade School (non-military)")] <- "HS_TS" - -# C_Ug: College and University - Undergraduate -data$edu3[data$demo_edu %in% c("College Diploma/Certificate", "University - Undergraduate")] <- "C_Ug" - -# grad_prof: University - Graduate, University - PhD, and Professional Degree -data$edu3[data$demo_edu %in% c("University - Graduate (Masters)", "University - PhD", "Professional Degree (ex. JD/MD)")] <- "grad_prof" - -# Convert to ordered factor -data$edu3 <- factor(data$edu3, - levels = c("HS_TS", "C_Ug", "grad_prof"), - ordered = TRUE) - -# Check the recoded variable -print(table(data$edu3, useNA = "ifany")) - -# Verify the recoding -print(table(data$demo_edu, data$edu3, useNA = "ifany")) - -# Save the updated dataset with the new edu3 variable -write.csv(data, "ehi1.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi1/descriptives - gen knowledge questions_20250918115552.r b/.history/eohi1/descriptives - gen knowledge questions_20250918115552.r deleted file mode 100644 index e69de29..0000000 diff --git a/.history/eohi1/descriptives - gen knowledge questions_20250918115553.r b/.history/eohi1/descriptives - gen knowledge questions_20250918115553.r deleted file mode 100644 index 1fcac3c..0000000 --- a/.history/eohi1/descriptives - gen knowledge questions_20250918115553.r +++ /dev/null @@ -1,32 +0,0 @@ -library(tidyverse) - -# Read data -df <- readr::read_csv("exp1.csv", show_col_types = FALSE) - -# Select variables ending exactly with _T or _F -df_tf <- df %>% select(matches("(_T|_F)$")) - -# Coerce to numeric where possible (without breaking non-numeric) -df_tf_num <- df_tf %>% - mutate(across(everything(), ~ suppressWarnings(as.numeric(.)))) - -# Compute descriptives per variable -descriptives <- df_tf_num %>% - pivot_longer(everything(), names_to = "variable", values_to = "value") %>% - summarise( - n = sum(!is.na(value)), - missing = sum(is.na(value)), - mean = mean(value, na.rm = TRUE), - sd = sd(value, na.rm = TRUE), - median = median(value, na.rm = TRUE), - min = suppressWarnings(min(value, na.rm = TRUE)), - max = suppressWarnings(max(value, na.rm = TRUE)), - .by = "variable" - ) %>% - arrange(variable) - -# View -print(descriptives, n = Inf) - -# Optionally save -# readr::write_csv(descriptives, 
"exp1_TF_descriptives.csv") \ No newline at end of file diff --git a/.history/eohi1/descriptives - gen knowledge questions_20250918115703.r b/.history/eohi1/descriptives - gen knowledge questions_20250918115703.r deleted file mode 100644 index 3bc0a8a..0000000 --- a/.history/eohi1/descriptives - gen knowledge questions_20250918115703.r +++ /dev/null @@ -1,34 +0,0 @@ -library(tidyverse) - -# Read data -data <- read.csv("exp1.csv") - -# Select variables ending exactly with _T or _F -df <- data %>% select(matches("(_T|_F)$")) - -str(df) - -# Coerce to numeric where possible (without breaking non-numeric) -df_tf_num <- df_tf %>% - mutate(across(everything(), ~ suppressWarnings(as.numeric(.)))) - -# Compute descriptives per variable -descriptives <- df_tf_num %>% - pivot_longer(everything(), names_to = "variable", values_to = "value") %>% - summarise( - n = sum(!is.na(value)), - missing = sum(is.na(value)), - mean = mean(value, na.rm = TRUE), - sd = sd(value, na.rm = TRUE), - median = median(value, na.rm = TRUE), - min = suppressWarnings(min(value, na.rm = TRUE)), - max = suppressWarnings(max(value, na.rm = TRUE)), - .by = "variable" - ) %>% - arrange(variable) - -# View -print(descriptives, n = Inf) - -# Optionally save -# readr::write_csv(descriptives, "exp1_TF_descriptives.csv") \ No newline at end of file diff --git a/.history/eohi1/descriptives - gen knowledge questions_20250918120055.r b/.history/eohi1/descriptives - gen knowledge questions_20250918120055.r deleted file mode 100644 index a280963..0000000 --- a/.history/eohi1/descriptives - gen knowledge questions_20250918120055.r +++ /dev/null @@ -1,39 +0,0 @@ -library(tidyverse) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read data -data <- read.csv("exp1.csv") - -# Select variables ending exactly with _T or _F -df <- data %>% select(matches("(_T|_F)$")) - -# Remove demo_f variable -df <- df %>% select(-demo_f) - -str(df) - -# Coerce to numeric where possible (without breaking non-numeric) -df_tf_num <- df %>% - mutate(across(everything(), ~ suppressWarnings(as.numeric(.)))) - -# Compute descriptives per variable -descriptives <- df_tf_num %>% - pivot_longer(everything(), names_to = "variable", values_to = "value") %>% - summarise( - n = sum(!is.na(value)), - missing = sum(is.na(value)), - mean = mean(value, na.rm = TRUE), - sd = sd(value, na.rm = TRUE), - median = median(value, na.rm = TRUE), - min = suppressWarnings(min(value, na.rm = TRUE)), - max = suppressWarnings(max(value, na.rm = TRUE)), - .by = "variable" - ) %>% - arrange(variable) - -# View -print(descriptives, n = Inf) - -# Optionally save -# readr::write_csv(descriptives, "exp1_TF_descriptives.csv") \ No newline at end of file diff --git a/.history/eohi1/descriptives - gen knowledge questions_20250918120100.r b/.history/eohi1/descriptives - gen knowledge questions_20250918120100.r deleted file mode 100644 index a280963..0000000 --- a/.history/eohi1/descriptives - gen knowledge questions_20250918120100.r +++ /dev/null @@ -1,39 +0,0 @@ -library(tidyverse) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read data -data <- read.csv("exp1.csv") - -# Select variables ending exactly with _T or _F -df <- data %>% select(matches("(_T|_F)$")) - -# Remove demo_f variable -df <- df %>% select(-demo_f) - -str(df) - -# Coerce to numeric where possible (without breaking non-numeric) -df_tf_num <- df %>% - mutate(across(everything(), ~ suppressWarnings(as.numeric(.)))) - -# Compute descriptives per variable -descriptives <- df_tf_num %>% - pivot_longer(everything(), 
names_to = "variable", values_to = "value") %>% - summarise( - n = sum(!is.na(value)), - missing = sum(is.na(value)), - mean = mean(value, na.rm = TRUE), - sd = sd(value, na.rm = TRUE), - median = median(value, na.rm = TRUE), - min = suppressWarnings(min(value, na.rm = TRUE)), - max = suppressWarnings(max(value, na.rm = TRUE)), - .by = "variable" - ) %>% - arrange(variable) - -# View -print(descriptives, n = Inf) - -# Optionally save -# readr::write_csv(descriptives, "exp1_TF_descriptives.csv") \ No newline at end of file diff --git a/.history/eohi1/descriptives - gen knowledge questions_20250918120102.r b/.history/eohi1/descriptives - gen knowledge questions_20250918120102.r deleted file mode 100644 index a280963..0000000 --- a/.history/eohi1/descriptives - gen knowledge questions_20250918120102.r +++ /dev/null @@ -1,39 +0,0 @@ -library(tidyverse) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read data -data <- read.csv("exp1.csv") - -# Select variables ending exactly with _T or _F -df <- data %>% select(matches("(_T|_F)$")) - -# Remove demo_f variable -df <- df %>% select(-demo_f) - -str(df) - -# Coerce to numeric where possible (without breaking non-numeric) -df_tf_num <- df %>% - mutate(across(everything(), ~ suppressWarnings(as.numeric(.)))) - -# Compute descriptives per variable -descriptives <- df_tf_num %>% - pivot_longer(everything(), names_to = "variable", values_to = "value") %>% - summarise( - n = sum(!is.na(value)), - missing = sum(is.na(value)), - mean = mean(value, na.rm = TRUE), - sd = sd(value, na.rm = TRUE), - median = median(value, na.rm = TRUE), - min = suppressWarnings(min(value, na.rm = TRUE)), - max = suppressWarnings(max(value, na.rm = TRUE)), - .by = "variable" - ) %>% - arrange(variable) - -# View -print(descriptives, n = Inf) - -# Optionally save -# readr::write_csv(descriptives, "exp1_TF_descriptives.csv") \ No newline at end of file diff --git a/.history/eohi1/descriptives - gen knowledge questions_20250918120515.r b/.history/eohi1/descriptives - gen knowledge questions_20250918120515.r deleted file mode 100644 index 4886004..0000000 --- a/.history/eohi1/descriptives - gen knowledge questions_20250918120515.r +++ /dev/null @@ -1,35 +0,0 @@ -library(tidyverse) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read data -data <- read.csv("exp1.csv") - -# Select variables ending exactly with _T or _F -df <- data %>% select(matches("(_T|_F)$")) - -# Remove demo_f variable (if present) -df <- df %>% select(-any_of("demo_f")) - -str(df) - -# Coerce to numeric where possible (without breaking non-numeric) -df_num <- df %>% - mutate(across(everything(), ~ suppressWarnings(as.numeric(.)))) - -# Compute count and proportion correct (value == 1) per variable -descriptives <- df_num %>% - pivot_longer(everything(), names_to = "variable", values_to = "value") %>% - summarise( - n_total = sum(!is.na(value)), - n_correct = sum(value == 1, na.rm = TRUE), - prop_correct = ifelse(n_total > 0, n_correct / n_total, NA_real_), - .by = "variable" - ) %>% - arrange(variable) - -# View -print(descriptives, n = Inf) - -# Optionally save -# readr::write_csv(descriptives, "exp1_TF_descriptives.csv") \ No newline at end of file diff --git a/.history/eohi1/descriptives - gen knowledge questions_20250918120600.r b/.history/eohi1/descriptives - gen knowledge questions_20250918120600.r deleted file mode 100644 index 35bfda1..0000000 --- a/.history/eohi1/descriptives - gen knowledge questions_20250918120600.r +++ /dev/null @@ -1,64 +0,0 @@ -library(tidyverse) -if 
(!requireNamespace("boot", quietly = TRUE)) install.packages("boot") -library(boot) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read data -data <- read.csv("exp1.csv") - -# Select variables ending exactly with _T or _F -df <- data %>% select(matches("(_T|_F)$")) - -# Remove demo_f variable (if present) -df <- df %>% select(-any_of("demo_f")) - -str(df) - -# Coerce to numeric where possible (without breaking non-numeric) -df_num <- df %>% - mutate(across(everything(), ~ suppressWarnings(as.numeric(.)))) - -# Helper: bootstrap BCA CI for proportion correct -compute_prop_ci <- function(x, R = 1000, conf = 0.95) { - x <- suppressWarnings(as.numeric(x)) - x <- x[!is.na(x)] - n_total <- length(x) - if (n_total == 0) { - return(list(n_total = 0L, n_correct = NA_integer_, prop = NA_real_, ci_lower = NA_real_, ci_upper = NA_real_)) - } - x01 <- ifelse(x == 1, 1, 0) - n_correct <- sum(x01) - prop <- n_correct / n_total - stat <- function(data, indices) mean(data[indices]) - b <- boot::boot(data = x01, statistic = stat, R = R) - ci <- tryCatch(boot::boot.ci(b, conf = conf, type = "bca"), error = function(e) NULL) - if (is.null(ci) || is.null(ci$bca)) { - lower <- NA_real_ - upper <- NA_real_ - } else { - lower <- ci$bca[4] - upper <- ci$bca[5] - } - list(n_total = n_total, n_correct = n_correct, prop = prop, ci_lower = lower, ci_upper = upper) -} - -# Compute count, proportion correct, and 95% BCA CI per variable -descriptives <- purrr::imap_dfr(df_num, function(col, name) { - res <- compute_prop_ci(col, R = 1000, conf = 0.95) - tibble( - variable = name, - n_total = res$n_total, - n_correct = res$n_correct, - prop_correct = round(res$prop, 5), - ci_lower = round(res$ci_lower, 5), - ci_upper = round(res$ci_upper, 5) - ) -}) %>% - arrange(variable) - -# View -print(descriptives, n = Inf) - -# Optionally save -# readr::write_csv(descriptives, "exp1_TF_descriptives.csv") \ No newline at end of file diff --git a/.history/eohi1/descriptives - gen knowledge questions_20250918120656.r b/.history/eohi1/descriptives - gen knowledge questions_20250918120656.r deleted file mode 100644 index 35bfda1..0000000 --- a/.history/eohi1/descriptives - gen knowledge questions_20250918120656.r +++ /dev/null @@ -1,64 +0,0 @@ -library(tidyverse) -if (!requireNamespace("boot", quietly = TRUE)) install.packages("boot") -library(boot) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read data -data <- read.csv("exp1.csv") - -# Select variables ending exactly with _T or _F -df <- data %>% select(matches("(_T|_F)$")) - -# Remove demo_f variable (if present) -df <- df %>% select(-any_of("demo_f")) - -str(df) - -# Coerce to numeric where possible (without breaking non-numeric) -df_num <- df %>% - mutate(across(everything(), ~ suppressWarnings(as.numeric(.)))) - -# Helper: bootstrap BCA CI for proportion correct -compute_prop_ci <- function(x, R = 1000, conf = 0.95) { - x <- suppressWarnings(as.numeric(x)) - x <- x[!is.na(x)] - n_total <- length(x) - if (n_total == 0) { - return(list(n_total = 0L, n_correct = NA_integer_, prop = NA_real_, ci_lower = NA_real_, ci_upper = NA_real_)) - } - x01 <- ifelse(x == 1, 1, 0) - n_correct <- sum(x01) - prop <- n_correct / n_total - stat <- function(data, indices) mean(data[indices]) - b <- boot::boot(data = x01, statistic = stat, R = R) - ci <- tryCatch(boot::boot.ci(b, conf = conf, type = "bca"), error = function(e) NULL) - if (is.null(ci) || is.null(ci$bca)) { - lower <- NA_real_ - upper <- NA_real_ - } else { - lower <- ci$bca[4] - upper <- ci$bca[5] - } - list(n_total = 
diff --git a/.history/eohi1/descriptives - gen knowledge questions_20250918120727.r b/.history/eohi1/descriptives - gen knowledge questions_20250918120727.r
deleted file mode 100644
index 4c3cd5d..0000000
--- a/.history/eohi1/descriptives - gen knowledge questions_20250918120727.r
+++ /dev/null
@@ -1,63 +0,0 @@
-library(tidyverse)
-library(boot)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read data
-data <- read.csv("exp1.csv")
-
-# Select variables ending exactly with _T or _F
-df <- data %>% select(matches("(_T|_F)$"))
-
-# Remove demo_f variable (if present)
-df <- df %>% select(-any_of("demo_f"))
-
-str(df)
-
-# Coerce to numeric where possible (without breaking non-numeric)
-df_num <- df %>%
-  mutate(across(everything(), ~ suppressWarnings(as.numeric(.))))
-
-# Helper: bootstrap BCA CI for proportion correct
-compute_prop_ci <- function(x, R = 1000, conf = 0.95) {
-  x <- suppressWarnings(as.numeric(x))
-  x <- x[!is.na(x)]
-  n_total <- length(x)
-  if (n_total == 0) {
-    return(list(n_total = 0L, n_correct = NA_integer_, prop = NA_real_, ci_lower = NA_real_, ci_upper = NA_real_))
-  }
-  x01 <- ifelse(x == 1, 1, 0)
-  n_correct <- sum(x01)
-  prop <- n_correct / n_total
-  stat <- function(data, indices) mean(data[indices])
-  b <- boot::boot(data = x01, statistic = stat, R = R)
-  ci <- tryCatch(boot::boot.ci(b, conf = conf, type = "bca"), error = function(e) NULL)
-  if (is.null(ci) || is.null(ci$bca)) {
-    lower <- NA_real_
-    upper <- NA_real_
-  } else {
-    lower <- ci$bca[4]
-    upper <- ci$bca[5]
-  }
-  list(n_total = n_total, n_correct = n_correct, prop = prop, ci_lower = lower, ci_upper = upper)
-}
-
-# Compute count, proportion correct, and 95% BCA CI per variable
-descriptives <- purrr::imap_dfr(df_num, function(col, name) {
-  res <- compute_prop_ci(col, R = 1000, conf = 0.95)
-  tibble(
-    variable = name,
-    n_total = res$n_total,
-    n_correct = res$n_correct,
-    prop_correct = round(res$prop, 5),
-    ci_lower = round(res$ci_lower, 5),
-    ci_upper = round(res$ci_upper, 5)
-  )
-}) %>%
-  arrange(variable)
-
-# View
-print(descriptives, n = Inf)
-
-# Optionally save
-# readr::write_csv(descriptives, "exp1_TF_descriptives.csv")
\ No newline at end of file
diff --git a/.history/eohi1/descriptives - gen knowledge questions_20250918122358.r b/.history/eohi1/descriptives - gen knowledge questions_20250918122358.r
deleted file mode 100644
index 6995cb1..0000000
--- a/.history/eohi1/descriptives - gen knowledge questions_20250918122358.r
+++ /dev/null
@@ -1,55 +0,0 @@
-library(tidyverse)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read data
-data <- read.csv("exp1.csv")
-
-# Select variables ending exactly with _T or _F
-df <- data %>% select(matches("(_T|_F)$"))
-
-# Remove demo_f variable (if present)
-df <- df %>% select(-any_of("demo_f"))
-
-str(df)
-
-# Coerce to numeric where possible (without breaking non-numeric)
-df_num <- df %>%
-  mutate(across(everything(), ~ suppressWarnings(as.numeric(.))))
-
-# Compute count and proportion correct per variable
-descriptives <- purrr::imap_dfr(df_num, function(col, name) {
-  x <- suppressWarnings(as.numeric(col))
-  x <- x[!is.na(x)]
-  n_total <- length(x)
-  n_correct <- if (n_total == 0) NA_integer_ else sum(x == 1)
-  prop <- if (n_total == 0) NA_real_ else n_correct / n_total
-  tibble(
-    variable = name,
-    n_total = n_total,
-    n_correct = n_correct,
-    prop_correct = round(prop, 5)
-  )
-}) %>%
-  arrange(variable)
-
-# Bin proportions into .10-.19, .20-.29, ..., .90-.99 and count variables per bin
-bin_levels <- sapply(1:9, function(k) sprintf("%.2f-%.2f", k / 10, k / 10 + 0.09))
-bin_factor <- cut(
-  descriptives$prop_correct,
-  breaks = seq(0.10, 1.00, by = 0.10),
-  right = FALSE,
-  include.lowest = FALSE,
-  labels = bin_levels
-)
-bin_counts <- tibble(bin = factor(bin_factor, levels = bin_levels)) %>%
-  group_by(bin) %>%
-  summarise(num_variables = n(), .groups = "drop")
-
-# View
-print(descriptives, n = Inf)
-cat("\nBin counts (.10-.19, .20-.29, ..., .90-.99):\n")
-print(bin_counts, n = Inf)
-
-# Optionally save
-# readr::write_csv(descriptives, "exp1_TF_descriptives.csv")
\ No newline at end of file
diff --git a/.history/eohi1/descriptives - gen knowledge questions_20250918122401.r b/.history/eohi1/descriptives - gen knowledge questions_20250918122401.r
deleted file mode 100644
index 6995cb1..0000000
diff --git a/.history/eohi1/descriptives - gen knowledge questions_20250918122413.r b/.history/eohi1/descriptives - gen knowledge questions_20250918122413.r
deleted file mode 100644
index 6995cb1..0000000
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read data -data <- read.csv("exp1.csv") - -# Select variables ending exactly with _T or _F -df <- data %>% select(matches("(_T|_F)$")) - -# Remove demo_f variable (if present) -df <- df %>% select(-any_of("demo_f")) - -str(df) - -# Coerce to numeric where possible (without breaking non-numeric) -df_num <- df %>% - mutate(across(everything(), ~ suppressWarnings(as.numeric(.)))) - -# Compute count and proportion correct per variable -descriptives <- purrr::imap_dfr(df_num, function(col, name) { - x <- suppressWarnings(as.numeric(col)) - x <- x[!is.na(x)] - n_total <- length(x) - n_correct <- if (n_total == 0) NA_integer_ else sum(x == 1) - prop <- if (n_total == 0) NA_real_ else n_correct / n_total - tibble( - variable = name, - n_total = n_total, - n_correct = n_correct, - prop_correct = round(prop, 5) - ) -}) %>% - arrange(variable) - -# Bin proportions into .10-.19, .20-.29, ..., .90-.99 and count variables per bin -bin_levels <- sapply(1:9, function(k) sprintf("%.2f-%.2f", k / 10, k / 10 + 0.09)) -bin_factor <- cut( - descriptives$prop_correct, - breaks = seq(0.10, 1.00, by = 0.10), - right = FALSE, - include.lowest = FALSE, - labels = bin_levels -) -bin_counts <- tibble(bin = factor(bin_factor, levels = bin_levels)) %>% - group_by(bin) %>% - summarise(num_variables = n(), .groups = "drop") - -# View -print(descriptives, n = Inf) -cat("\nBin counts (.10-.19, .20-.29, ..., .90-.99):\n") -print(bin_counts, n = Inf) - -# Optionally save -# readr::write_csv(descriptives, "exp1_TF_descriptives.csv") \ No newline at end of file diff --git a/.history/eohi1/descriptives - gen knowledge questions_20250918122634.r b/.history/eohi1/descriptives - gen knowledge questions_20250918122634.r deleted file mode 100644 index 5368928..0000000 --- a/.history/eohi1/descriptives - gen knowledge questions_20250918122634.r +++ /dev/null @@ -1,70 +0,0 @@ -library(tidyverse) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read data -data <- read.csv("exp1.csv") - -# Select variables ending exactly with _T or _F -df <- data %>% select(matches("(_T|_F)$")) - -# Remove demo_f variable (if present) -df <- df %>% select(-any_of("demo_f")) - -str(df) - -# Coerce to numeric where possible (without breaking non-numeric) -df_num <- df %>% - mutate(across(everything(), ~ suppressWarnings(as.numeric(.)))) - -# Compute count and proportion correct per variable -descriptives <- purrr::imap_dfr(df_num, function(col, name) { - x <- suppressWarnings(as.numeric(col)) - x <- x[!is.na(x)] - n_total <- length(x) - n_correct <- if (n_total == 0) NA_integer_ else sum(x == 1) - prop <- if (n_total == 0) NA_real_ else n_correct / n_total - tibble( - variable = name, - n_total = n_total, - n_correct = n_correct, - prop_correct = round(prop, 5) - ) -}) %>% - arrange(variable) - -# Bin proportions into .10-.19, .20-.29, ..., .90-.99 and count variables per bin -bin_levels <- sapply(1:9, function(k) sprintf("%.2f-%.2f", k / 10, k / 10 + 0.09)) -bin_factor <- cut( - descriptives$prop_correct, - breaks = seq(0.10, 1.00, by = 0.10), - right = FALSE, - include.lowest = FALSE, - labels = bin_levels -) -bin_counts <- tibble(bin = factor(bin_factor, levels = bin_levels)) %>% - group_by(bin) %>% - summarise(num_variables = n(), .groups = "drop") - -# Additional bins: 0.15-0.24, 0.25-0.34, ..., 0.85-0.94 -bin15_levels <- sapply(seq(0.15, 0.85, by = 0.10), function(lo) sprintf("%.2f-%.2f", lo, lo + 0.09)) -bin15_factor <- cut( - descriptives$prop_correct, - breaks = seq(0.15, 0.95, by = 
diff --git a/.history/eohi1/descriptives - gen knowledge questions_20250918122637.r b/.history/eohi1/descriptives - gen knowledge questions_20250918122637.r
deleted file mode 100644
index 5368928..0000000
diff --git a/.history/eohi1/descriptives - gen knowledge questions_20250918122638.r b/.history/eohi1/descriptives - gen knowledge questions_20250918122638.r
deleted file mode 100644
index 5368928..0000000
%>% select(matches("(_T|_F)$")) - -# Remove demo_f variable (if present) -df <- df %>% select(-any_of("demo_f")) - -str(df) - -# Coerce to numeric where possible (without breaking non-numeric) -df_num <- df %>% - mutate(across(everything(), ~ suppressWarnings(as.numeric(.)))) - -# Compute count and proportion correct per variable -descriptives <- purrr::imap_dfr(df_num, function(col, name) { - x <- suppressWarnings(as.numeric(col)) - x <- x[!is.na(x)] - n_total <- length(x) - n_correct <- if (n_total == 0) NA_integer_ else sum(x == 1) - prop <- if (n_total == 0) NA_real_ else n_correct / n_total - tibble( - variable = name, - n_total = n_total, - n_correct = n_correct, - prop_correct = round(prop, 5) - ) -}) %>% - arrange(variable) - -# Bin proportions into .10-.19, .20-.29, ..., .90-.99 and count variables per bin -bin_levels <- sapply(1:9, function(k) sprintf("%.2f-%.2f", k / 10, k / 10 + 0.09)) -bin_factor <- cut( - descriptives$prop_correct, - breaks = seq(0.10, 1.00, by = 0.10), - right = FALSE, - include.lowest = FALSE, - labels = bin_levels -) -bin_counts <- tibble(bin = factor(bin_factor, levels = bin_levels)) %>% - group_by(bin) %>% - summarise(num_variables = n(), .groups = "drop") - -# Additional bins: 0.15-0.24, 0.25-0.34, ..., 0.85-0.94 -bin15_levels <- sapply(seq(0.15, 0.85, by = 0.10), function(lo) sprintf("%.2f-%.2f", lo, lo + 0.09)) -bin15_factor <- cut( - descriptives$prop_correct, - breaks = seq(0.15, 0.95, by = 0.10), - right = FALSE, - include.lowest = FALSE, - labels = bin15_levels -) -bin15_counts <- tibble(bin = factor(bin15_factor, levels = bin15_levels)) %>% - group_by(bin) %>% - summarise(num_variables = n(), .groups = "drop") - -# View -print(descriptives, n = Inf) -cat("\nBin counts (.10-.19, .20-.29, ..., .90-.99):\n") -print(bin_counts, n = Inf) -cat("\nBin counts (0.15-0.24, 0.25-0.34, ..., 0.85-0.94):\n") -print(bin15_counts, n = Inf) - -# Optionally save -# readr::write_csv(descriptives, "exp1_TF_descriptives.csv") \ No newline at end of file diff --git a/.history/eohi1/descriptives - gen knowledge questions_20250918123114.r b/.history/eohi1/descriptives - gen knowledge questions_20250918123114.r deleted file mode 100644 index 25de7e7..0000000 --- a/.history/eohi1/descriptives - gen knowledge questions_20250918123114.r +++ /dev/null @@ -1,88 +0,0 @@ -library(tidyverse) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read data -data <- read.csv("exp1.csv") - -# Select variables ending exactly with _T or _F -df <- data %>% select(matches("(_T|_F)$")) - -# Remove demo_f variable (if present) -df <- df %>% select(-any_of("demo_f")) - -str(df) - -# Coerce to numeric where possible (without breaking non-numeric) -df_num <- df %>% - mutate(across(everything(), ~ suppressWarnings(as.numeric(.)))) - -# Compute count and proportion correct per variable -descriptives <- purrr::imap_dfr(df_num, function(col, name) { - x <- suppressWarnings(as.numeric(col)) - x <- x[!is.na(x)] - n_total <- length(x) - n_correct <- if (n_total == 0) NA_integer_ else sum(x == 1) - prop <- if (n_total == 0) NA_real_ else n_correct / n_total - - # Extract difficulty number from variable name and map to expected range - difficulty_num <- as.numeric(gsub(".*_([0-9]+)_[TF]$", "\\1", name)) - expected_ranges <- list( - "15" = c(0.15, 0.25), - "35" = c(0.35, 0.45), - "55" = c(0.55, 0.65), - "75" = c(0.75, 0.85) - ) - - if (as.character(difficulty_num) %in% names(expected_ranges)) { - expected_range <- expected_ranges[[as.character(difficulty_num)]] - match_difficulty <- if (prop >= 
expected_range[1] && prop <= expected_range[2]) "YES" else "NO" - } else { - match_difficulty <- "UNKNOWN" - } - - tibble( - variable = name, - n_total = n_total, - n_correct = n_correct, - prop_correct = round(prop, 5), - match_difficulty = match_difficulty - ) -}) %>% - arrange(variable) - -# Bin proportions into .10-.19, .20-.29, ..., .90-.99 and count variables per bin -bin_levels <- sapply(1:9, function(k) sprintf("%.2f-%.2f", k / 10, k / 10 + 0.09)) -bin_factor <- cut( - descriptives$prop_correct, - breaks = seq(0.10, 1.00, by = 0.10), - right = FALSE, - include.lowest = FALSE, - labels = bin_levels -) -bin_counts <- tibble(bin = factor(bin_factor, levels = bin_levels)) %>% - group_by(bin) %>% - summarise(num_variables = n(), .groups = "drop") - -# Additional bins: 0.15-0.24, 0.25-0.34, ..., 0.85-0.94 -bin15_levels <- sapply(seq(0.15, 0.85, by = 0.10), function(lo) sprintf("%.2f-%.2f", lo, lo + 0.09)) -bin15_factor <- cut( - descriptives$prop_correct, - breaks = seq(0.15, 0.95, by = 0.10), - right = FALSE, - include.lowest = FALSE, - labels = bin15_levels -) -bin15_counts <- tibble(bin = factor(bin15_factor, levels = bin15_levels)) %>% - group_by(bin) %>% - summarise(num_variables = n(), .groups = "drop") - -# View -print(descriptives, n = Inf) -cat("\nBin counts (.10-.19, .20-.29, ..., .90-.99):\n") -print(bin_counts, n = Inf) -cat("\nBin counts (0.15-0.24, 0.25-0.34, ..., 0.85-0.94):\n") -print(bin15_counts, n = Inf) - -# Optionally save -# readr::write_csv(descriptives, "exp1_TF_descriptives.csv") \ No newline at end of file diff --git a/.history/eohi1/descriptives - gen knowledge questions_20250918123117.r b/.history/eohi1/descriptives - gen knowledge questions_20250918123117.r deleted file mode 100644 index 25de7e7..0000000 --- a/.history/eohi1/descriptives - gen knowledge questions_20250918123117.r +++ /dev/null @@ -1,88 +0,0 @@ -library(tidyverse) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read data -data <- read.csv("exp1.csv") - -# Select variables ending exactly with _T or _F -df <- data %>% select(matches("(_T|_F)$")) - -# Remove demo_f variable (if present) -df <- df %>% select(-any_of("demo_f")) - -str(df) - -# Coerce to numeric where possible (without breaking non-numeric) -df_num <- df %>% - mutate(across(everything(), ~ suppressWarnings(as.numeric(.)))) - -# Compute count and proportion correct per variable -descriptives <- purrr::imap_dfr(df_num, function(col, name) { - x <- suppressWarnings(as.numeric(col)) - x <- x[!is.na(x)] - n_total <- length(x) - n_correct <- if (n_total == 0) NA_integer_ else sum(x == 1) - prop <- if (n_total == 0) NA_real_ else n_correct / n_total - - # Extract difficulty number from variable name and map to expected range - difficulty_num <- as.numeric(gsub(".*_([0-9]+)_[TF]$", "\\1", name)) - expected_ranges <- list( - "15" = c(0.15, 0.25), - "35" = c(0.35, 0.45), - "55" = c(0.55, 0.65), - "75" = c(0.75, 0.85) - ) - - if (as.character(difficulty_num) %in% names(expected_ranges)) { - expected_range <- expected_ranges[[as.character(difficulty_num)]] - match_difficulty <- if (prop >= expected_range[1] && prop <= expected_range[2]) "YES" else "NO" - } else { - match_difficulty <- "UNKNOWN" - } - - tibble( - variable = name, - n_total = n_total, - n_correct = n_correct, - prop_correct = round(prop, 5), - match_difficulty = match_difficulty - ) -}) %>% - arrange(variable) - -# Bin proportions into .10-.19, .20-.29, ..., .90-.99 and count variables per bin -bin_levels <- sapply(1:9, function(k) sprintf("%.2f-%.2f", k / 10, k / 10 + 
0.09)) -bin_factor <- cut( - descriptives$prop_correct, - breaks = seq(0.10, 1.00, by = 0.10), - right = FALSE, - include.lowest = FALSE, - labels = bin_levels -) -bin_counts <- tibble(bin = factor(bin_factor, levels = bin_levels)) %>% - group_by(bin) %>% - summarise(num_variables = n(), .groups = "drop") - -# Additional bins: 0.15-0.24, 0.25-0.34, ..., 0.85-0.94 -bin15_levels <- sapply(seq(0.15, 0.85, by = 0.10), function(lo) sprintf("%.2f-%.2f", lo, lo + 0.09)) -bin15_factor <- cut( - descriptives$prop_correct, - breaks = seq(0.15, 0.95, by = 0.10), - right = FALSE, - include.lowest = FALSE, - labels = bin15_levels -) -bin15_counts <- tibble(bin = factor(bin15_factor, levels = bin15_levels)) %>% - group_by(bin) %>% - summarise(num_variables = n(), .groups = "drop") - -# View -print(descriptives, n = Inf) -cat("\nBin counts (.10-.19, .20-.29, ..., .90-.99):\n") -print(bin_counts, n = Inf) -cat("\nBin counts (0.15-0.24, 0.25-0.34, ..., 0.85-0.94):\n") -print(bin15_counts, n = Inf) - -# Optionally save -# readr::write_csv(descriptives, "exp1_TF_descriptives.csv") \ No newline at end of file diff --git a/.history/eohi1/descriptives - gen knowledge questions_20250918123133.r b/.history/eohi1/descriptives - gen knowledge questions_20250918123133.r deleted file mode 100644 index 25de7e7..0000000 --- a/.history/eohi1/descriptives - gen knowledge questions_20250918123133.r +++ /dev/null @@ -1,88 +0,0 @@ -library(tidyverse) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read data -data <- read.csv("exp1.csv") - -# Select variables ending exactly with _T or _F -df <- data %>% select(matches("(_T|_F)$")) - -# Remove demo_f variable (if present) -df <- df %>% select(-any_of("demo_f")) - -str(df) - -# Coerce to numeric where possible (without breaking non-numeric) -df_num <- df %>% - mutate(across(everything(), ~ suppressWarnings(as.numeric(.)))) - -# Compute count and proportion correct per variable -descriptives <- purrr::imap_dfr(df_num, function(col, name) { - x <- suppressWarnings(as.numeric(col)) - x <- x[!is.na(x)] - n_total <- length(x) - n_correct <- if (n_total == 0) NA_integer_ else sum(x == 1) - prop <- if (n_total == 0) NA_real_ else n_correct / n_total - - # Extract difficulty number from variable name and map to expected range - difficulty_num <- as.numeric(gsub(".*_([0-9]+)_[TF]$", "\\1", name)) - expected_ranges <- list( - "15" = c(0.15, 0.25), - "35" = c(0.35, 0.45), - "55" = c(0.55, 0.65), - "75" = c(0.75, 0.85) - ) - - if (as.character(difficulty_num) %in% names(expected_ranges)) { - expected_range <- expected_ranges[[as.character(difficulty_num)]] - match_difficulty <- if (prop >= expected_range[1] && prop <= expected_range[2]) "YES" else "NO" - } else { - match_difficulty <- "UNKNOWN" - } - - tibble( - variable = name, - n_total = n_total, - n_correct = n_correct, - prop_correct = round(prop, 5), - match_difficulty = match_difficulty - ) -}) %>% - arrange(variable) - -# Bin proportions into .10-.19, .20-.29, ..., .90-.99 and count variables per bin -bin_levels <- sapply(1:9, function(k) sprintf("%.2f-%.2f", k / 10, k / 10 + 0.09)) -bin_factor <- cut( - descriptives$prop_correct, - breaks = seq(0.10, 1.00, by = 0.10), - right = FALSE, - include.lowest = FALSE, - labels = bin_levels -) -bin_counts <- tibble(bin = factor(bin_factor, levels = bin_levels)) %>% - group_by(bin) %>% - summarise(num_variables = n(), .groups = "drop") - -# Additional bins: 0.15-0.24, 0.25-0.34, ..., 0.85-0.94 -bin15_levels <- sapply(seq(0.15, 0.85, by = 0.10), function(lo) sprintf("%.2f-%.2f", 
lo, lo + 0.09)) -bin15_factor <- cut( - descriptives$prop_correct, - breaks = seq(0.15, 0.95, by = 0.10), - right = FALSE, - include.lowest = FALSE, - labels = bin15_levels -) -bin15_counts <- tibble(bin = factor(bin15_factor, levels = bin15_levels)) %>% - group_by(bin) %>% - summarise(num_variables = n(), .groups = "drop") - -# View -print(descriptives, n = Inf) -cat("\nBin counts (.10-.19, .20-.29, ..., .90-.99):\n") -print(bin_counts, n = Inf) -cat("\nBin counts (0.15-0.24, 0.25-0.34, ..., 0.85-0.94):\n") -print(bin15_counts, n = Inf) - -# Optionally save -# readr::write_csv(descriptives, "exp1_TF_descriptives.csv") \ No newline at end of file diff --git a/.history/eohi1/descriptives - gen knowledge questions_20250918124915.r b/.history/eohi1/descriptives - gen knowledge questions_20250918124915.r deleted file mode 100644 index 25de7e7..0000000 --- a/.history/eohi1/descriptives - gen knowledge questions_20250918124915.r +++ /dev/null @@ -1,88 +0,0 @@ -library(tidyverse) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read data -data <- read.csv("exp1.csv") - -# Select variables ending exactly with _T or _F -df <- data %>% select(matches("(_T|_F)$")) - -# Remove demo_f variable (if present) -df <- df %>% select(-any_of("demo_f")) - -str(df) - -# Coerce to numeric where possible (without breaking non-numeric) -df_num <- df %>% - mutate(across(everything(), ~ suppressWarnings(as.numeric(.)))) - -# Compute count and proportion correct per variable -descriptives <- purrr::imap_dfr(df_num, function(col, name) { - x <- suppressWarnings(as.numeric(col)) - x <- x[!is.na(x)] - n_total <- length(x) - n_correct <- if (n_total == 0) NA_integer_ else sum(x == 1) - prop <- if (n_total == 0) NA_real_ else n_correct / n_total - - # Extract difficulty number from variable name and map to expected range - difficulty_num <- as.numeric(gsub(".*_([0-9]+)_[TF]$", "\\1", name)) - expected_ranges <- list( - "15" = c(0.15, 0.25), - "35" = c(0.35, 0.45), - "55" = c(0.55, 0.65), - "75" = c(0.75, 0.85) - ) - - if (as.character(difficulty_num) %in% names(expected_ranges)) { - expected_range <- expected_ranges[[as.character(difficulty_num)]] - match_difficulty <- if (prop >= expected_range[1] && prop <= expected_range[2]) "YES" else "NO" - } else { - match_difficulty <- "UNKNOWN" - } - - tibble( - variable = name, - n_total = n_total, - n_correct = n_correct, - prop_correct = round(prop, 5), - match_difficulty = match_difficulty - ) -}) %>% - arrange(variable) - -# Bin proportions into .10-.19, .20-.29, ..., .90-.99 and count variables per bin -bin_levels <- sapply(1:9, function(k) sprintf("%.2f-%.2f", k / 10, k / 10 + 0.09)) -bin_factor <- cut( - descriptives$prop_correct, - breaks = seq(0.10, 1.00, by = 0.10), - right = FALSE, - include.lowest = FALSE, - labels = bin_levels -) -bin_counts <- tibble(bin = factor(bin_factor, levels = bin_levels)) %>% - group_by(bin) %>% - summarise(num_variables = n(), .groups = "drop") - -# Additional bins: 0.15-0.24, 0.25-0.34, ..., 0.85-0.94 -bin15_levels <- sapply(seq(0.15, 0.85, by = 0.10), function(lo) sprintf("%.2f-%.2f", lo, lo + 0.09)) -bin15_factor <- cut( - descriptives$prop_correct, - breaks = seq(0.15, 0.95, by = 0.10), - right = FALSE, - include.lowest = FALSE, - labels = bin15_levels -) -bin15_counts <- tibble(bin = factor(bin15_factor, levels = bin15_levels)) %>% - group_by(bin) %>% - summarise(num_variables = n(), .groups = "drop") - -# View -print(descriptives, n = Inf) -cat("\nBin counts (.10-.19, .20-.29, ..., .90-.99):\n") -print(bin_counts, n = Inf) 
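# --- Editor's aside (illustrative, not part of the original snapshot) ---
# cut() with right = FALSE builds half-open bins [lo, hi); with
# include.lowest = FALSE a value below the first break, or equal to the
# last break, comes back NA and silently drops out of the bin counts:
#   cut(c(0.05, 0.19, 0.20, 1.00), breaks = seq(0.10, 1.00, by = 0.10),
#       right = FALSE)
#   #> NA  [0.1,0.2)  [0.2,0.3)  NA
# Because the breaks come from seq() on doubles, exact boundary values can
# land unexpectedly; rounding prop_correct before binning is the safer move.
# --- end aside ---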
-cat("\nBin counts (0.15-0.24, 0.25-0.34, ..., 0.85-0.94):\n") -print(bin15_counts, n = Inf) - -# Optionally save -# readr::write_csv(descriptives, "exp1_TF_descriptives.csv") \ No newline at end of file diff --git a/.history/eohi1/descriptives - gen knowledge questions_20250918145603.r b/.history/eohi1/descriptives - gen knowledge questions_20250918145603.r deleted file mode 100644 index 16f3295..0000000 --- a/.history/eohi1/descriptives - gen knowledge questions_20250918145603.r +++ /dev/null @@ -1,101 +0,0 @@ -library(tidyverse) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read data -data <- read.csv("exp1.csv") - -# Select variables ending exactly with _T or _F -df <- data %>% select(matches("(_T|_F)$")) - -# Remove demo_f variable (if present) -df <- df %>% select(-any_of("demo_f")) - -str(df) - -# Coerce to numeric where possible (without breaking non-numeric) -df_num <- df %>% - mutate(across(everything(), ~ suppressWarnings(as.numeric(.)))) - -# Compute count and proportion correct per variable -descriptives <- purrr::imap_dfr(df_num, function(col, name) { - x <- suppressWarnings(as.numeric(col)) - x <- x[!is.na(x)] - n_total <- length(x) - n_correct <- if (n_total == 0) NA_integer_ else sum(x == 1) - prop <- if (n_total == 0) NA_real_ else n_correct / n_total - - # Extract difficulty number from variable name and map to expected range - difficulty_num <- as.numeric(gsub(".*_([0-9]+)_[TF]$", "\\1", name)) - expected_ranges <- list( - "15" = c(0.15, 0.25), - "35" = c(0.35, 0.45), - "55" = c(0.55, 0.65), - "75" = c(0.75, 0.85) - ) - - if (as.character(difficulty_num) %in% names(expected_ranges)) { - expected_range <- expected_ranges[[as.character(difficulty_num)]] - match_difficulty <- if (prop >= expected_range[1] && prop <= expected_range[2]) "YES" else "NO" - } else { - match_difficulty <- "UNKNOWN" - } - - tibble( - variable = name, - n_total = n_total, - n_correct = n_correct, - prop_correct = round(prop, 5), - match_difficulty = match_difficulty - ) -}) %>% - arrange(variable) - -# Bin proportions into .10-.19, .20-.29, ..., .90-.99 and count variables per bin -bin_levels <- sapply(1:9, function(k) sprintf("%.2f-%.2f", k / 10, k / 10 + 0.09)) -bin_factor <- cut( - descriptives$prop_correct, - breaks = seq(0.10, 1.00, by = 0.10), - right = FALSE, - include.lowest = FALSE, - labels = bin_levels -) -bin_counts <- tibble(bin = factor(bin_factor, levels = bin_levels)) %>% - group_by(bin) %>% - summarise(num_variables = n(), .groups = "drop") - -# Additional bins: 0.15-0.24, 0.25-0.34, ..., 0.85-0.94 -bin15_levels <- sapply(seq(0.15, 0.85, by = 0.10), function(lo) sprintf("%.2f-%.2f", lo, lo + 0.09)) -bin15_factor <- cut( - descriptives$prop_correct, - breaks = seq(0.15, 0.95, by = 0.10), - right = FALSE, - include.lowest = FALSE, - labels = bin15_levels -) -bin15_counts <- tibble(bin = factor(bin15_factor, levels = bin15_levels)) %>% - group_by(bin) %>% - summarise(num_variables = n(), .groups = "drop") - -# View -print(descriptives, n = Inf) -cat("\nBin counts (.10-.19, .20-.29, ..., .90-.99):\n") -print(bin_counts, n = Inf) -cat("\nBin counts (0.15-0.24, 0.25-0.34, ..., 0.85-0.94):\n") -print(bin15_counts, n = Inf) - -# Histogram of proportion correct -histogram <- ggplot(descriptives, aes(x = prop_correct)) + - geom_histogram(binwidth = 0.05, fill = "lightblue", color = "black", alpha = 0.7) + - labs( - title = "Distribution of Proportion Correct", - x = "Proportion Correct", - y = "Number of Variables" - ) + - theme_minimal() + - scale_x_continuous(breaks = seq(0, 1, 
0.1)) - -print(histogram) - -# Optionally save -# readr::write_csv(descriptives, "exp1_TF_descriptives.csv") \ No newline at end of file diff --git a/.history/eohi1/descriptives - gen knowledge questions_20250918145606.r b/.history/eohi1/descriptives - gen knowledge questions_20250918145606.r deleted file mode 100644 index 16f3295..0000000 --- a/.history/eohi1/descriptives - gen knowledge questions_20250918145606.r +++ /dev/null @@ -1,101 +0,0 @@ -library(tidyverse) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read data -data <- read.csv("exp1.csv") - -# Select variables ending exactly with _T or _F -df <- data %>% select(matches("(_T|_F)$")) - -# Remove demo_f variable (if present) -df <- df %>% select(-any_of("demo_f")) - -str(df) - -# Coerce to numeric where possible (without breaking non-numeric) -df_num <- df %>% - mutate(across(everything(), ~ suppressWarnings(as.numeric(.)))) - -# Compute count and proportion correct per variable -descriptives <- purrr::imap_dfr(df_num, function(col, name) { - x <- suppressWarnings(as.numeric(col)) - x <- x[!is.na(x)] - n_total <- length(x) - n_correct <- if (n_total == 0) NA_integer_ else sum(x == 1) - prop <- if (n_total == 0) NA_real_ else n_correct / n_total - - # Extract difficulty number from variable name and map to expected range - difficulty_num <- as.numeric(gsub(".*_([0-9]+)_[TF]$", "\\1", name)) - expected_ranges <- list( - "15" = c(0.15, 0.25), - "35" = c(0.35, 0.45), - "55" = c(0.55, 0.65), - "75" = c(0.75, 0.85) - ) - - if (as.character(difficulty_num) %in% names(expected_ranges)) { - expected_range <- expected_ranges[[as.character(difficulty_num)]] - match_difficulty <- if (prop >= expected_range[1] && prop <= expected_range[2]) "YES" else "NO" - } else { - match_difficulty <- "UNKNOWN" - } - - tibble( - variable = name, - n_total = n_total, - n_correct = n_correct, - prop_correct = round(prop, 5), - match_difficulty = match_difficulty - ) -}) %>% - arrange(variable) - -# Bin proportions into .10-.19, .20-.29, ..., .90-.99 and count variables per bin -bin_levels <- sapply(1:9, function(k) sprintf("%.2f-%.2f", k / 10, k / 10 + 0.09)) -bin_factor <- cut( - descriptives$prop_correct, - breaks = seq(0.10, 1.00, by = 0.10), - right = FALSE, - include.lowest = FALSE, - labels = bin_levels -) -bin_counts <- tibble(bin = factor(bin_factor, levels = bin_levels)) %>% - group_by(bin) %>% - summarise(num_variables = n(), .groups = "drop") - -# Additional bins: 0.15-0.24, 0.25-0.34, ..., 0.85-0.94 -bin15_levels <- sapply(seq(0.15, 0.85, by = 0.10), function(lo) sprintf("%.2f-%.2f", lo, lo + 0.09)) -bin15_factor <- cut( - descriptives$prop_correct, - breaks = seq(0.15, 0.95, by = 0.10), - right = FALSE, - include.lowest = FALSE, - labels = bin15_levels -) -bin15_counts <- tibble(bin = factor(bin15_factor, levels = bin15_levels)) %>% - group_by(bin) %>% - summarise(num_variables = n(), .groups = "drop") - -# View -print(descriptives, n = Inf) -cat("\nBin counts (.10-.19, .20-.29, ..., .90-.99):\n") -print(bin_counts, n = Inf) -cat("\nBin counts (0.15-0.24, 0.25-0.34, ..., 0.85-0.94):\n") -print(bin15_counts, n = Inf) - -# Histogram of proportion correct -histogram <- ggplot(descriptives, aes(x = prop_correct)) + - geom_histogram(binwidth = 0.05, fill = "lightblue", color = "black", alpha = 0.7) + - labs( - title = "Distribution of Proportion Correct", - x = "Proportion Correct", - y = "Number of Variables" - ) + - theme_minimal() + - scale_x_continuous(breaks = seq(0, 1, 0.1)) - -print(histogram) - -# Optionally save -# 
readr::write_csv(descriptives, "exp1_TF_descriptives.csv") \ No newline at end of file diff --git a/.history/eohi1/descriptives - gen knowledge questions_20250918145728.r b/.history/eohi1/descriptives - gen knowledge questions_20250918145728.r deleted file mode 100644 index 16f3295..0000000 --- a/.history/eohi1/descriptives - gen knowledge questions_20250918145728.r +++ /dev/null @@ -1,101 +0,0 @@ -library(tidyverse) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read data -data <- read.csv("exp1.csv") - -# Select variables ending exactly with _T or _F -df <- data %>% select(matches("(_T|_F)$")) - -# Remove demo_f variable (if present) -df <- df %>% select(-any_of("demo_f")) - -str(df) - -# Coerce to numeric where possible (without breaking non-numeric) -df_num <- df %>% - mutate(across(everything(), ~ suppressWarnings(as.numeric(.)))) - -# Compute count and proportion correct per variable -descriptives <- purrr::imap_dfr(df_num, function(col, name) { - x <- suppressWarnings(as.numeric(col)) - x <- x[!is.na(x)] - n_total <- length(x) - n_correct <- if (n_total == 0) NA_integer_ else sum(x == 1) - prop <- if (n_total == 0) NA_real_ else n_correct / n_total - - # Extract difficulty number from variable name and map to expected range - difficulty_num <- as.numeric(gsub(".*_([0-9]+)_[TF]$", "\\1", name)) - expected_ranges <- list( - "15" = c(0.15, 0.25), - "35" = c(0.35, 0.45), - "55" = c(0.55, 0.65), - "75" = c(0.75, 0.85) - ) - - if (as.character(difficulty_num) %in% names(expected_ranges)) { - expected_range <- expected_ranges[[as.character(difficulty_num)]] - match_difficulty <- if (prop >= expected_range[1] && prop <= expected_range[2]) "YES" else "NO" - } else { - match_difficulty <- "UNKNOWN" - } - - tibble( - variable = name, - n_total = n_total, - n_correct = n_correct, - prop_correct = round(prop, 5), - match_difficulty = match_difficulty - ) -}) %>% - arrange(variable) - -# Bin proportions into .10-.19, .20-.29, ..., .90-.99 and count variables per bin -bin_levels <- sapply(1:9, function(k) sprintf("%.2f-%.2f", k / 10, k / 10 + 0.09)) -bin_factor <- cut( - descriptives$prop_correct, - breaks = seq(0.10, 1.00, by = 0.10), - right = FALSE, - include.lowest = FALSE, - labels = bin_levels -) -bin_counts <- tibble(bin = factor(bin_factor, levels = bin_levels)) %>% - group_by(bin) %>% - summarise(num_variables = n(), .groups = "drop") - -# Additional bins: 0.15-0.24, 0.25-0.34, ..., 0.85-0.94 -bin15_levels <- sapply(seq(0.15, 0.85, by = 0.10), function(lo) sprintf("%.2f-%.2f", lo, lo + 0.09)) -bin15_factor <- cut( - descriptives$prop_correct, - breaks = seq(0.15, 0.95, by = 0.10), - right = FALSE, - include.lowest = FALSE, - labels = bin15_levels -) -bin15_counts <- tibble(bin = factor(bin15_factor, levels = bin15_levels)) %>% - group_by(bin) %>% - summarise(num_variables = n(), .groups = "drop") - -# View -print(descriptives, n = Inf) -cat("\nBin counts (.10-.19, .20-.29, ..., .90-.99):\n") -print(bin_counts, n = Inf) -cat("\nBin counts (0.15-0.24, 0.25-0.34, ..., 0.85-0.94):\n") -print(bin15_counts, n = Inf) - -# Histogram of proportion correct -histogram <- ggplot(descriptives, aes(x = prop_correct)) + - geom_histogram(binwidth = 0.05, fill = "lightblue", color = "black", alpha = 0.7) + - labs( - title = "Distribution of Proportion Correct", - x = "Proportion Correct", - y = "Number of Variables" - ) + - theme_minimal() + - scale_x_continuous(breaks = seq(0, 1, 0.1)) - -print(histogram) - -# Optionally save -# readr::write_csv(descriptives, "exp1_TF_descriptives.csv") \ No 
newline at end of file diff --git a/.history/eohi1/descriptives - gen knowledge questions_20250918155602.r b/.history/eohi1/descriptives - gen knowledge questions_20250918155602.r deleted file mode 100644 index 6d7be10..0000000 --- a/.history/eohi1/descriptives - gen knowledge questions_20250918155602.r +++ /dev/null @@ -1,106 +0,0 @@ -library(tidyverse) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read data -data <- read.csv("exp1.csv") - -# Select variables ending exactly with _T or _F -df <- data %>% select(matches("(_T|_F)$")) - -# Remove demo_f variable (if present) -df <- df %>% select(-any_of("demo_f")) - -str(df) - -# Coerce to numeric where possible (without breaking non-numeric) -df_num <- df %>% - mutate(across(everything(), ~ suppressWarnings(as.numeric(.)))) - -# Compute count and proportion correct per variable -descriptives <- purrr::imap_dfr(df_num, function(col, name) { - x <- suppressWarnings(as.numeric(col)) - x <- x[!is.na(x)] - n_total <- length(x) - n_correct <- if (n_total == 0) NA_integer_ else sum(x == 1) - prop <- if (n_total == 0) NA_real_ else n_correct / n_total - - # Extract difficulty number from variable name and map to expected range - difficulty_num <- as.numeric(gsub(".*_([0-9]+)_[TF]$", "\\1", name)) - expected_ranges <- list( - "15" = c(0.15, 0.25), - "35" = c(0.35, 0.45), - "55" = c(0.55, 0.65), - "75" = c(0.75, 0.85) - ) - - if (as.character(difficulty_num) %in% names(expected_ranges)) { - expected_range <- expected_ranges[[as.character(difficulty_num)]] - match_difficulty <- if (prop >= expected_range[1] && prop <= expected_range[2]) "YES" else "NO" - } else { - match_difficulty <- "UNKNOWN" - } - - tibble( - variable = name, - n_total = n_total, - n_correct = n_correct, - prop_correct = round(prop, 5), - match_difficulty = match_difficulty - ) -}) %>% - arrange(variable) - -# Bin proportions into .10-.19, .20-.29, ..., .90-.99 and count variables per bin -bin_levels <- sapply(1:9, function(k) sprintf("%.2f-%.2f", k / 10, k / 10 + 0.09)) -bin_factor <- cut( - descriptives$prop_correct, - breaks = seq(0.10, 1.00, by = 0.10), - right = FALSE, - include.lowest = FALSE, - labels = bin_levels -) -bin_counts <- tibble(bin = factor(bin_factor, levels = bin_levels)) %>% - group_by(bin) %>% - summarise(num_variables = n(), .groups = "drop") - -# Additional bins: 0.15-0.24, 0.25-0.34, ..., 0.85-0.94 -bin15_levels <- sapply(seq(0.15, 0.85, by = 0.10), function(lo) sprintf("%.2f-%.2f", lo, lo + 0.09)) -bin15_factor <- cut( - descriptives$prop_correct, - breaks = seq(0.15, 0.95, by = 0.10), - right = FALSE, - include.lowest = FALSE, - labels = bin15_levels -) -bin15_counts <- tibble(bin = factor(bin15_factor, levels = bin15_levels)) %>% - group_by(bin) %>% - summarise(num_variables = n(), .groups = "drop") - -# View -print(descriptives, n = Inf) -cat("\nBin counts (.10-.19, .20-.29, ..., .90-.99):\n") -print(bin_counts, n = Inf) -cat("\nBin counts (0.15-0.24, 0.25-0.34, ..., 0.85-0.94):\n") -print(bin15_counts, n = Inf) - -# Histogram of proportion correct with custom bins -histogram <- ggplot(descriptives, aes(x = prop_correct)) + - geom_histogram( - breaks = seq(0.15, 0.95, by = 0.10), - fill = "lightblue", - color = "black", - alpha = 0.7 - ) + - labs( - title = "Distribution of Proportion Correct", - x = "Proportion Correct", - y = "Number of Variables" - ) + - theme_minimal() + - scale_x_continuous(breaks = seq(0.15, 0.95, by = 0.10)) - -print(histogram) - -# Optionally save -# readr::write_csv(descriptives, "exp1_TF_descriptives.csv") \ No 
newline at end of file diff --git a/.history/eohi1/descriptives - gen knowledge questions_20250918155604.r b/.history/eohi1/descriptives - gen knowledge questions_20250918155604.r deleted file mode 100644 index 6d7be10..0000000 --- a/.history/eohi1/descriptives - gen knowledge questions_20250918155604.r +++ /dev/null @@ -1,106 +0,0 @@ -library(tidyverse) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read data -data <- read.csv("exp1.csv") - -# Select variables ending exactly with _T or _F -df <- data %>% select(matches("(_T|_F)$")) - -# Remove demo_f variable (if present) -df <- df %>% select(-any_of("demo_f")) - -str(df) - -# Coerce to numeric where possible (without breaking non-numeric) -df_num <- df %>% - mutate(across(everything(), ~ suppressWarnings(as.numeric(.)))) - -# Compute count and proportion correct per variable -descriptives <- purrr::imap_dfr(df_num, function(col, name) { - x <- suppressWarnings(as.numeric(col)) - x <- x[!is.na(x)] - n_total <- length(x) - n_correct <- if (n_total == 0) NA_integer_ else sum(x == 1) - prop <- if (n_total == 0) NA_real_ else n_correct / n_total - - # Extract difficulty number from variable name and map to expected range - difficulty_num <- as.numeric(gsub(".*_([0-9]+)_[TF]$", "\\1", name)) - expected_ranges <- list( - "15" = c(0.15, 0.25), - "35" = c(0.35, 0.45), - "55" = c(0.55, 0.65), - "75" = c(0.75, 0.85) - ) - - if (as.character(difficulty_num) %in% names(expected_ranges)) { - expected_range <- expected_ranges[[as.character(difficulty_num)]] - match_difficulty <- if (prop >= expected_range[1] && prop <= expected_range[2]) "YES" else "NO" - } else { - match_difficulty <- "UNKNOWN" - } - - tibble( - variable = name, - n_total = n_total, - n_correct = n_correct, - prop_correct = round(prop, 5), - match_difficulty = match_difficulty - ) -}) %>% - arrange(variable) - -# Bin proportions into .10-.19, .20-.29, ..., .90-.99 and count variables per bin -bin_levels <- sapply(1:9, function(k) sprintf("%.2f-%.2f", k / 10, k / 10 + 0.09)) -bin_factor <- cut( - descriptives$prop_correct, - breaks = seq(0.10, 1.00, by = 0.10), - right = FALSE, - include.lowest = FALSE, - labels = bin_levels -) -bin_counts <- tibble(bin = factor(bin_factor, levels = bin_levels)) %>% - group_by(bin) %>% - summarise(num_variables = n(), .groups = "drop") - -# Additional bins: 0.15-0.24, 0.25-0.34, ..., 0.85-0.94 -bin15_levels <- sapply(seq(0.15, 0.85, by = 0.10), function(lo) sprintf("%.2f-%.2f", lo, lo + 0.09)) -bin15_factor <- cut( - descriptives$prop_correct, - breaks = seq(0.15, 0.95, by = 0.10), - right = FALSE, - include.lowest = FALSE, - labels = bin15_levels -) -bin15_counts <- tibble(bin = factor(bin15_factor, levels = bin15_levels)) %>% - group_by(bin) %>% - summarise(num_variables = n(), .groups = "drop") - -# View -print(descriptives, n = Inf) -cat("\nBin counts (.10-.19, .20-.29, ..., .90-.99):\n") -print(bin_counts, n = Inf) -cat("\nBin counts (0.15-0.24, 0.25-0.34, ..., 0.85-0.94):\n") -print(bin15_counts, n = Inf) - -# Histogram of proportion correct with custom bins -histogram <- ggplot(descriptives, aes(x = prop_correct)) + - geom_histogram( - breaks = seq(0.15, 0.95, by = 0.10), - fill = "lightblue", - color = "black", - alpha = 0.7 - ) + - labs( - title = "Distribution of Proportion Correct", - x = "Proportion Correct", - y = "Number of Variables" - ) + - theme_minimal() + - scale_x_continuous(breaks = seq(0.15, 0.95, by = 0.10)) - -print(histogram) - -# Optionally save -# readr::write_csv(descriptives, "exp1_TF_descriptives.csv") \ No 
newline at end of file diff --git a/.history/eohi1/descriptives - gen knowledge questions_20250918155605.r b/.history/eohi1/descriptives - gen knowledge questions_20250918155605.r deleted file mode 100644 index 6d7be10..0000000 --- a/.history/eohi1/descriptives - gen knowledge questions_20250918155605.r +++ /dev/null @@ -1,106 +0,0 @@ -library(tidyverse) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read data -data <- read.csv("exp1.csv") - -# Select variables ending exactly with _T or _F -df <- data %>% select(matches("(_T|_F)$")) - -# Remove demo_f variable (if present) -df <- df %>% select(-any_of("demo_f")) - -str(df) - -# Coerce to numeric where possible (without breaking non-numeric) -df_num <- df %>% - mutate(across(everything(), ~ suppressWarnings(as.numeric(.)))) - -# Compute count and proportion correct per variable -descriptives <- purrr::imap_dfr(df_num, function(col, name) { - x <- suppressWarnings(as.numeric(col)) - x <- x[!is.na(x)] - n_total <- length(x) - n_correct <- if (n_total == 0) NA_integer_ else sum(x == 1) - prop <- if (n_total == 0) NA_real_ else n_correct / n_total - - # Extract difficulty number from variable name and map to expected range - difficulty_num <- as.numeric(gsub(".*_([0-9]+)_[TF]$", "\\1", name)) - expected_ranges <- list( - "15" = c(0.15, 0.25), - "35" = c(0.35, 0.45), - "55" = c(0.55, 0.65), - "75" = c(0.75, 0.85) - ) - - if (as.character(difficulty_num) %in% names(expected_ranges)) { - expected_range <- expected_ranges[[as.character(difficulty_num)]] - match_difficulty <- if (prop >= expected_range[1] && prop <= expected_range[2]) "YES" else "NO" - } else { - match_difficulty <- "UNKNOWN" - } - - tibble( - variable = name, - n_total = n_total, - n_correct = n_correct, - prop_correct = round(prop, 5), - match_difficulty = match_difficulty - ) -}) %>% - arrange(variable) - -# Bin proportions into .10-.19, .20-.29, ..., .90-.99 and count variables per bin -bin_levels <- sapply(1:9, function(k) sprintf("%.2f-%.2f", k / 10, k / 10 + 0.09)) -bin_factor <- cut( - descriptives$prop_correct, - breaks = seq(0.10, 1.00, by = 0.10), - right = FALSE, - include.lowest = FALSE, - labels = bin_levels -) -bin_counts <- tibble(bin = factor(bin_factor, levels = bin_levels)) %>% - group_by(bin) %>% - summarise(num_variables = n(), .groups = "drop") - -# Additional bins: 0.15-0.24, 0.25-0.34, ..., 0.85-0.94 -bin15_levels <- sapply(seq(0.15, 0.85, by = 0.10), function(lo) sprintf("%.2f-%.2f", lo, lo + 0.09)) -bin15_factor <- cut( - descriptives$prop_correct, - breaks = seq(0.15, 0.95, by = 0.10), - right = FALSE, - include.lowest = FALSE, - labels = bin15_levels -) -bin15_counts <- tibble(bin = factor(bin15_factor, levels = bin15_levels)) %>% - group_by(bin) %>% - summarise(num_variables = n(), .groups = "drop") - -# View -print(descriptives, n = Inf) -cat("\nBin counts (.10-.19, .20-.29, ..., .90-.99):\n") -print(bin_counts, n = Inf) -cat("\nBin counts (0.15-0.24, 0.25-0.34, ..., 0.85-0.94):\n") -print(bin15_counts, n = Inf) - -# Histogram of proportion correct with custom bins -histogram <- ggplot(descriptives, aes(x = prop_correct)) + - geom_histogram( - breaks = seq(0.15, 0.95, by = 0.10), - fill = "lightblue", - color = "black", - alpha = 0.7 - ) + - labs( - title = "Distribution of Proportion Correct", - x = "Proportion Correct", - y = "Number of Variables" - ) + - theme_minimal() + - scale_x_continuous(breaks = seq(0.15, 0.95, by = 0.10)) - -print(histogram) - -# Optionally save -# readr::write_csv(descriptives, "exp1_TF_descriptives.csv") \ No 
newline at end of file diff --git a/.history/eohi1/descriptives - gen knowledge questions_20250918155636.r b/.history/eohi1/descriptives - gen knowledge questions_20250918155636.r deleted file mode 100644 index 2932a3b..0000000 --- a/.history/eohi1/descriptives - gen knowledge questions_20250918155636.r +++ /dev/null @@ -1,107 +0,0 @@ -library(tidyverse) -library(ggplot2) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read data -data <- read.csv("exp1.csv") - -# Select variables ending exactly with _T or _F -df <- data %>% select(matches("(_T|_F)$")) - -# Remove demo_f variable (if present) -df <- df %>% select(-any_of("demo_f")) - -str(df) - -# Coerce to numeric where possible (without breaking non-numeric) -df_num <- df %>% - mutate(across(everything(), ~ suppressWarnings(as.numeric(.)))) - -# Compute count and proportion correct per variable -descriptives <- purrr::imap_dfr(df_num, function(col, name) { - x <- suppressWarnings(as.numeric(col)) - x <- x[!is.na(x)] - n_total <- length(x) - n_correct <- if (n_total == 0) NA_integer_ else sum(x == 1) - prop <- if (n_total == 0) NA_real_ else n_correct / n_total - - # Extract difficulty number from variable name and map to expected range - difficulty_num <- as.numeric(gsub(".*_([0-9]+)_[TF]$", "\\1", name)) - expected_ranges <- list( - "15" = c(0.15, 0.25), - "35" = c(0.35, 0.45), - "55" = c(0.55, 0.65), - "75" = c(0.75, 0.85) - ) - - if (as.character(difficulty_num) %in% names(expected_ranges)) { - expected_range <- expected_ranges[[as.character(difficulty_num)]] - match_difficulty <- if (prop >= expected_range[1] && prop <= expected_range[2]) "YES" else "NO" - } else { - match_difficulty <- "UNKNOWN" - } - - tibble( - variable = name, - n_total = n_total, - n_correct = n_correct, - prop_correct = round(prop, 5), - match_difficulty = match_difficulty - ) -}) %>% - arrange(variable) - -# Bin proportions into .10-.19, .20-.29, ..., .90-.99 and count variables per bin -bin_levels <- sapply(1:9, function(k) sprintf("%.2f-%.2f", k / 10, k / 10 + 0.09)) -bin_factor <- cut( - descriptives$prop_correct, - breaks = seq(0.10, 1.00, by = 0.10), - right = FALSE, - include.lowest = FALSE, - labels = bin_levels -) -bin_counts <- tibble(bin = factor(bin_factor, levels = bin_levels)) %>% - group_by(bin) %>% - summarise(num_variables = n(), .groups = "drop") - -# Additional bins: 0.15-0.24, 0.25-0.34, ..., 0.85-0.94 -bin15_levels <- sapply(seq(0.15, 0.85, by = 0.10), function(lo) sprintf("%.2f-%.2f", lo, lo + 0.09)) -bin15_factor <- cut( - descriptives$prop_correct, - breaks = seq(0.15, 0.95, by = 0.10), - right = FALSE, - include.lowest = FALSE, - labels = bin15_levels -) -bin15_counts <- tibble(bin = factor(bin15_factor, levels = bin15_levels)) %>% - group_by(bin) %>% - summarise(num_variables = n(), .groups = "drop") - -# View -print(descriptives, n = Inf) -cat("\nBin counts (.10-.19, .20-.29, ..., .90-.99):\n") -print(bin_counts, n = Inf) -cat("\nBin counts (0.15-0.24, 0.25-0.34, ..., 0.85-0.94):\n") -print(bin15_counts, n = Inf) - -# Histogram of proportion correct with custom bins -histogram <- ggplot(descriptives, aes(x = prop_correct)) + - geom_histogram( - breaks = seq(0.15, 0.95, by = 0.10), - fill = "lightblue", - color = "black", - alpha = 0.7 - ) + - labs( - title = "Distribution of Proportion Correct", - x = "Proportion Correct", - y = "Number of Variables" - ) + - theme_minimal() + - scale_x_continuous(breaks = seq(0.15, 0.95, by = 0.10)) - -print(histogram) - -# Optionally save -# readr::write_csv(descriptives, 
"exp1_TF_descriptives.csv") \ No newline at end of file diff --git a/.history/eohi1/e1 - reliability ehi_20251029093310.r b/.history/eohi1/e1 - reliability ehi_20251029093310.r deleted file mode 100644 index e69de29..0000000 diff --git a/.history/eohi1/e1 - reliability ehi_20251029093311.r b/.history/eohi1/e1 - reliability ehi_20251029093311.r deleted file mode 100644 index d059220..0000000 --- a/.history/eohi1/e1 - reliability ehi_20251029093311.r +++ /dev/null @@ -1,56 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(knitr) - -# Select the 4 variables for reliability analysis -reliability_data <- df[complete.cases(df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")]), - c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) - -# Split-half reliability -split_half <- splitHalf(reliability_data) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -html_output <- paste0( - "EHI Reliability Analysis", - "

<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<p>Maximum split half reliability: ", round(split_half$maxrb, 5), "</p>", - - "<h2>Item-Level Statistics</h2>", - kable(alpha_result$item.stats, format = "html"), - - "<h2>Alpha if Item Dropped</h2>
", - kable(alpha_dropped$alpha.drop, format = "html"), - - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi1/e1 - reliability ehi_20251029093545.r b/.history/eohi1/e1 - reliability ehi_20251029093545.r deleted file mode 100644 index e9a46df..0000000 --- a/.history/eohi1/e1 - reliability ehi_20251029093545.r +++ /dev/null @@ -1,56 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -options(scipen = 999) - -df <- read.csv("ehi1.csv") -library(psych) -library(knitr) - -# Select the 4 variables for reliability analysis -reliability_data <- df[complete.cases(df[, c("eohiDGEN_mean", "ehi_global_mean")]), - c("eohiDGEN_mean", "ehi_global_mean")] - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) - -# Split-half reliability -split_half <- splitHalf(reliability_data) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -html_output <- paste0( - "EHI Reliability Analysis", - "

<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<p>Maximum split half reliability: ", round(split_half$maxrb, 5), "</p>", - - "<h2>Item-Level Statistics</h2>", - kable(alpha_result$item.stats, format = "html"), - - "<h2>Alpha if Item Dropped</h2>
", - kable(alpha_dropped$alpha.drop, format = "html"), - - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi1/e1 - reliability ehi_20251029094220.r b/.history/eohi1/e1 - reliability ehi_20251029094220.r deleted file mode 100644 index cae6d8c..0000000 --- a/.history/eohi1/e1 - reliability ehi_20251029094220.r +++ /dev/null @@ -1,80 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -options(scipen = 999) - -df <- read.csv("ehi1.csv") -library(psych) -library(knitr) - -# fixed-decimal formatter (five decimals) -fmt5 <- function(x) formatC(x, format = "f", digits = 5) - -# Select the 4 variables for reliability analysis -reliability_data <- df[complete.cases(df[, c("eohiDGEN_mean", "ehi_global_mean")]), - c("eohiDGEN_mean", "ehi_global_mean")] - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) - -# Split-half reliability -split_half <- splitHalf(reliability_data) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") - -# Two-item reliability summary (if applicable) -two_item_section <- "" -if (ncol(reliability_data) == 2) { - n_complete <- sum(complete.cases(reliability_data)) - r_pearson <- cor(reliability_data[, 1], reliability_data[, 2], use = "complete.obs", method = "pearson") - # Fisher z CI for r - fisher_z <- atanh(r_pearson) - se_z <- 1 / sqrt(n_complete - 3) - z_crit <- qnorm(0.975) - ci_z_lower <- fisher_z - z_crit * se_z - ci_z_upper <- fisher_z + z_crit * se_z - ci_r_lower <- tanh(ci_z_lower) - ci_r_upper <- tanh(ci_z_upper) - # Spearman–Brown/Cronbach's alpha for k = 2 - alpha_sb <- (2 * r_pearson) / (1 + r_pearson) - alpha_lower <- (2 * ci_r_lower) / (1 + ci_r_lower) - alpha_upper <- (2 * ci_r_upper) / (1 + ci_r_upper) - - two_item_section <- paste0( - "

<h2>Two-Item Reliability Summary</h2>", - "<p>Pearson r: ", fmt5(r_pearson), " (95% CI: [", fmt5(ci_r_lower), ", ", fmt5(ci_r_upper), "])</p>", - "<p>Spearman–Brown / Cronbach's ", intToUtf8(945), ": ", fmt5(alpha_sb), - " (95% CI: [", fmt5(alpha_lower), ", ", fmt5(alpha_upper), "])</p>
" - ) -} - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -html_output <- paste0( - "EHI Reliability Analysis", - "

<h1>EHI Reliability Analysis</h1>", - two_item_section, - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<p>Maximum split half reliability: ", fmt5(split_half$maxrb), "</p>", - - "<h2>Item-Level Statistics</h2>
", - kable(alpha_result$item.stats, format = "html"), - - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi1/e1 - reliability ehi_20251029094235.r b/.history/eohi1/e1 - reliability ehi_20251029094235.r deleted file mode 100644 index cae6d8c..0000000 --- a/.history/eohi1/e1 - reliability ehi_20251029094235.r +++ /dev/null @@ -1,80 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -options(scipen = 999) - -df <- read.csv("ehi1.csv") -library(psych) -library(knitr) - -# fixed-decimal formatter (five decimals) -fmt5 <- function(x) formatC(x, format = "f", digits = 5) - -# Select the 4 variables for reliability analysis -reliability_data <- df[complete.cases(df[, c("eohiDGEN_mean", "ehi_global_mean")]), - c("eohiDGEN_mean", "ehi_global_mean")] - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) - -# Split-half reliability -split_half <- splitHalf(reliability_data) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") - -# Two-item reliability summary (if applicable) -two_item_section <- "" -if (ncol(reliability_data) == 2) { - n_complete <- sum(complete.cases(reliability_data)) - r_pearson <- cor(reliability_data[, 1], reliability_data[, 2], use = "complete.obs", method = "pearson") - # Fisher z CI for r - fisher_z <- atanh(r_pearson) - se_z <- 1 / sqrt(n_complete - 3) - z_crit <- qnorm(0.975) - ci_z_lower <- fisher_z - z_crit * se_z - ci_z_upper <- fisher_z + z_crit * se_z - ci_r_lower <- tanh(ci_z_lower) - ci_r_upper <- tanh(ci_z_upper) - # Spearman–Brown/Cronbach's alpha for k = 2 - alpha_sb <- (2 * r_pearson) / (1 + r_pearson) - alpha_lower <- (2 * ci_r_lower) / (1 + ci_r_lower) - alpha_upper <- (2 * ci_r_upper) / (1 + ci_r_upper) - - two_item_section <- paste0( - "

<h2>Two-Item Reliability Summary</h2>", - "<p>Pearson r: ", fmt5(r_pearson), " (95% CI: [", fmt5(ci_r_lower), ", ", fmt5(ci_r_upper), "])</p>", - "<p>Spearman–Brown / Cronbach's ", intToUtf8(945), ": ", fmt5(alpha_sb), - " (95% CI: [", fmt5(alpha_lower), ", ", fmt5(alpha_upper), "])</p>
" - ) -} - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -html_output <- paste0( - "EHI Reliability Analysis", - "

<h1>EHI Reliability Analysis</h1>", - two_item_section, - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<p>Maximum split half reliability: ", fmt5(split_half$maxrb), "</p>", - - "<h2>Item-Level Statistics</h2>
", - kable(alpha_result$item.stats, format = "html"), - - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi1/e1 - reliability ehi_20251029094336.r b/.history/eohi1/e1 - reliability ehi_20251029094336.r deleted file mode 100644 index adbc326..0000000 --- a/.history/eohi1/e1 - reliability ehi_20251029094336.r +++ /dev/null @@ -1,88 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -options(scipen = 999) - -df <- read.csv("ehi1.csv") -library(psych) -library(knitr) - -# fixed-decimal formatter (five decimals) -fmt5 <- function(x) formatC(x, format = "f", digits = 5) - -# Select the 4 variables for reliability analysis -reliability_data <- df[complete.cases(df[, c("eohiDGEN_mean", "ehi_global_mean")]), - c("eohiDGEN_mean", "ehi_global_mean")] - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) - -# Split-half reliability -split_half <- splitHalf(reliability_data) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") - -# Two-item reliability summary (if applicable) -two_item_section <- "" -if (ncol(reliability_data) == 2) { - n_complete <- sum(complete.cases(reliability_data)) - r_pearson <- cor(reliability_data[, 1], reliability_data[, 2], use = "complete.obs", method = "pearson") - r_spearman <- suppressWarnings(cor(reliability_data[, 1], reliability_data[, 2], use = "complete.obs", method = "spearman")) - # Fisher z CI for r - fisher_z <- atanh(r_pearson) - se_z <- 1 / sqrt(n_complete - 3) - z_crit <- qnorm(0.975) - ci_z_lower <- fisher_z - z_crit * se_z - ci_z_upper <- fisher_z + z_crit * se_z - ci_r_lower <- tanh(ci_z_lower) - ci_r_upper <- tanh(ci_z_upper) - # Approximate Fisher z CI for Spearman rho (large-sample approximation) - fisher_z_s <- atanh(r_spearman) - ci_zs_lower <- fisher_z_s - z_crit * se_z - ci_zs_upper <- fisher_z_s + z_crit * se_z - ci_s_lower <- tanh(ci_zs_lower) - ci_s_upper <- tanh(ci_zs_upper) - # Spearman–Brown/Cronbach's alpha for k = 2 - alpha_sb <- (2 * r_pearson) / (1 + r_pearson) - alpha_lower <- (2 * ci_r_lower) / (1 + ci_r_lower) - alpha_upper <- (2 * ci_r_upper) / (1 + ci_r_upper) - - two_item_section <- paste0( - "

<h2>Two-Item Reliability Summary</h2>", - "<p>Pearson r: ", fmt5(r_pearson), " (95% CI: [", fmt5(ci_r_lower), ", ", fmt5(ci_r_upper), "])</p>", - "<p>Spearman r: ", fmt5(r_spearman), " (95% CI: [", fmt5(ci_s_lower), ", ", fmt5(ci_s_upper), "])</p>", - "<p>Spearman–Brown / Cronbach's ", intToUtf8(945), ": ", fmt5(alpha_sb), - " (95% CI: [", fmt5(alpha_lower), ", ", fmt5(alpha_upper), "])</p>
" - ) -} - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -html_output <- paste0( - "EHI Reliability Analysis", - "

<h1>EHI Reliability Analysis</h1>", - two_item_section, - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<p>Maximum split half reliability: ", fmt5(split_half$maxrb), "</p>", - - "<h2>Item-Level Statistics</h2>
", - kable(alpha_result$item.stats, format = "html"), - - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi1/e1 - reliability ehi_20251029094344.r b/.history/eohi1/e1 - reliability ehi_20251029094344.r deleted file mode 100644 index adbc326..0000000 --- a/.history/eohi1/e1 - reliability ehi_20251029094344.r +++ /dev/null @@ -1,88 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -options(scipen = 999) - -df <- read.csv("ehi1.csv") -library(psych) -library(knitr) - -# fixed-decimal formatter (five decimals) -fmt5 <- function(x) formatC(x, format = "f", digits = 5) - -# Select the 4 variables for reliability analysis -reliability_data <- df[complete.cases(df[, c("eohiDGEN_mean", "ehi_global_mean")]), - c("eohiDGEN_mean", "ehi_global_mean")] - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) - -# Split-half reliability -split_half <- splitHalf(reliability_data) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") - -# Two-item reliability summary (if applicable) -two_item_section <- "" -if (ncol(reliability_data) == 2) { - n_complete <- sum(complete.cases(reliability_data)) - r_pearson <- cor(reliability_data[, 1], reliability_data[, 2], use = "complete.obs", method = "pearson") - r_spearman <- suppressWarnings(cor(reliability_data[, 1], reliability_data[, 2], use = "complete.obs", method = "spearman")) - # Fisher z CI for r - fisher_z <- atanh(r_pearson) - se_z <- 1 / sqrt(n_complete - 3) - z_crit <- qnorm(0.975) - ci_z_lower <- fisher_z - z_crit * se_z - ci_z_upper <- fisher_z + z_crit * se_z - ci_r_lower <- tanh(ci_z_lower) - ci_r_upper <- tanh(ci_z_upper) - # Approximate Fisher z CI for Spearman rho (large-sample approximation) - fisher_z_s <- atanh(r_spearman) - ci_zs_lower <- fisher_z_s - z_crit * se_z - ci_zs_upper <- fisher_z_s + z_crit * se_z - ci_s_lower <- tanh(ci_zs_lower) - ci_s_upper <- tanh(ci_zs_upper) - # Spearman–Brown/Cronbach's alpha for k = 2 - alpha_sb <- (2 * r_pearson) / (1 + r_pearson) - alpha_lower <- (2 * ci_r_lower) / (1 + ci_r_lower) - alpha_upper <- (2 * ci_r_upper) / (1 + ci_r_upper) - - two_item_section <- paste0( - "

<h2>Two-Item Reliability Summary</h2>", - "<p>Pearson r: ", fmt5(r_pearson), " (95% CI: [", fmt5(ci_r_lower), ", ", fmt5(ci_r_upper), "])</p>", - "<p>Spearman r: ", fmt5(r_spearman), " (95% CI: [", fmt5(ci_s_lower), ", ", fmt5(ci_s_upper), "])</p>", - "<p>Spearman–Brown / Cronbach's ", intToUtf8(945), ": ", fmt5(alpha_sb), - " (95% CI: [", fmt5(alpha_lower), ", ", fmt5(alpha_upper), "])</p>
" - ) -} - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -html_output <- paste0( - "EHI Reliability Analysis", - "

<h1>EHI Reliability Analysis</h1>", - two_item_section, - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<p>Maximum split half reliability: ", fmt5(split_half$maxrb), "</p>", - - "<h2>Item-Level Statistics</h2>
", - kable(alpha_result$item.stats, format = "html"), - - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi1/e1 - reliability ehi_20251029094408.r b/.history/eohi1/e1 - reliability ehi_20251029094408.r deleted file mode 100644 index adbc326..0000000 --- a/.history/eohi1/e1 - reliability ehi_20251029094408.r +++ /dev/null @@ -1,88 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -options(scipen = 999) - -df <- read.csv("ehi1.csv") -library(psych) -library(knitr) - -# fixed-decimal formatter (five decimals) -fmt5 <- function(x) formatC(x, format = "f", digits = 5) - -# Select the 4 variables for reliability analysis -reliability_data <- df[complete.cases(df[, c("eohiDGEN_mean", "ehi_global_mean")]), - c("eohiDGEN_mean", "ehi_global_mean")] - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) - -# Split-half reliability -split_half <- splitHalf(reliability_data) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") - -# Two-item reliability summary (if applicable) -two_item_section <- "" -if (ncol(reliability_data) == 2) { - n_complete <- sum(complete.cases(reliability_data)) - r_pearson <- cor(reliability_data[, 1], reliability_data[, 2], use = "complete.obs", method = "pearson") - r_spearman <- suppressWarnings(cor(reliability_data[, 1], reliability_data[, 2], use = "complete.obs", method = "spearman")) - # Fisher z CI for r - fisher_z <- atanh(r_pearson) - se_z <- 1 / sqrt(n_complete - 3) - z_crit <- qnorm(0.975) - ci_z_lower <- fisher_z - z_crit * se_z - ci_z_upper <- fisher_z + z_crit * se_z - ci_r_lower <- tanh(ci_z_lower) - ci_r_upper <- tanh(ci_z_upper) - # Approximate Fisher z CI for Spearman rho (large-sample approximation) - fisher_z_s <- atanh(r_spearman) - ci_zs_lower <- fisher_z_s - z_crit * se_z - ci_zs_upper <- fisher_z_s + z_crit * se_z - ci_s_lower <- tanh(ci_zs_lower) - ci_s_upper <- tanh(ci_zs_upper) - # Spearman–Brown/Cronbach's alpha for k = 2 - alpha_sb <- (2 * r_pearson) / (1 + r_pearson) - alpha_lower <- (2 * ci_r_lower) / (1 + ci_r_lower) - alpha_upper <- (2 * ci_r_upper) / (1 + ci_r_upper) - - two_item_section <- paste0( - "

Two-Item Reliability Summary

", - "

Pearson r: ", fmt5(r_pearson), " (95% CI: [", fmt5(ci_r_lower), ", ", fmt5(ci_r_upper), "])

", - "

Spearman r: ", fmt5(r_spearman), " (95% CI: [", fmt5(ci_s_lower), ", ", fmt5(ci_s_upper), "])

", - "

Spearman–Brown / Cronbach's ", intToUtf8(945), ": ", fmt5(alpha_sb), - " (95% CI: [", fmt5(alpha_lower), ", ", fmt5(alpha_upper), "])

" - ) -} - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -html_output <- paste0( - "EHI Reliability Analysis", - "

EHI Reliability Analysis

", - two_item_section, - - "

Cronbach's Alpha

", - kable(alpha_result$total, format = "html"), - - "

Split-Half Reliability

", - "

Maximum split half reliability: ", fmt5(split_half$maxrb), "

", - - "

Item-Level Statistics

", - kable(alpha_result$item.stats, format = "html"), - - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi1/minimal_test_20251004194428.rmd b/.history/eohi1/minimal_test_20251004194428.rmd deleted file mode 100644 index 5a984d7..0000000 --- a/.history/eohi1/minimal_test_20251004194428.rmd +++ /dev/null @@ -1,16 +0,0 @@ ---- -title: "Minimal Test" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE) -``` - -# Test - -```{r} -library(tidyverse) -data <- read.csv("exp1.csv") -print("Success!") -``` diff --git a/.history/eohi1/minimal_test_20251004194431.rmd b/.history/eohi1/minimal_test_20251004194431.rmd deleted file mode 100644 index 5a984d7..0000000 --- a/.history/eohi1/minimal_test_20251004194431.rmd +++ /dev/null @@ -1,16 +0,0 @@ ---- -title: "Minimal Test" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE) -``` - -# Test - -```{r} -library(tidyverse) -data <- read.csv("exp1.csv") -print("Success!") -``` diff --git a/.history/eohi1/minimal_test_20251004194608.rmd b/.history/eohi1/minimal_test_20251004194608.rmd deleted file mode 100644 index 98fc5a2..0000000 --- a/.history/eohi1/minimal_test_20251004194608.rmd +++ /dev/null @@ -1,17 +0,0 @@ ---- -title: "Minimal Test" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = FALSE, warning = FALSE, message = FALSE) -``` - -# Test - -```{r} -library(tidyverse) -data <- read.csv("exp1.csv") -print("Success!") -``` - diff --git a/.history/eohi1/minimal_test_20251004194638.rmd b/.history/eohi1/minimal_test_20251004194638.rmd deleted file mode 100644 index 0519ecb..0000000 --- a/.history/eohi1/minimal_test_20251004194638.rmd +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251003132154.r b/.history/eohi1/mixed anova - DGEN_20251003132154.r deleted file mode 100644 index e69de29..0000000 diff --git a/.history/eohi1/mixed anova - DGEN_20251003132235.r b/.history/eohi1/mixed anova - DGEN_20251003132235.r deleted file mode 100644 index f497db6..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251003132235.r +++ /dev/null @@ -1,21 +0,0 @@ -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251003132528.r b/.history/eohi1/mixed anova - DGEN_20251003132528.r deleted file mode 100644 index 4a07dbe..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251003132528.r +++ /dev/null @@ -1,671 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) 
-library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required DGEN variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = 
round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - 
bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity 
Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = 
"bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - 
if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -print("\n=== ANALYSIS COMPLETE ===") -print("All significant and marginal effects have been analyzed with Cohen's d calculations.") \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251003132534.r b/.history/eohi1/mixed anova - DGEN_20251003132534.r deleted file mode 100644 index 4a07dbe..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251003132534.r +++ /dev/null @@ -1,671 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic 
information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required DGEN variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] -print(paste("Data after removing missing values:", 
paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate 
F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, 
long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", 
"ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION 
ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. 
TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - 
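# Caveat on the d values printed above: cohen.d(past_data, future_data)
# treats the two vectors as independent groups, but TIME is within-subjects
# here, so a paired standardized difference (e.g. effsize::cohen.d with
# paired = TRUE on participant-matched vectors) would generally differ.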
cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -print("\n=== ANALYSIS COMPLETE ===") -print("All significant and marginal effects have been analyzed with Cohen's d calculations.") \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251003132751.r b/.history/eohi1/mixed anova - DGEN_20251003132751.r deleted file mode 100644 index 4a07dbe..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251003132751.r +++ /dev/null @@ -1,671 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required DGEN variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall 
descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
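# A quick worked example of the 1.5 * IQR fence used in the outlier summary
# above (toy vector, values made up):
x <- c(1, 2, 2, 3, 3, 3, 4, 50)
q <- quantile(x, c(0.25, 0.75))            # q1 = 2, q3 = 3.25 here
fences <- c(q[1] - 1.5 * diff(q), q[2] + 1.5 * diff(q))
sum(x < fences[1] | x > fences[2])         # 1: only the value 50 lies outside
# 4. 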
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Bootstrap function for Hartley's F-max test critical values -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get the unique group levels - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once, extracting each group's scores as a plain numeric vector - # (tibble-style data[rows, col] subsetting keeps a one-column tibble, which would break sample() below) - group_data_list <- map(groups, ~ { - group_data <- data[[response_var]][data[[group_var]] == .x] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - 
bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity 
Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = 
"bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - 
if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. 
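# For reference, with two independent groups effsize::cohen.d() under its
# default settings computes the pooled-standard-deviation form sketched below
# (toy vectors, values made up):
g1 <- c(4, 5, 6, 5)
g2 <- c(2, 3, 4, 3)
sp <- sqrt(((length(g1) - 1) * var(g1) + (length(g2) - 1) * var(g2)) /
             (length(g1) + length(g2) - 2))
(mean(g1) - mean(g2)) / sp   # should match effsize::cohen.d(g1, g2)$estimate
# 4. 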
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -print("\n=== ANALYSIS COMPLETE ===") -print("All significant and marginal effects have been analyzed with Cohen's d calculations.") \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006125959.r b/.history/eohi1/mixed anova - DGEN_20251006125959.r deleted file mode 100644 index 4a07dbe..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006125959.r +++ /dev/null @@ -1,671 +0,0 @@
diff --git a/.history/eohi1/mixed anova - DGEN_20251006150203.r b/.history/eohi1/mixed anova - DGEN_20251006150203.r deleted file mode 100644 index 2fa76d8..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006150203.r +++ /dev/null @@ -1,673 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required DGEN variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall 
descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
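# A quick illustration of nortest::ad.test() behaviour on simulated data
# (toy draws; exact p-values vary with the seed):
library(nortest)
set.seed(42)
ad.test(rnorm(200))$p.value   # normal sample: typically a large p-value
ad.test(rexp(200))$p.value    # skewed sample: p-value near zero
# 4. 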
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Bootstrap function for Hartley's F-max test critical values -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get the unique group levels - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once, extracting each group's scores as a plain numeric vector - # (tibble-style data[rows, col] subsetting keeps a one-column tibble, which would break sample() below) - group_data_list <- map(groups, ~ { - group_data <- data[[response_var]][data[[group_var]] == .x] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - 
bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity 
Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = 
"bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - 
if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -print("\n=== ANALYSIS COMPLETE ===") -print("All significant and marginal effects have been analyzed with Cohen's d calculations.") - -# interaction plots \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006150325.r b/.history/eohi1/mixed anova - DGEN_20251006150325.r deleted file mode 100644 index ac79773..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006150325.r +++ /dev/null @@ -1,667 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - 
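# --- Editor's sketch (an addition, not part of the deleted snapshot): the
# reshape further below builds a lookup table (domain_mapping) and joins it.
# Assuming the past|fut + Pref|Pers|Val|Life naming scheme of the eight *_DGEN
# columns, an equivalent, more compact route lets pivot_longer() split the
# variable names directly:
long_data_alt <- data %>%
  select(pID, ResponseId, TEMPORAL_DO,
         matches("^(past|fut)(Pref|Pers|Val|Life)_DGEN$")) %>%
  pivot_longer(
    cols = -c(pID, ResponseId, TEMPORAL_DO),
    names_pattern = "(past|fut)(Pref|Pers|Val|Life)_DGEN",
    names_to = c("time", "domain"),
    values_to = "DGEN_SCORE"
  ) %>%
  mutate(
    TIME = factor(time, levels = c("past", "fut"), labels = c("Past", "Future")),
    DOMAIN = factor(domain, levels = c("Pref", "Pers", "Val", "Life"),
                    labels = c("Preferences", "Personality", "Values", "Life"))
  ) %>%
  filter(!is.na(DGEN_SCORE))
# -----------------------------------------------------------------------------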
-# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. 
Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME, data = pick(DGEN_SCORE, TIME))$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME, data = pick(DGEN_SCORE, TIME))$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN, data = pick(DGEN_SCORE, DOMAIN))$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN, data = pick(DGEN_SCORE, DOMAIN))$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) /
min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once, as plain numeric vectors so sample() below resamples values - # (double-bracket column extraction avoids returning a one-column tibble) - group_data_list <- map(groups, ~ { - group_data <- data[[response_var]][data[[group_var]] == .x] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) {
- print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) 
-print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ 
TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. 
TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - 
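# --- Editor's aside (an addition, not part of the deleted snapshot): TIME is
# a within-subjects factor, so the cohen.d() call above is an independent-
# samples d. A paired alternative for the same cell, assuming each pID
# contributes one Past and one Future score here:
cell_wide <- long_data_clean %>%
  filter(TEMPORAL_DO == temporal_do_level, DOMAIN == domain_level) %>%
  select(pID, TIME, DGEN_SCORE) %>%
  pivot_wider(names_from = TIME, values_from = DGEN_SCORE) %>%
  drop_na(Past, Future)
cat(sprintf("  Paired Cohen's d (sketch): %.5f\n",
            cohen.d(cell_wide$Past, cell_wide$Future, paired = TRUE)$estimate))
# -----------------------------------------------------------------------------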
cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -print("\n=== ANALYSIS COMPLETE ===") -print("All significant and marginal effects have been analyzed with Cohen's d calculations.") - -# interaction plots \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006150338.r b/.history/eohi1/mixed anova - DGEN_20251006150338.r deleted file mode 100644 index e4c5f8c..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006150338.r +++ /dev/null @@ -1,662 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = 
round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME, data = pick(DGEN_SCORE, TIME))$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME, data = pick(DGEN_SCORE, TIME))$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN, data = pick(DGEN_SCORE, DOMAIN))$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN, data = pick(DGEN_SCORE, DOMAIN))$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once, as plain numeric vectors so sample() below resamples values - # (double-bracket column extraction avoids returning a one-column tibble) - group_data_list <- map(groups, ~ { - group_data <- data[[response_var]][data[[group_var]] == .x] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { -
bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity 
Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = 
"bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - 
if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -print("\n=== ANALYSIS COMPLETE ===") -print("All significant and marginal effects have been analyzed with Cohen's d calculations.") - -# interaction plots \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006150343.r b/.history/eohi1/mixed anova - DGEN_20251006150343.r deleted file mode 100644 index 04f6eff..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006150343.r +++ /dev/null @@ -1,661 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - 
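# (Review note, not part of the deleted file.) A minimal, self-contained
# sketch of why these snapshots set options(contrasts = c("contr.sum",
# "contr.poly")) before requesting Type III tests; the data frame `toy` and
# the factors A and B are hypothetical stand-ins.
library(car)
set.seed(1)
toy <- data.frame(
  y = rnorm(40),
  A = factor(rep(c("a1", "a2"), each = 20)),
  B = factor(rep(c("b1", "b2"), times = 20))
)
options(contrasts = c("contr.sum", "contr.poly"))
fit <- lm(y ~ A * B, data = toy)
# Under sum-to-zero coding, the Type III main-effect tests average over the
# levels of the other factor; under the default contr.treatment they would
# instead describe simple effects at the reference level.
Anova(fit, type = 3)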
-# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique 
groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY 
CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS 
===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# 
============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. 
TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - 
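# (Review note, not part of the deleted file.) cohen.d() is called above with
# two independent vectors, so the reported d ignores the pairing of Past and
# Future scores within participants even though TIME is a within-subjects
# factor. If a repeated-measures d were preferred, effsize::cohen.d() accepts
# paired = TRUE, provided both vectors are ordered by the same pID so that
# elements are matched case by case.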
cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -print("\n=== ANALYSIS COMPLETE ===") -print("All significant and marginal effects have been analyzed with Cohen's d calculations.") - -# interaction plots \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006150351.r b/.history/eohi1/mixed anova - DGEN_20251006150351.r deleted file mode 100644 index 7576ff2..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006150351.r +++ /dev/null @@ -1,654 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = 
round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & 
!is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- 
mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction 
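# (Review note, not part of the deleted file.) In the interaction blocks
# below, "simple effects" are pairwise emmeans contrasts of one factor
# computed separately at each level of the other (the by = argument), and the
# Bonferroni adjustment is applied within each by-group rather than across
# the whole table. Note also that print("\n=== ... ===") renders the "\n"
# literally; cat("\n=== ... ===\n") would be needed for an actual blank line,
# a pattern repeated throughout these snapshots.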
-print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ 
- data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -print("\n=== ANALYSIS COMPLETE ===") -print("All significant and marginal effects have been analyzed with Cohen's d calculations.") - -# interaction plots \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006150356.r b/.history/eohi1/mixed anova - DGEN_20251006150356.r deleted file mode 100644 index 773c842..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006150356.r +++ /dev/null @@ -1,654 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - 
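-# Optional guard (untested sketch, not in the original script): fail fast if
-# the CSV is absent, e.g. when the hard-coded setwd() path above does not
-# exist on another machine:
-# if (!file.exists("exp1.csv")) stop("exp1.csv not found in ", getwd())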
-# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), 
.groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Extract each group's response values once, as a plain vector (not a one-column tibble) - group_data_list <- map(groups, ~ { - group_data <- data[[response_var]][data[[group_var]] == .x] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2
levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") 
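-# The script ends with a bare "# interaction plots" placeholder; a minimal,
-# untested sketch that could fill it, using the aov_model created above
-# (emmip() returns a ggplot object, so ggplot2 layers can be added):
-# emmip(aov_model, TEMPORAL_DO ~ TIME | DOMAIN, CIs = TRUE) +
-#   ggplot2::labs(y = "Estimated marginal mean DGEN score")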
-time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - 
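-# Caveat on the calculations below (a note added here, not part of the original
-# analysis): cohen.d() is applied to the two raw vectors as independent samples.
-# For within-subject TIME contrasts a paired estimate may be more appropriate;
-# effsize supports this, assuming equal-length vectors aligned by participant:
-# cohen.d(past_scores, future_scores, paired = TRUE) # past_scores/future_scores are hypothetical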
-print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -print("\n=== ANALYSIS COMPLETE ===") -print("All significant and marginal effects have been analyzed with Cohen's d calculations.") - -# interaction plots \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006150414.r b/.history/eohi1/mixed anova - DGEN_20251006150414.r deleted file mode 100644 index f7944d0..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006150414.r +++ /dev/null @@ -1,642 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, 
pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# 
============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - 
f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique group levels - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Extract each group's response values once, as a plain vector (not a one-column tibble) - group_data_list <- map(groups, ~ { - group_data <- data[[response_var]][data[[group_var]] == .x] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections
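-# Argument notes for the call below: dv = the response, wid = the subject
-# identifier, between/within = the design factors, type = 3 requests Type-III
-# sums of squares (matching the contr.sum contrasts set at the top of the
-# script), and detailed = TRUE adds SSn/SSd columns to the ANOVA table.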
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") 
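-# An equivalent fit via afex (loaded at the top) is also possible; an untested
-# sketch assuming the same column names, noted because aov_ez() applies the
-# sum-contrast / Type-III setup automatically:
-# afex_model <- afex::aov_ez(id = "pID", dv = "DGEN_SCORE", data = long_data_clean,
-#                            between = "TEMPORAL_DO", within = c("TIME", "DOMAIN"),
-#                            type = 3)
-# emmeans(afex_model, ~ TIME) # comparable marginal means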
-time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - 
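-# Alternative worth noting (untested sketch): the effectsize package loaded at
-# the top reports the same standardized difference with a confidence interval:
-# effectsize::cohens_d(data1, data2) # data1/data2 as built inside the helper below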
-print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -print("\n=== ANALYSIS COMPLETE ===") -print("All significant and marginal effects have been analyzed with Cohen's d calculations.") - -# interaction plots \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006150433.r b/.history/eohi1/mixed anova - DGEN_20251006150433.r deleted file mode 100644 index 21803bb..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006150433.r +++ /dev/null @@ -1,725 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, 
pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# 
============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - 
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # Calculate F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# More efficient bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!rlang::sym(group_var)) %>%
-    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once
-  group_data_list <- map(groups, ~ {
-    group_data <- data[data[[group_var]] == .x, response_var]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-print("=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
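# Reviewer sketch (not part of the deleted snapshot): the bootstrap helper
# can also be run on a single TIME x DOMAIN cell for a spot check, outside
# the summarise() pipeline. as.data.frame() is used so single-column
# subsetting drops to a plain vector inside the helper. Assumes
# long_data_clean and bootstrap_hartley_critical from above.
one_cell <- as.data.frame(subset(long_data_clean, TIME == "Past" & DOMAIN == "Values"))
set.seed(123)
print(bootstrap_hartley_critical(one_cell, "TEMPORAL_DO", "DGEN_SCORE", n_iter = 2000))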
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check if design is balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance %in% c(0, 1))) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN combinations")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = DGEN_SCORE,
-                             wid = pID,
-                             between = TEMPORAL_DO,
-                             within = .(TIME, DOMAIN),
-                             type = 3,
-                             detailed = TRUE)
-
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL # Reset row numbers to be sequential
-print(anova_output)
-
-# Show Mauchly's test for sphericity
-print("Mauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print("Greenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  cat("\n=== CORRECTED F-TESTS ===\n")
-  for(i in seq_len(nrow(corrected_df))) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf(" Original: F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  print("\nNote: Sphericity corrections not needed (sphericity assumption met)")
-}
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Extract generalized eta squared from ezANOVA (already calculated)
-effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")]
-effect_sizes$ges <- round(effect_sizes$ges, 5)
-print("Generalized Eta Squared:")
-print(effect_sizes)
-
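# Reviewer sketch (not part of the deleted snapshot): afex and effectsize
# are loaded above but otherwise unused; one way they could complement the
# ges column is a partial eta squared per effect. Assumes long_data_clean
# from above.
afex_model <- afex::aov_ez(id = "pID", dv = "DGEN_SCORE",
                           data = long_data_clean,
                           between = "TEMPORAL_DO",
                           within = c("TIME", "DOMAIN"))
print(effectsize::eta_squared(afex_model, partial = TRUE))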
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output)
-aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                 data = long_data_clean)
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print("Estimated Marginal Means:")
-print(time_emmeans)
-print("\nPairwise Contrasts:")
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
-print("Estimated Marginal Means:")
-print(domain_emmeans)
-print("\nPairwise Contrasts:")
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TEMPORAL_DO × TIME Interaction
-print("\n=== TEMPORAL_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO:")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each TIME:")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# TIME × DOMAIN Interaction
-print("\n=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TIME:")
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-
-print("\nSimple Effects of TIME within each DOMAIN:")
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(time_domain_simple2)
-
-# TEMPORAL_DO × DOMAIN Interaction
-print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===")
-temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN)
-print("Estimated Marginal Means:")
-print(temporal_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:")
-temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:")
-temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(temporal_domain_simple2)
-
-# =============================================================================
-# THREE-WAY INTERACTION ANALYSIS
-# =============================================================================
-
-print("\n=== THREE-WAY INTERACTION ANALYSIS ===")
-three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(three_way_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:")
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni")
-print(three_way_contrasts)
-
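# Reviewer sketch (not part of the deleted snapshot): beyond the simple
# effects above, emmeans can test the interaction contrast itself (a
# difference of differences), which is often the most direct way to
# describe a two-way interaction. Assumes aov_model from above.
emm_td <- emmeans(aov_model, ~ TIME * DOMAIN)
print(contrast(emm_td, interaction = c("pairwise", "pairwise")))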
-print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. 
-# =============================================================================
-# 3. TEMPORAL_DO × DOMAIN INTERACTION
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===")
-
-# Get simple effects of TEMPORAL_DO within each DOMAIN
-temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2)
-calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE")
-
-# Get simple effects of DOMAIN within each TEMPORAL_DO
-temporal_domain_simple_df <- as.data.frame(temporal_domain_simple)
-calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE")
-
-# =============================================================================
-# 4. THREE-WAY INTERACTION COHEN'S D
-# =============================================================================
-
-print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===")
-
-# Get pairwise comparisons for the three-way interaction
-three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-print(three_way_contrasts_df)
-
-# Calculate Cohen's d for significant three-way interaction effects
-print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-
-# Extract significant comparisons (p < 0.05)
-significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ]
-
-if(nrow(significant_three_way) > 0) {
-  for(i in seq_len(nrow(significant_three_way))) {
-    comparison <- significant_three_way[i, ]
-
-    # Extract the grouping variables
-    temporal_do_level <- as.character(comparison$TEMPORAL_DO)
-    domain_level <- as.character(comparison$DOMAIN)
-
-    # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination
-    past_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$TEMPORAL_DO == temporal_do_level &
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$TIME == "Past"
-    ]
-
-    future_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$TEMPORAL_DO == temporal_do_level &
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$TIME == "Future"
-    ]
-
-    if(length(past_data) > 0 && length(future_data) > 0) {
-      # Calculate Cohen's d using effsize package
-      cohens_d_result <- cohen.d(past_data, future_data)
-
-      cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level))
-      cat(sprintf(" Past vs Future comparison\n"))
-      cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data)))
-      cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate))
-      cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude))
-      cat(sprintf(" p-value: %.5f\n", comparison$p.value))
-      cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate))
-      cat("\n")
-    }
-  }
-} else {
-  cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n")
-}
-
-# =============================================================================
-# INTERACTION PLOTS
-# =============================================================================
-
-print("=== INTERACTION PLOTS ===")
-
-# Define color palette for DOMAIN (4 levels)
-domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F",
-                   "Values" = "#FFB000", "Life" = "#FE6100")
-
-# TEMPORAL_DO × DOMAIN INTERACTION PLOT
-# Create estimated marginal means for TEMPORAL_DO × DOMAIN
-emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN)
-
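# Reviewer sketch (not part of the deleted snapshot): before hand-building
# the ggplot below, emmeans can draw a quick diagnostic version of the same
# cell means. Assumes aov_model and emm_temporal_domain from above.
print(plot(emm_temporal_domain))
emmip(aov_model, DOMAIN ~ TEMPORAL_DO, CIs = TRUE)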
-# Prepare emmeans data frame
-emmeans_temporal_domain <- emm_temporal_domain %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life"))
-  )
-
-# Prepare raw data for plotting
-iPlot_temporal_domain <- long_data_clean %>%
-  dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life"))
-  )
-
-# Create TEMPORAL_DO × DOMAIN interaction plot
-interaction_plot_temporal_domain <- ggplot() +
-  geom_point(
-    data = iPlot_temporal_domain,
-    aes(x = TEMPORAL_DO, y = DGEN_SCORE, color = DOMAIN),
-    position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.2),
-    alpha = 0.3, shape = 16
-  ) +
-  geom_rect(
-    data = emmeans_temporal_domain,
-    aes(
-      xmin = as.numeric(TEMPORAL_DO) - 0.15 + (as.numeric(DOMAIN) - 2.5) * 0.25,
-      xmax = as.numeric(TEMPORAL_DO) + 0.15 + (as.numeric(DOMAIN) - 2.5) * 0.25,
-      ymin = ci_lower, ymax = ci_upper,
-      fill = DOMAIN
-    ),
-    color = "black", alpha = 0.5
-  ) +
-  geom_segment(
-    data = emmeans_temporal_domain,
-    aes(
-      x = as.numeric(TEMPORAL_DO) + (as.numeric(DOMAIN) - 2.5) * 0.25,
-      xend = as.numeric(TEMPORAL_DO) + (as.numeric(DOMAIN) - 2.5) * 0.25,
-      y = ci_lower, yend = ci_upper
-    ),
-    color = "black"
-  ) +
-  geom_point(
-    data = emmeans_temporal_domain,
-    aes(
-      x = as.numeric(TEMPORAL_DO) + (as.numeric(DOMAIN) - 2.5) * 0.25,
-      y = plot_mean,
-      color = DOMAIN,
-      shape = DOMAIN
-    ),
-    size = 2.5, stroke = 0.8, fill = "black"
-  ) +
-  labs(
-    x = "TEMPORAL_DO", y = "DGEN Score",
-    title = "TEMPORAL_DO × DOMAIN Interaction"
-  ) +
-  scale_color_manual(name = "DOMAIN", values = domain_colors) +
-  scale_fill_manual(name = "DOMAIN", values = domain_colors) +
-  scale_shape_manual(name = "DOMAIN", values = c(21, 22, 23, 24)) +
-  theme_minimal() +
-  theme(
-    axis.text.x = element_text(angle = 0, hjust = 0.5),
-    plot.title = element_text(size = 14, hjust = 0.5)
-  )
-
-print(interaction_plot_temporal_domain)
\ No newline at end of file
diff --git a/.history/eohi1/mixed anova - DGEN_20251006150451.r b/.history/eohi1/mixed anova - DGEN_20251006150451.r
deleted file mode 100644
index 21803bb..0000000
--- a/.history/eohi1/mixed anova - DGEN_20251006150451.r
+++ /dev/null
@@ -1,725 +0,0 @@
[725 deleted lines omitted: this .history snapshot (blob 21803bb) is identical to the script shown above]
"pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), 
.groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show 
sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") 
-temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- 
as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# 
\ No newline at end of file
diff --git a/.history/eohi1/mixed anova - DGEN_20251006150515.r b/.history/eohi1/mixed anova - DGEN_20251006150515.r
deleted file mode 100644
index 21803bb..0000000
--- a/.history/eohi1/mixed anova - DGEN_20251006150515.r
+++ /dev/null
@@ -1,725 +0,0 @@
[725 deleted lines omitted: another identical snapshot (blob 21803bb) of the same script]
"pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), 
.groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show 
sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") 
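# A minimal illustrative sketch (not part of the original analysis) of the
# Bonferroni adjustment that pairs(..., adjust = "bonferroni") applies to the
# contrasts above: each raw p-value is multiplied by the number of
# comparisons m and capped at 1, which is what p.adjust() does. The p-values
# here are hypothetical placeholders, not results from this data set.
raw_p <- c(0.004, 0.020, 0.180)             # hypothetical raw contrast p-values
m <- length(raw_p)                          # number of pairwise comparisons
manual_bonf <- pmin(1, raw_p * m)           # Bonferroni by hand
stopifnot(all.equal(manual_bonf,
                    p.adjust(raw_p, method = "bonferroni")))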
-temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- 
as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# 
Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × DOMAIN interaction plot -interaction_plot_temporal_domain <- ggplot() + - geom_point( - data = iPlot_temporal_domain, - aes(x = TEMPORAL_DO, y = DGEN_SCORE, color = DOMAIN), - position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.2), - alpha = 0.3, shape = 16 - ) + - geom_rect( - data = emmeans_temporal_domain, - aes( - xmin = as.numeric(TEMPORAL_DO) - 0.15 + (as.numeric(DOMAIN) - 2.5) * 0.25, - xmax = as.numeric(TEMPORAL_DO) + 0.15 + (as.numeric(DOMAIN) - 2.5) * 0.25, - ymin = ci_lower, ymax = ci_upper, - fill = DOMAIN - ), - color = "black", alpha = 0.5 - ) + - geom_segment( - data = emmeans_temporal_domain, - aes( - x = as.numeric(TEMPORAL_DO) + (as.numeric(DOMAIN) - 2.5) * 0.25, - xend = as.numeric(TEMPORAL_DO) + (as.numeric(DOMAIN) - 2.5) * 0.25, - y = ci_lower, yend = ci_upper - ), - color = "black" - ) + - geom_point( - data = emmeans_temporal_domain, - aes( - x = as.numeric(TEMPORAL_DO) + (as.numeric(DOMAIN) - 2.5) * 0.25, - y = plot_mean, - color = DOMAIN, - shape = DOMAIN - ), - size = 2.5, stroke = 0.8, fill = "black" - ) + - labs( - x = "TEMPORAL_DO", y = "DGEN Score", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_color_manual(name = "DOMAIN", values = domain_colors) + - scale_fill_manual(name = "DOMAIN", values = domain_colors) + - scale_shape_manual(name = "DOMAIN", values = c(21, 22, 23, 24)) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 0, hjust = 0.5), - plot.title = element_text(size = 14, hjust = 0.5) - ) - -print(interaction_plot_temporal_domain) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006151447.r b/.history/eohi1/mixed anova - DGEN_20251006151447.r deleted file mode 100644 index 3e4c20e..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006151447.r +++ /dev/null @@ -1,774 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", 
"pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), 
.groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show 
sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") 
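# A minimal sketch, on simulated data, of the pooled-SD Cohen's d that
# effsize::cohen.d() is used to report in the sections below:
#   d = (m1 - m2) / s_pooled, where
#   s_pooled = sqrt(((n1 - 1)*s1^2 + (n2 - 1)*s2^2) / (n1 + n2 - 2)).
# The groups here are simulated for illustration only.
set.seed(42)
x <- rnorm(50, mean = 0.6, sd = 1)   # simulated group 1 scores
y <- rnorm(50, mean = 0.2, sd = 1)   # simulated group 2 scores
s_pooled <- sqrt(((length(x) - 1) * var(x) + (length(y) - 1) * var(y)) /
                 (length(x) + length(y) - 2))
d_manual <- (mean(x) - mean(y)) / s_pooled
d_manual   # should match effsize::cohen.d(x, y)$estimate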
-temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- 
as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# 
Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × DOMAIN interaction plot using violin plots -interaction_plot_temporal_domain <- ggplot() + - # Violin plots to show distribution - geom_violin( - data = iPlot_temporal_domain, - aes(x = TEMPORAL_DO, y = DGEN_SCORE, fill = DOMAIN), - position = position_dodge(width = 0.8), - alpha = 0.4, - color = "black", - trim = FALSE - ) + - # Add boxplot for quartiles - geom_boxplot( - data = iPlot_temporal_domain, - aes(x = TEMPORAL_DO, y = DGEN_SCORE, fill = DOMAIN), - position = position_dodge(width = 0.8), - width = 0.2, - alpha = 0.6, - outlier.shape = NA, - color = "black" - ) + - # Add emmeans with error bars - geom_errorbar( - data = emmeans_temporal_domain, - aes( - x = as.numeric(TEMPORAL_DO) + (as.numeric(DOMAIN) - 2.5) * 0.2, - ymin = ci_lower, - ymax = ci_upper, - color = DOMAIN - ), - width = 0.15, - linewidth = 1.2, - position = position_dodge(width = 0) - ) + - # Add emmeans points - geom_point( - data = emmeans_temporal_domain, - aes( - x = as.numeric(TEMPORAL_DO) + (as.numeric(DOMAIN) - 2.5) * 0.2, - y = plot_mean, - shape = DOMAIN - ), - size = 3.5, - fill = "white", - stroke = 1.5, - color = "black" - ) + - labs( - x = "TEMPORAL_DO", y = "DGEN Score", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_color_manual(name = "DOMAIN", values = domain_colors) + - scale_fill_manual(name = "DOMAIN", values = domain_colors) + - scale_shape_manual(name = "DOMAIN", values = c(21, 22, 23, 24)) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 0, hjust = 0.5), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right" - ) - -print(interaction_plot_temporal_domain) - -# Alternative: Faceted version for even clearer visualization -interaction_plot_faceted <- ggplot(iPlot_temporal_domain, - aes(x = TEMPORAL_DO, y = DGEN_SCORE, fill = DOMAIN, color = DOMAIN)) + - geom_violin(alpha = 0.4, color = "black", trim = FALSE) + - geom_boxplot(width = 0.3, alpha = 0.6, color = "black", outlier.shape = NA) + - geom_point( - data = emmeans_temporal_domain, - aes(x = TEMPORAL_DO, y = plot_mean), - shape = 23, - size = 4, - fill = "white", - color = "black", - stroke = 1.5 - ) + - geom_errorbar( - data = emmeans_temporal_domain, - aes(x = TEMPORAL_DO, ymin = ci_lower, ymax = ci_upper), - width = 0.2, - linewidth = 1, - color = "black" - ) + - facet_wrap(~DOMAIN, nrow = 1) + - labs( - x = "TEMPORAL_DO", y = "DGEN Score", - title = "TEMPORAL_DO × DOMAIN Interaction (Faceted)" - ) + - scale_fill_manual(values = domain_colors) + - scale_color_manual(values = domain_colors) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 0, hjust = 0.5), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "none", - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_faceted) \ 
No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006151454.r b/.history/eohi1/mixed anova - DGEN_20251006151454.r deleted file mode 100644 index 3e4c20e..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006151454.r +++ /dev/null @@ -1,774 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by 
between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & 
!is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - 
HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION 
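# --- Aside: recovering generalized eta squared by hand ------------------------
# A sketch of the Bakeman (2005) formula that, to my understanding, underlies
# the ges column when no factors are declared observed: SS_effect over
# SS_effect plus the summed *unique* error sums of squares (each error
# stratum counted once). Toy values only:
toy_anova <- data.frame(
  Effect = c("TEMPORAL_DO", "TIME", "DOMAIN"),
  SSn    = c(1.2, 3.4, 5.6),     # effect sums of squares (illustrative)
  SSd    = c(40.0, 25.0, 30.0)   # error-term sums of squares (illustrative)
)
ges_manual <- toy_anova$SSn / (toy_anova$SSn + sum(unique(toy_anova$SSd)))
round(ges_manual, 5)
# -----------------------------------------------------------------------------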
===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = 
%d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × 
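# --- Aside: paired vs independent Cohen's d -----------------------------------
# Follow-up to the Past-vs-Future cohen.d() loop above: TIME is within-
# subjects, and cohen.d(past_data, future_data) treats the two score sets as
# independent samples. effsize also offers a paired variant for pID-matched
# scores; a toy sketch (whether the paired estimand is the appropriate one
# here is a design question, not a correction):
library(effsize)
past_toy   <- c(4.0, 3.5, 4.2, 3.8)   # same four participants, time 1
future_toy <- c(3.6, 3.4, 3.9, 3.5)   # same four participants, time 2
cohen.d(past_toy, future_toy, paired = TRUE)
# -----------------------------------------------------------------------------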
DOMAIN interaction plot using violin plots -interaction_plot_temporal_domain <- ggplot() + - # Violin plots to show distribution - geom_violin( - data = iPlot_temporal_domain, - aes(x = TEMPORAL_DO, y = DGEN_SCORE, fill = DOMAIN), - position = position_dodge(width = 0.8), - alpha = 0.4, - color = "black", - trim = FALSE - ) + - # Add boxplot for quartiles - geom_boxplot( - data = iPlot_temporal_domain, - aes(x = TEMPORAL_DO, y = DGEN_SCORE, fill = DOMAIN), - position = position_dodge(width = 0.8), - width = 0.2, - alpha = 0.6, - outlier.shape = NA, - color = "black" - ) + - # Add emmeans with error bars - geom_errorbar( - data = emmeans_temporal_domain, - aes( - x = as.numeric(TEMPORAL_DO) + (as.numeric(DOMAIN) - 2.5) * 0.2, - ymin = ci_lower, - ymax = ci_upper, - color = DOMAIN - ), - width = 0.15, - linewidth = 1.2, - position = position_dodge(width = 0) - ) + - # Add emmeans points - geom_point( - data = emmeans_temporal_domain, - aes( - x = as.numeric(TEMPORAL_DO) + (as.numeric(DOMAIN) - 2.5) * 0.2, - y = plot_mean, - shape = DOMAIN - ), - size = 3.5, - fill = "white", - stroke = 1.5, - color = "black" - ) + - labs( - x = "TEMPORAL_DO", y = "DGEN Score", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_color_manual(name = "DOMAIN", values = domain_colors) + - scale_fill_manual(name = "DOMAIN", values = domain_colors) + - scale_shape_manual(name = "DOMAIN", values = c(21, 22, 23, 24)) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 0, hjust = 0.5), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right" - ) - -print(interaction_plot_temporal_domain) - -# Alternative: Faceted version for even clearer visualization -interaction_plot_faceted <- ggplot(iPlot_temporal_domain, - aes(x = TEMPORAL_DO, y = DGEN_SCORE, fill = DOMAIN, color = DOMAIN)) + - geom_violin(alpha = 0.4, color = "black", trim = FALSE) + - geom_boxplot(width = 0.3, alpha = 0.6, color = "black", outlier.shape = NA) + - geom_point( - data = emmeans_temporal_domain, - aes(x = TEMPORAL_DO, y = plot_mean), - shape = 23, - size = 4, - fill = "white", - color = "black", - stroke = 1.5 - ) + - geom_errorbar( - data = emmeans_temporal_domain, - aes(x = TEMPORAL_DO, ymin = ci_lower, ymax = ci_upper), - width = 0.2, - linewidth = 1, - color = "black" - ) + - facet_wrap(~DOMAIN, nrow = 1) + - labs( - x = "TEMPORAL_DO", y = "DGEN Score", - title = "TEMPORAL_DO × DOMAIN Interaction (Faceted)" - ) + - scale_fill_manual(values = domain_colors) + - scale_color_manual(values = domain_colors) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 0, hjust = 0.5), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "none", - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_faceted) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006151507.r b/.history/eohi1/mixed anova - DGEN_20251006151507.r deleted file mode 100644 index 1611a9a..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006151507.r +++ /dev/null @@ -1,736 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map 
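# --- Aside: letting position_dodge() align the summary markers ----------------
# In the interaction plots above, the emmeans points and error bars are placed
# with a hand-computed numeric offset on the x axis. A sketch of an
# alternative: keep the discrete x, map group, and reuse the violins' dodge
# width so the markers align automatically. Toy data, illustration only:
library(ggplot2)
toy <- data.frame(
  g  = rep(c("A", "B"), each = 2),
  f  = rep(c("w", "x"), times = 2),
  m  = c(3.1, 3.4, 3.0, 3.6),
  lo = c(2.9, 3.2, 2.8, 3.4),
  hi = c(3.3, 3.6, 3.2, 3.8)
)
ggplot(toy, aes(x = g, y = m, colour = f, group = f)) +
  geom_errorbar(aes(ymin = lo, ymax = hi),
                position = position_dodge(width = 0.8), width = 0.15) +
  geom_point(position = position_dodge(width = 0.8), size = 3)
# -----------------------------------------------------------------------------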
functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. 
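# --- Aside: what the pivot_longer() step produces ------------------------------
# A two-row illustration of the wide-to-long reshape above: one wide row per
# participant becomes one long row per measured variable, which domain_mapping
# then splits into TIME and DOMAIN. Toy columns, runnable as-is:
library(tidyr)
wide_toy <- data.frame(pID = c(1, 2),
                       pastPref_DGEN = c(4.0, 3.5),
                       futPref_DGEN  = c(3.8, 3.9))
pivot_longer(wide_toy, cols = -pID,
             names_to = "variable", values_to = "DGEN_SCORE")
# -----------------------------------------------------------------------------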
Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, 
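# --- Aside: an explicit-data form of the grouped Levene tests -----------------
# The leveneTest(DGEN_SCORE ~ TIME) calls above lean on the formula finding
# its variables through dplyr's data mask. Passing data= removes any doubt
# about where the formula is evaluated. Toy data, runnable as-is:
library(car)
set.seed(1)
toy <- data.frame(y = c(rnorm(10, sd = 1), rnorm(10, sd = 2)),
                  g = factor(rep(c("a", "b"), each = 10)))
lev <- leveneTest(y ~ g, data = toy)
lev$`F value`[1]; lev$`Pr(>F)`[1]
# -----------------------------------------------------------------------------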
response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, 
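# --- Aside: single- vs double-bracket extraction in the bootstrap helper ------
# pick() hands bootstrap_hartley_critical() a tibble, and tibbles never drop
# dimensions under single-bracket indexing, so data[rows, response_var] stays
# a one-column tibble. Double-bracket extraction guarantees the plain numeric
# vector that sample() expects to resample. A runnable illustration:
library(tibble)
tb <- tibble(g = c("a", "a", "b"), y = c(1, 2, 3))
tb[tb$g == "a", "y"]       # still a 2 x 1 tibble
tb[["y"]][tb$g == "a"]     # plain numeric vector: 1 2
# Whether the original form misbehaves depends on the installed tibble
# version; the double-bracket idiom is safe either way.
# -----------------------------------------------------------------------------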
DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- 
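# --- Aside: the unused afex route ----------------------------------------------
# afex is loaded above "for aov_ez" but never called. For reference, a sketch
# (not run here) of the equivalent fit; aov_ez applies a Greenhouse-Geisser
# correction by default, and its result can be passed to emmeans directly,
# which would make the separate aov() model above unnecessary:
#
#   afex_model <- aov_ez(id = "pID", dv = "DGEN_SCORE",
#                        data = long_data_clean,
#                        between = "TEMPORAL_DO",
#                        within = c("TIME", "DOMAIN"),
#                        type = 3)
#   emmeans(afex_model, ~ TIME)
#
# -----------------------------------------------------------------------------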
emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - 
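# --- Aside: F-test companions to the pairwise simple effects ------------------
# The Bonferroni-adjusted pairs() calls above test cell-by-cell differences.
# emmeans also offers joint_tests(), which gives one F test per simple effect;
# a sketch against the aov_model object defined earlier (not run here):
#
#   joint_tests(aov_model, by = "TEMPORAL_DO")
#
# Each row is a joint test of one effect within one TEMPORAL_DO level, a
# compact complement to the pairwise contrasts.
# -----------------------------------------------------------------------------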
if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. 
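# --- Aside: where the "magnitude" label comes from -----------------------------
# The interpretation string printed by the helper above is effsize's labelling
# of |d| against the conventional Cohen (1988) cutoffs, to my understanding
# 0.2 / 0.5 / 0.8. The same labelling, written out:
d <- 0.43   # illustrative estimate
cut(abs(d), breaks = c(0, 0.2, 0.5, 0.8, Inf),
    labels = c("negligible", "small", "medium", "large"), right = FALSE)
# -----------------------------------------------------------------------------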
TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# 
Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × DOMAIN interaction plot using violin plots -interaction_plot_temporal_domain <- ggplot() + - # Violin plots to show distribution - geom_violin( - data = iPlot_temporal_domain, - aes(x = TEMPORAL_DO, y = DGEN_SCORE, fill = DOMAIN), - position = position_dodge(width = 0.8), - alpha = 0.4, - color = "black", - trim = FALSE - ) + - # Add boxplot for quartiles - geom_boxplot( - data = iPlot_temporal_domain, - aes(x = TEMPORAL_DO, y = DGEN_SCORE, fill = DOMAIN), - position = position_dodge(width = 0.8), - width = 0.2, - alpha = 0.6, - outlier.shape = NA, - color = "black" - ) + - # Add emmeans with error bars - geom_errorbar( - data = emmeans_temporal_domain, - aes( - x = as.numeric(TEMPORAL_DO) + (as.numeric(DOMAIN) - 2.5) * 0.2, - ymin = ci_lower, - ymax = ci_upper, - color = DOMAIN - ), - width = 0.15, - linewidth = 1.2, - position = position_dodge(width = 0) - ) + - # Add emmeans points - geom_point( - data = emmeans_temporal_domain, - aes( - x = as.numeric(TEMPORAL_DO) + (as.numeric(DOMAIN) - 2.5) * 0.2, - y = plot_mean, - shape = DOMAIN - ), - size = 3.5, - fill = "white", - stroke = 1.5, - color = "black" - ) + - labs( - x = "TEMPORAL_DO", y = "DGEN Score", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_color_manual(name = "DOMAIN", values = domain_colors) + - scale_fill_manual(name = "DOMAIN", values = domain_colors) + - scale_shape_manual(name = "DOMAIN", values = c(21, 22, 23, 24)) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 0, hjust = 0.5), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right" - ) - -print(interaction_plot_temporal_domain) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006151514.r b/.history/eohi1/mixed anova - DGEN_20251006151514.r deleted file mode 100644 index 1611a9a..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006151514.r +++ /dev/null @@ -1,736 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# 
Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), 
.groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show 
sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") 
-temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- 
as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. 
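# -----------------------------------------------------------------------------
# A minimal sketch of what effsize::cohen.d() computes for the comparisons
# above, on simulated data (not exp1.csv): the mean difference scaled by the
# pooled standard deviation, plus a qualitative magnitude label.
library(effsize)
set.seed(42)
x1 <- rnorm(50, mean = 0.60, sd = 0.20)   # hypothetical condition 1 scores
x2 <- rnorm(50, mean = 0.50, sd = 0.20)   # hypothetical condition 2 scores
d  <- cohen.d(x1, x2)
d$estimate    # (mean(x1) - mean(x2)) / pooled SD
d$magnitude   # "negligible", "small", "medium", or "large"
# -----------------------------------------------------------------------------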
TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# 
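# -----------------------------------------------------------------------------
# The x-offsets used for the emmeans markers in the interaction plot below,
# worked through by hand (the constants are the ones in the plot code): with
# DOMAIN coded 1..4, (as.numeric(DOMAIN) - 2.5) * 0.2 spreads the four
# domains symmetrically around each TEMPORAL_DO position, matching the
# 0.8-wide dodge region of the violins (four slots of width 0.2).
(1:4 - 2.5) * 0.2   # -0.3 -0.1  0.1  0.3
# -----------------------------------------------------------------------------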
Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × DOMAIN interaction plot using violin plots -interaction_plot_temporal_domain <- ggplot() + - # Violin plots to show distribution - geom_violin( - data = iPlot_temporal_domain, - aes(x = TEMPORAL_DO, y = DGEN_SCORE, fill = DOMAIN), - position = position_dodge(width = 0.8), - alpha = 0.4, - color = "black", - trim = FALSE - ) + - # Add boxplot for quartiles - geom_boxplot( - data = iPlot_temporal_domain, - aes(x = TEMPORAL_DO, y = DGEN_SCORE, fill = DOMAIN), - position = position_dodge(width = 0.8), - width = 0.2, - alpha = 0.6, - outlier.shape = NA, - color = "black" - ) + - # Add emmeans with error bars - geom_errorbar( - data = emmeans_temporal_domain, - aes( - x = as.numeric(TEMPORAL_DO) + (as.numeric(DOMAIN) - 2.5) * 0.2, - ymin = ci_lower, - ymax = ci_upper, - color = DOMAIN - ), - width = 0.15, - linewidth = 1.2, - position = position_dodge(width = 0) - ) + - # Add emmeans points - geom_point( - data = emmeans_temporal_domain, - aes( - x = as.numeric(TEMPORAL_DO) + (as.numeric(DOMAIN) - 2.5) * 0.2, - y = plot_mean, - shape = DOMAIN - ), - size = 3.5, - fill = "white", - stroke = 1.5, - color = "black" - ) + - labs( - x = "TEMPORAL_DO", y = "DGEN Score", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_color_manual(name = "DOMAIN", values = domain_colors) + - scale_fill_manual(name = "DOMAIN", values = domain_colors) + - scale_shape_manual(name = "DOMAIN", values = c(21, 22, 23, 24)) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 0, hjust = 0.5), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right" - ) - -print(interaction_plot_temporal_domain) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006152215.r b/.history/eohi1/mixed anova - DGEN_20251006152215.r deleted file mode 100644 index a180eb5..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006152215.r +++ /dev/null @@ -1,743 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# 
Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
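# -----------------------------------------------------------------------------
# The IQR fences computed in the outlier check below, worked through with
# hypothetical quartiles (not values from exp1.csv): observations outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are counted as outliers.
q1  <- 0.40; q3 <- 0.70                              # hypothetical quartiles
iqr <- q3 - q1                                       # 0.30
c(lower = q1 - 1.5 * iqr, upper = q3 + 1.5 * iqr)    # -0.05 and 1.15
# -----------------------------------------------------------------------------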
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), 
.groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show 
sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") 
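# -----------------------------------------------------------------------------
# The arithmetic behind the sphericity-corrected tests printed above, with
# made-up illustration numbers (not results from exp1.csv): a GG- or
# HF-corrected p-value is the upper tail of the F distribution evaluated at
# epsilon-scaled degrees of freedom, with the F statistic itself unchanged.
f_obs  <- 4.20    # hypothetical F for a within-subjects effect
dfn    <- 3       # uncorrected numerator df (4 DOMAIN levels - 1)
dfd    <- 297     # hypothetical uncorrected denominator df
gg_eps <- 0.85    # hypothetical Greenhouse-Geisser epsilon
pf(f_obs, dfn * gg_eps, dfd * gg_eps, lower.tail = FALSE)  # reproduces p[GG]
# -----------------------------------------------------------------------------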
-temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- 
as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# 
Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × DOMAIN interaction plot using violin plots -interaction_plot_temporal_domain <- ggplot() + - # Violin plots to show distribution - geom_violin( - data = iPlot_temporal_domain, - aes(x = TEMPORAL_DO, y = DGEN_SCORE, fill = DOMAIN), - position = position_dodge(width = 0.9), - alpha = 0.5, - color = "black", - trim = FALSE, - linewidth = 0.5 - ) + - # Add subtle median line - stat_summary( - data = iPlot_temporal_domain, - aes(x = TEMPORAL_DO, y = DGEN_SCORE, group = DOMAIN), - fun = median, - geom = "crossbar", - width = 0.15, - position = position_dodge(width = 0.9), - color = "gray30", - linewidth = 0.3, - fatten = 0 - ) + - # Add emmeans with error bars - geom_errorbar( - data = emmeans_temporal_domain, - aes( - x = as.numeric(TEMPORAL_DO) + (as.numeric(DOMAIN) - 2.5) * 0.225, - ymin = ci_lower, - ymax = ci_upper - ), - width = 0, - linewidth = 1.2, - color = "black" - ) + - # Add emmeans points - geom_point( - data = emmeans_temporal_domain, - aes( - x = as.numeric(TEMPORAL_DO) + (as.numeric(DOMAIN) - 2.5) * 0.225, - y = plot_mean, - shape = DOMAIN - ), - size = 4, - fill = "white", - stroke = 1.2, - color = "black" - ) + - labs( - x = "Temporal Distance Orientation", - y = "DGEN Score", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 12) + - theme( - axis.text.x = element_text(size = 11), - axis.text.y = element_text(size = 10), - axis.title = element_text(size = 12, face = "bold"), - plot.title = element_text(size = 13, hjust = 0.5, face = "bold"), - legend.position = "right", - legend.title = element_text(size = 11, face = "bold"), - legend.text = element_text(size = 10), - panel.grid.minor = element_blank() - ) - -print(interaction_plot_temporal_domain) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006152226.r b/.history/eohi1/mixed anova - DGEN_20251006152226.r deleted file mode 100644 index a180eb5..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006152226.r +++ /dev/null @@ -1,743 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to 
sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. 
Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, 
response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, 
DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- 
emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - 
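# The loop below re-derives Cohen's d from the raw observations rather than from
# the emmeans model: each significant contrast label ("A - B") is split back into
# its two factor levels, the matching raw response vectors are pulled from the
# data, and effsize::cohen.d() computes d with its default pooled-SD denominator,
# labelling the magnitude (negligible / small / medium / large) against the
# conventional |d| cutoffs of 0.2, 0.5, and 0.8.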
if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# 
Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × DOMAIN interaction plot using violin plots -interaction_plot_temporal_domain <- ggplot() + - # Violin plots to show distribution - geom_violin( - data = iPlot_temporal_domain, - aes(x = TEMPORAL_DO, y = DGEN_SCORE, fill = DOMAIN), - position = position_dodge(width = 0.9), - alpha = 0.5, - color = "black", - trim = FALSE, - linewidth = 0.5 - ) + - # Add subtle median line - stat_summary( - data = iPlot_temporal_domain, - aes(x = TEMPORAL_DO, y = DGEN_SCORE, group = DOMAIN), - fun = median, - geom = "crossbar", - width = 0.15, - position = position_dodge(width = 0.9), - color = "gray30", - linewidth = 0.3, - fatten = 0 - ) + - # Add emmeans with error bars - geom_errorbar( - data = emmeans_temporal_domain, - aes( - x = as.numeric(TEMPORAL_DO) + (as.numeric(DOMAIN) - 2.5) * 0.225, - ymin = ci_lower, - ymax = ci_upper - ), - width = 0, - linewidth = 1.2, - color = "black" - ) + - # Add emmeans points - geom_point( - data = emmeans_temporal_domain, - aes( - x = as.numeric(TEMPORAL_DO) + (as.numeric(DOMAIN) - 2.5) * 0.225, - y = plot_mean, - shape = DOMAIN - ), - size = 4, - fill = "white", - stroke = 1.2, - color = "black" - ) + - labs( - x = "Temporal Distance Orientation", - y = "DGEN Score", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 12) + - theme( - axis.text.x = element_text(size = 11), - axis.text.y = element_text(size = 10), - axis.title = element_text(size = 12, face = "bold"), - plot.title = element_text(size = 13, hjust = 0.5, face = "bold"), - legend.position = "right", - legend.title = element_text(size = 11, face = "bold"), - legend.text = element_text(size = 10), - panel.grid.minor = element_blank() - ) - -print(interaction_plot_temporal_domain) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006152851.r b/.history/eohi1/mixed anova - DGEN_20251006152851.r deleted file mode 100644 index f27ad7f..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006152851.r +++ /dev/null @@ -1,738 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to 
sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. 
Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Bootstrap critical values for Hartley's F-max test (resamples within groups) -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once, as plain numeric vectors (single-bracket column - # subsetting would return a one-column tibble here, which breaks sample() below) - group_data_list <- map(groups, ~ { - group_data <- data[[response_var]][data[[group_var]] == .x] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) -
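-# A minimal single-cell sanity check of the bootstrap (sketch only; assumes the
-# Past/Preferences cell exists in long_data_clean, and single_cell/single_check
-# are illustrative names, not part of the original pipeline):
-single_cell <- subset(long_data_clean, TIME == "Past" & DOMAIN == "Preferences")
-set.seed(123)
-single_check <- bootstrap_hartley_critical(single_cell, "TEMPORAL_DO", "DGEN_SCORE", n_iter = 1000)
-cat(sprintf("Observed F-max = %.3f, bootstrap 95%% critical value = %.3f\n",
-            single_check$observed_ratio, single_check$critical_95))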
-# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME,
DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- 
emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - 
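-# The loop below recovers the two group levels by splitting the emmeans
-# contrast label on " - "; this assumes no factor level itself contains
-# " - " (which holds for the TIME, DOMAIN, and TEMPORAL_DO levels used here).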
if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# 
Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × DOMAIN interaction plot - clean line plot with distribution -# Add x position for plotting -emmeans_temporal_domain <- emmeans_temporal_domain %>% - mutate(x_pos = as.numeric(TEMPORAL_DO)) - -interaction_plot_temporal_domain <- ggplot() + - # Distribution layer - half violins - geom_violin( - data = iPlot_temporal_domain, - aes(x = TEMPORAL_DO, y = DGEN_SCORE, fill = DOMAIN, group = interaction(TEMPORAL_DO, DOMAIN)), - position = position_dodge(width = 0.8), - alpha = 0.3, - color = NA, - trim = FALSE, - scale = "width" - ) + - # Connect emmeans with lines to show interaction - geom_line( - data = emmeans_temporal_domain, - aes(x = x_pos, y = plot_mean, color = DOMAIN, group = DOMAIN), - linewidth = 1, - alpha = 0.8 - ) + - # Confidence interval ribbons - geom_ribbon( - data = emmeans_temporal_domain, - aes(x = x_pos, ymin = ci_lower, ymax = ci_upper, fill = DOMAIN, group = DOMAIN), - alpha = 0.2, - color = NA - ) + - # Emmeans points - geom_point( - data = emmeans_temporal_domain, - aes(x = x_pos, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Temporal Distance Orientation", - y = "DGEN Score", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("01PAST", "02FUT") - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal_domain) \ No newline at end of file
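-# afex is loaded above "for aov_ez" but never called. A minimal cross-check
-# sketch of the same mixed design (assumes the long_data_clean layout built
-# earlier; afex_fit is an illustrative name):
-afex_fit <- afex::aov_ez(
-  id = "pID",
-  dv = "DGEN_SCORE",
-  data = long_data_clean,
-  between = "TEMPORAL_DO",
-  within = c("TIME", "DOMAIN"),
-  type = 3
-)
-# anova() on an afex_aov object reports sphericity-corrected tests and ges,
-# which should agree with the ezANOVA table above.
-print(anova(afex_fit, correction = "GG", es = "ges"))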
diff --git a/.history/eohi1/mixed anova - DGEN_20251006152934.r b/.history/eohi1/mixed anova - DGEN_20251006152934.r
deleted file mode 100644
index f27ad7f..0000000
--- a/.history/eohi1/mixed anova - DGEN_20251006152934.r
+++ /dev/null
@@ -1,738 +0,0 @@
-# Mixed ANOVA Analysis for DGEN Variables
-# EOHI Experiment Data Analysis - DGEN Level Analysis
-# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN
-#            futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex) # For aov_ez (cleaner ANOVA output)
-library(nortest) # For normality tests
-library(emmeans) # For post-hoc comparisons
-library(purrr) # For map functions
-library(effsize) # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Verify the specific variables we need
-required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN",
-                   "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-}
-
-# Define domain mapping
-domain_mapping <- data.frame(
-  variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN",
-               "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"),
-  time = c(rep("Past", 4), rep("Future", 4)),
-  domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
-  stringsAsFactors = FALSE
-)
-
-# Efficient data pivoting using pivot_longer
-long_data <- data %>%
-  select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>%
-  pivot_longer(
-    cols = all_of(required_vars),
-    names_to = "variable",
-    values_to = "DGEN_SCORE"
-  ) %>%
-  left_join(domain_mapping, by = "variable") %>%
-  # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping)
-  mutate(
-    TIME = factor(time, levels = c("Past", "Future")),
-    DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")),
-    pID = as.factor(pID),
-    TEMPORAL_DO = as.factor(TEMPORAL_DO)
-  ) %>%
-  # Select final columns and remove any rows with missing values
-  select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>%
-  filter(!is.na(DGEN_SCORE))
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME and DOMAIN
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5),
-    variance = round(var(DGEN_SCORE, na.rm = TRUE), 5),
-    sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5),
-    median = round(median(DGEN_SCORE, na.rm = TRUE), 5),
-    q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5),
-    min = round(min(DGEN_SCORE, na.rm = TRUE), 5),
-    max = round(max(DGEN_SCORE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME and DOMAIN:")
-print(desc_stats)
-
-# Descriptive statistics by between-subjects factors
-desc_stats_by_temporal <- long_data %>%
-  group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5),
-    variance = round(var(DGEN_SCORE, na.rm = TRUE), 5),
-    sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:")
-print(desc_stats_by_temporal)
-
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing
-long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ]
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(DGEN_SCORE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME and DOMAIN:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = mean(DGEN_SCORE),
-    sd = sd(DGEN_SCORE),
-    q1 = quantile(DGEN_SCORE, 0.25),
-    q3 = quantile(DGEN_SCORE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test (streamlined)
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$DGEN_SCORE)$statistic,
-    ad_p_value = ad.test(.data$DGEN_SCORE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# Round only the numeric columns
-normality_results_rounded <- normality_results %>%
-  mutate(across(where(is.numeric), ~ round(.x, 5)))
-print(normality_results_rounded)
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # Calculate F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# More efficient bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!rlang::sym(group_var)) %>%
-    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once
-  group_data_list <- map(groups, ~ {
-    group_data <- data[data[[group_var]] == .x, response_var]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-print("=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
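# --- Editor's note (not part of the deleted script): a minimal, self-contained
# --- sketch of the bootstrap-critical-value idea used above, on simulated data
# --- with illustrative names. Under truly equal variances, the observed F-max
# --- ratio should only rarely exceed the bootstrapped 95th percentile.
set.seed(42)
g1 <- rnorm(50); g2 <- rnorm(50)           # two toy groups, equal variance
boot_fmax <- replicate(1000, {
  v1 <- var(sample(g1, replace = TRUE))    # resample each group independently
  v2 <- var(sample(g2, replace = TRUE))
  max(v1, v2) / min(v1, v2)
})
observed_fmax <- max(var(g1), var(g2)) / min(var(g1), var(g2))
observed_fmax > quantile(boot_fmax, 0.95)  # expected FALSE in this toy case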
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check if design is balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance %in% c(0, 1))) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN combinations")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = DGEN_SCORE,
-                             wid = pID,
-                             between = TEMPORAL_DO,
-                             within = .(TIME, DOMAIN),
-                             type = 3,
-                             detailed = TRUE)
-
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL # Reset row numbers to be sequential
-print(anova_output)
-
-# Show Mauchly's test for sphericity
-print("Mauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print("Greenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  cat("\n=== CORRECTED F-TESTS ===\n")
-  for(i in seq_len(nrow(corrected_df))) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf(" Original: F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  print("\nNote: Sphericity corrections not needed (sphericity assumption met)")
-}
-
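# --- Editor's note (not part of the deleted script): the corrected p-values
# --- above come from shrinking both degrees of freedom by epsilon while the
# --- F statistic itself is unchanged, i.e. p = P(F(eps*DFn, eps*DFd) > F_obs).
# --- Sketch with made-up illustrative numbers, not output from this analysis:
f_obs <- 4.21; dfn <- 3; dfd <- 297; gg_eps <- 0.85
pf(f_obs, dfn * gg_eps, dfd * gg_eps, lower.tail = FALSE)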
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Extract generalized eta squared from ezANOVA (already calculated)
-effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")]
-effect_sizes$ges <- round(effect_sizes$ges, 5)
-print("Generalized Eta Squared:")
-print(effect_sizes)
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output)
-aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                 data = long_data_clean)
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print("Estimated Marginal Means:")
-print(time_emmeans)
-print("\nPairwise Contrasts:")
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
-print("Estimated Marginal Means:")
-print(domain_emmeans)
-print("\nPairwise Contrasts:")
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TEMPORAL_DO × TIME Interaction
-print("\n=== TEMPORAL_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO:")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each TIME:")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# TIME × DOMAIN Interaction
-print("\n=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TIME:")
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-
-print("\nSimple Effects of TIME within each DOMAIN:")
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(time_domain_simple2)
-
-# TEMPORAL_DO × DOMAIN Interaction
-print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===")
-temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN)
-print("Estimated Marginal Means:")
-print(temporal_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:")
-temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:")
-temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(temporal_domain_simple2)
-
-# =============================================================================
-# THREE-WAY INTERACTION ANALYSIS
-# =============================================================================
-
-print("\n=== THREE-WAY INTERACTION ANALYSIS ===")
-three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(three_way_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:")
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni")
-print(three_way_contrasts)
-
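# --- Editor's note (not part of the deleted script): in the pairs() calls
# --- above, `by =` is what turns the omnibus emmeans grid into simple-effects
# --- contrasts computed separately at each level of the conditioning
# --- factor(s). A toy, self-contained illustration with a built-in dataset
# --- (nothing here comes from exp1.csv):
toy_fit <- lm(breaks ~ wool * tension, data = warpbreaks)
toy_emm <- emmeans(toy_fit, ~ wool * tension)
pairs(toy_emm, by = "wool", adjust = "bonferroni")  # tension effects within each wool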
-# =============================================================================
-# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS
-# =============================================================================
-
-# Cohen's d calculations (library already loaded)
-
-print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===")
-
-# Function to calculate Cohen's d for pairwise comparisons
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if(nrow(significant_pairs) > 0) {
-    cat("Significant pairwise comparisons (p < 0.05):\n")
-    print(significant_pairs)
-
-    cat("\nCohen's d calculated from raw data:\n")
-
-    for(i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- as.character(comparison$contrast)
-
-      # Parse the contrast
-      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-      if(length(contrast_parts) == 2) {
-        level1 <- trimws(contrast_parts[1])
-        level2 <- trimws(contrast_parts[2])
-
-        # Get raw data for both conditions
-        if(group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][
-            data[[group1_var]] == level1 &
-            data[[group2_var]] == group2_level]
-
-          data2 <- data[[response_var]][
-            data[[group1_var]] == level2 &
-            data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if(length(data1) > 0 && length(data2) > 0) {
-          # Calculate Cohen's d using effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          cat(sprintf("Comparison: %s", contrast_name))
-          if(group2_var %in% colnames(comparison)) {
-            cat(sprintf(" | %s", group2_level))
-          }
-          cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2)))
-          cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate))
-          cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude))
-          cat(sprintf(" p-value: %.5f\n", comparison$p.value))
-          cat("\n")
-        }
-      }
-    }
-  } else {
-    cat("No significant pairwise comparisons found.\n")
-  }
-}
-
-# =============================================================================
-# 1. TEMPORAL_DO × TIME INTERACTION
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===")
-
-# Get simple effects of TIME within each TEMPORAL_DO
-temporal_time_simple_df <- as.data.frame(temporal_time_simple)
-calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE")
-
-# Get simple effects of TEMPORAL_DO within each TIME
-temporal_time_simple2_df <- as.data.frame(temporal_time_simple2)
-calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE")
-
-# =============================================================================
-# 2. TIME × DOMAIN INTERACTION
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===")
-
-# Get simple effects of TIME within each DOMAIN
-time_domain_simple2_df <- as.data.frame(time_domain_simple2)
-calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE")
-
-# Get simple effects of DOMAIN within each TIME
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE")
-
-# =============================================================================
-# 3. TEMPORAL_DO × DOMAIN INTERACTION
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===")
-
-# Get simple effects of TEMPORAL_DO within each DOMAIN
-temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2)
-calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE")
-
-# Get simple effects of DOMAIN within each TEMPORAL_DO
-temporal_domain_simple_df <- as.data.frame(temporal_domain_simple)
-calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE")
-
-# =============================================================================
-# 4. THREE-WAY INTERACTION COHEN'S D
-# =============================================================================
-
-print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===")
-
-# Get pairwise comparisons for the three-way interaction
-three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-print(three_way_contrasts_df)
-
-# Calculate Cohen's d for significant three-way interaction effects
-print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-
-# Extract significant comparisons (p < 0.05)
-significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ]
-
-if(nrow(significant_three_way) > 0) {
-  for(i in seq_len(nrow(significant_three_way))) {
-    comparison <- significant_three_way[i, ]
-
-    # Extract the grouping variables
-    temporal_do_level <- as.character(comparison$TEMPORAL_DO)
-    domain_level <- as.character(comparison$DOMAIN)
-
-    # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination
-    past_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$TEMPORAL_DO == temporal_do_level &
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$TIME == "Past"
-    ]
-
-    future_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$TEMPORAL_DO == temporal_do_level &
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$TIME == "Future"
-    ]
-
-    if(length(past_data) > 0 && length(future_data) > 0) {
-      # Calculate Cohen's d using effsize package
-      cohens_d_result <- cohen.d(past_data, future_data)
-
-      cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level))
-      cat(sprintf(" Past vs Future comparison\n"))
-      cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data)))
-      cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate))
-      cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude))
-      cat(sprintf(" p-value: %.5f\n", comparison$p.value))
-      cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate))
-      cat("\n")
-    }
-  }
-} else {
-  cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n")
-}
-
-# =============================================================================
-# INTERACTION PLOTS
-# =============================================================================
-
-print("=== INTERACTION PLOTS ===")
-
-# Define color palette for DOMAIN (4 levels)
-domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F",
-                   "Values" = "#FFB000", "Life" = "#FE6100")
-
-# TEMPORAL_DO × DOMAIN INTERACTION PLOT
-# Create estimated marginal means for TEMPORAL_DO × DOMAIN
-emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN)
-
-# Prepare emmeans data frame
-emmeans_temporal_domain <- emm_temporal_domain %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life"))
-  )
-
-# Prepare raw data for plotting
-iPlot_temporal_domain <- long_data_clean %>%
-  dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life"))
-  )
-
-# Create TEMPORAL_DO × DOMAIN interaction plot - clean line plot with distribution
-# Add x position for plotting
-emmeans_temporal_domain <- emmeans_temporal_domain %>%
-  mutate(x_pos = as.numeric(TEMPORAL_DO))
-
-interaction_plot_temporal_domain <- ggplot() +
-  # Distribution layer - half violins
-  # (x mapped to as.numeric(TEMPORAL_DO) so this layer shares the
-  # continuous x scale used by the emmeans lines/ribbons below)
-  geom_violin(
-    data = iPlot_temporal_domain,
-    aes(x = as.numeric(TEMPORAL_DO), y = DGEN_SCORE, fill = DOMAIN, group = interaction(TEMPORAL_DO, DOMAIN)),
-    position = position_dodge(width = 0.8),
-    alpha = 0.3,
-    color = NA,
-    trim = FALSE,
-    scale = "width"
-  ) +
-  # Connect emmeans with lines to show interaction
-  geom_line(
-    data = emmeans_temporal_domain,
-    aes(x = x_pos, y = plot_mean, color = DOMAIN, group = DOMAIN),
-    linewidth = 1,
-    alpha = 0.8
-  ) +
-  # Confidence interval ribbons
-  geom_ribbon(
-    data = emmeans_temporal_domain,
-    aes(x = x_pos, ymin = ci_lower, ymax = ci_upper, fill = DOMAIN, group = DOMAIN),
-    alpha = 0.2,
-    color = NA
-  ) +
-  # Emmeans points
-  geom_point(
-    data = emmeans_temporal_domain,
-    aes(x = x_pos, y = plot_mean, fill = DOMAIN, shape = DOMAIN),
-    size = 4,
-    stroke = 1,
-    color = "black"
-  ) +
-  labs(
-    x = "Temporal Distance Orientation",
-    y = "DGEN Score",
-    title = "TEMPORAL_DO × DOMAIN Interaction"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2),
-    labels = c("01PAST", "02FUT")
-  ) +
-  scale_color_manual(name = "Domain", values = domain_colors) +
-  scale_fill_manual(name = "Domain", values = domain_colors) +
-  scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
-  )
-
-print(interaction_plot_temporal_domain)
\ No newline at end of file
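A note on the plotting pattern used in the scripts above and below: because the emmeans lines, ribbons, and points are drawn at numeric x positions (1 and 2) under scale_x_continuous(), every layer, including the violins, has to map x to a numeric value; supplying the raw factor to one layer would fail with "Discrete value supplied to continuous scale". A minimal standalone sketch of that pattern on toy data (all names here are illustrative, nothing comes from the study data):

library(ggplot2)
toy <- data.frame(
  cond  = factor(rep(c("A", "B"), each = 60)),
  grp   = factor(rep(c("g1", "g2"), times = 60)),
  score = rnorm(120)
)
ggplot(toy, aes(x = as.numeric(cond), y = score, fill = grp,
                group = interaction(cond, grp))) +
  geom_violin(position = position_dodge(width = 0.8), alpha = 0.3) +
  scale_x_continuous(breaks = c(1, 2), labels = levels(toy$cond))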
diff --git a/.history/eohi1/mixed anova - DGEN_20251006153823.r b/.history/eohi1/mixed anova - DGEN_20251006153823.r
deleted file mode 100644
index 7a04c62..0000000
--- a/.history/eohi1/mixed anova - DGEN_20251006153823.r
+++ /dev/null
@@ -1,752 +0,0 @@
-# Mixed ANOVA Analysis for DGEN Variables
-# EOHI Experiment Data Analysis - DGEN Level Analysis
-# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN
-#            futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex) # For aov_ez (cleaner ANOVA output)
-library(nortest) # For normality tests
-library(emmeans) # For post-hoc comparisons
-library(purrr) # For map functions
-library(effsize) # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Verify the specific variables we need
-required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN",
-                   "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-}
-
-# Define domain mapping
-domain_mapping <- data.frame(
-  variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN",
-               "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"),
-  time = c(rep("Past", 4), rep("Future", 4)),
-  domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
-  stringsAsFactors = FALSE
-)
-
-# Efficient data pivoting using pivot_longer
-long_data <- data %>%
-  select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>%
-  pivot_longer(
-    cols = all_of(required_vars),
-    names_to = "variable",
-    values_to = "DGEN_SCORE"
-  ) %>%
-  left_join(domain_mapping, by = "variable") %>%
-  # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping)
-  mutate(
-    TIME = factor(time, levels = c("Past", "Future")),
-    DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")),
-    pID = as.factor(pID),
-    TEMPORAL_DO = as.factor(TEMPORAL_DO)
-  ) %>%
-  # Select final columns and remove any rows with missing values
-  select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>%
-  filter(!is.na(DGEN_SCORE))
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME and DOMAIN
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5),
-    variance = round(var(DGEN_SCORE, na.rm = TRUE), 5),
-    sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5),
-    median = round(median(DGEN_SCORE, na.rm = TRUE), 5),
-    q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5),
-    min = round(min(DGEN_SCORE, na.rm = TRUE), 5),
-    max = round(max(DGEN_SCORE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME and DOMAIN:")
-print(desc_stats)
-
-# Descriptive statistics by between-subjects factors
-desc_stats_by_temporal <- long_data %>%
-  group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5),
-    variance = round(var(DGEN_SCORE, na.rm = TRUE), 5),
-    sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:")
-print(desc_stats_by_temporal)
-
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing
-long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ]
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(DGEN_SCORE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME and DOMAIN:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = mean(DGEN_SCORE),
-    sd = sd(DGEN_SCORE),
-    q1 = quantile(DGEN_SCORE, 0.25),
-    q3 = quantile(DGEN_SCORE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test (streamlined)
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$DGEN_SCORE)$statistic,
-    ad_p_value = ad.test(.data$DGEN_SCORE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# Round only the numeric columns
-normality_results_rounded <- normality_results %>%
-  mutate(across(where(is.numeric), ~ round(.x, 5)))
-print(normality_results_rounded)
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # Calculate F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# More efficient bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!rlang::sym(group_var)) %>%
-    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once
-  group_data_list <- map(groups, ~ {
-    group_data <- data[data[[group_var]] == .x, response_var]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-print("=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check if design is balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance %in% c(0, 1))) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN combinations")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = DGEN_SCORE,
-                             wid = pID,
-                             between = TEMPORAL_DO,
-                             within = .(TIME, DOMAIN),
-                             type = 3,
-                             detailed = TRUE)
-
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL # Reset row numbers to be sequential
-print(anova_output)
-
-# Show Mauchly's test for sphericity
-print("Mauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print("Greenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  cat("\n=== CORRECTED F-TESTS ===\n")
-  for(i in seq_len(nrow(corrected_df))) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf(" Original: F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  print("\nNote: Sphericity corrections not needed (sphericity assumption met)")
-}
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Extract generalized eta squared from ezANOVA (already calculated)
-effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")]
-effect_sizes$ges <- round(effect_sizes$ges, 5)
-print("Generalized Eta Squared:")
-print(effect_sizes)
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output)
-aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                 data = long_data_clean)
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print("Estimated Marginal Means:")
-print(time_emmeans)
-print("\nPairwise Contrasts:")
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
-print("Estimated Marginal Means:")
-print(domain_emmeans)
-print("\nPairwise Contrasts:")
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TEMPORAL_DO × TIME Interaction
-print("\n=== TEMPORAL_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO:")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each TIME:")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# TIME × DOMAIN Interaction
-print("\n=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TIME:")
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-
-print("\nSimple Effects of TIME within each DOMAIN:")
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(time_domain_simple2)
-
-# TEMPORAL_DO × DOMAIN Interaction
-print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===")
-temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN)
-print("Estimated Marginal Means:")
-print(temporal_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:")
-temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:")
-temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(temporal_domain_simple2)
-
-# =============================================================================
-# THREE-WAY INTERACTION ANALYSIS
-# =============================================================================
-
-print("\n=== THREE-WAY INTERACTION ANALYSIS ===")
-three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(three_way_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:")
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni")
-print(three_way_contrasts)
-
-# =============================================================================
-# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS
-# =============================================================================
-
-# Cohen's d calculations (library already loaded)
-
-print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===")
-
-# Function to calculate Cohen's d for pairwise comparisons
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if(nrow(significant_pairs) > 0) {
-    cat("Significant pairwise comparisons (p < 0.05):\n")
-    print(significant_pairs)
-
-    cat("\nCohen's d calculated from raw data:\n")
-
-    for(i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- as.character(comparison$contrast)
-
-      # Parse the contrast
-      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-      if(length(contrast_parts) == 2) {
-        level1 <- trimws(contrast_parts[1])
-        level2 <- trimws(contrast_parts[2])
-
-        # Get raw data for both conditions
-        if(group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][
-            data[[group1_var]] == level1 &
-            data[[group2_var]] == group2_level]
-
-          data2 <- data[[response_var]][
-            data[[group1_var]] == level2 &
-            data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if(length(data1) > 0 && length(data2) > 0) {
-          # Calculate Cohen's d using effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          cat(sprintf("Comparison: %s", contrast_name))
-          if(group2_var %in% colnames(comparison)) {
-            cat(sprintf(" | %s", group2_level))
-          }
-          cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2)))
-          cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate))
-          cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude))
-          cat(sprintf(" p-value: %.5f\n", comparison$p.value))
-          cat("\n")
-        }
-      }
-    }
-  } else {
-    cat("No significant pairwise comparisons found.\n")
-  }
-}
-
-# =============================================================================
-# 1. TEMPORAL_DO × TIME INTERACTION
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===")
-
-# Get simple effects of TIME within each TEMPORAL_DO
-temporal_time_simple_df <- as.data.frame(temporal_time_simple)
-calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE")
-
-# Get simple effects of TEMPORAL_DO within each TIME
-temporal_time_simple2_df <- as.data.frame(temporal_time_simple2)
-calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE")
-
-# =============================================================================
-# 2. TIME × DOMAIN INTERACTION
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===")
-
-# Get simple effects of TIME within each DOMAIN
-time_domain_simple2_df <- as.data.frame(time_domain_simple2)
-calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE")
-
-# Get simple effects of DOMAIN within each TIME
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE")
-
-# =============================================================================
-# 3. TEMPORAL_DO × DOMAIN INTERACTION
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===")
-
-# Get simple effects of TEMPORAL_DO within each DOMAIN
-temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2)
-calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE")
-
-# Get simple effects of DOMAIN within each TEMPORAL_DO
-temporal_domain_simple_df <- as.data.frame(temporal_domain_simple)
-calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE")
-
-# =============================================================================
-# 4. THREE-WAY INTERACTION COHEN'S D
-# =============================================================================
-
-print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===")
-
-# Get pairwise comparisons for the three-way interaction
-three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-print(three_way_contrasts_df)
-
-# Calculate Cohen's d for significant three-way interaction effects
-print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-
-# Extract significant comparisons (p < 0.05)
-significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ]
-
-if(nrow(significant_three_way) > 0) {
-  for(i in seq_len(nrow(significant_three_way))) {
-    comparison <- significant_three_way[i, ]
-
-    # Extract the grouping variables
-    temporal_do_level <- as.character(comparison$TEMPORAL_DO)
-    domain_level <- as.character(comparison$DOMAIN)
-
-    # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination
-    past_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$TEMPORAL_DO == temporal_do_level &
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$TIME == "Past"
-    ]
-
-    future_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$TEMPORAL_DO == temporal_do_level &
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$TIME == "Future"
-    ]
-
-    if(length(past_data) > 0 && length(future_data) > 0) {
-      # Calculate Cohen's d using effsize package
-      cohens_d_result <- cohen.d(past_data, future_data)
-
-      cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level))
-      cat(sprintf(" Past vs Future comparison\n"))
-      cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data)))
-      cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate))
-      cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude))
-      cat(sprintf(" p-value: %.5f\n", comparison$p.value))
-      cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate))
-      cat("\n")
-    }
-  }
-} else {
-  cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n")
-}
-
-# =============================================================================
-# INTERACTION PLOTS
-# =============================================================================
-
-print("=== INTERACTION PLOTS ===")
-
-# Define color palette for DOMAIN (4 levels)
-domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F",
-                   "Values" = "#FFB000", "Life" = "#FE6100")
-
-# TEMPORAL_DO × DOMAIN INTERACTION PLOT
-# Create estimated marginal means for TEMPORAL_DO × DOMAIN
-emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN)
-
-# Prepare emmeans data frame
-emmeans_temporal_domain <- emm_temporal_domain %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life"))
-  )
-
-# Prepare raw data for plotting
-iPlot_temporal_domain <- long_data_clean %>%
-  dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life"))
-  )
-
-# Create TEMPORAL_DO × DOMAIN interaction plot - clean line plot with distribution
-# Recode labels for plotting
-iPlot_temporal_domain <- iPlot_temporal_domain %>%
-  mutate(TEMPORAL_DO_label = factor(
-    TEMPORAL_DO,
-    levels = c("01PAST", "02FUT"),
-    labels = c("Past First", "Future First")
-  ))
-
-emmeans_temporal_domain <- emmeans_temporal_domain %>%
-  mutate(
-    TEMPORAL_DO_label = factor(
-      TEMPORAL_DO,
-      levels = c("01PAST", "02FUT"),
-      labels = c("Past First", "Future First")
-    ),
-    x_pos = as.numeric(TEMPORAL_DO)
-  )
-
-interaction_plot_temporal_domain <- ggplot() +
-  # Distribution layer - half violins
-  # (x mapped to as.numeric(TEMPORAL_DO_label) so this layer shares the
-  # continuous x scale used by the emmeans lines/ribbons below)
-  geom_violin(
-    data = iPlot_temporal_domain,
-    aes(x = as.numeric(TEMPORAL_DO_label), y = DGEN_SCORE, fill = DOMAIN, group = interaction(TEMPORAL_DO_label, DOMAIN)),
-    position = position_dodge(width = 0.8),
-    alpha = 0.3,
-    color = NA,
-    trim = FALSE,
-    scale = "width"
-  ) +
-  # Connect emmeans with lines to show interaction
-  geom_line(
-    data = emmeans_temporal_domain,
-    aes(x = x_pos, y = plot_mean, color = DOMAIN, group = DOMAIN),
-    linewidth = 1,
-    alpha = 0.8
-  ) +
-  # Confidence interval ribbons
-  geom_ribbon(
-    data = emmeans_temporal_domain,
-    aes(x = x_pos, ymin = ci_lower, ymax = ci_upper, fill = DOMAIN, group = DOMAIN),
-    alpha = 0.2,
-    color = NA
-  ) +
-  # Emmeans points
-  geom_point(
-    data = emmeans_temporal_domain,
-    aes(x = x_pos, y = plot_mean, fill = DOMAIN, shape = DOMAIN),
-    size = 4,
-    stroke = 1,
-    color = "black"
-  ) +
-  labs(
-    x = "Order",
-    y = "Mean absolute difference from present",
-    title = "TEMPORAL_DO × DOMAIN Interaction"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2),
-    labels = c("Past First", "Future First")
-  ) +
-  scale_color_manual(name = "Domain", values = domain_colors) +
-  scale_fill_manual(name = "Domain", values = domain_colors) +
-  scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
-  )
-
-print(interaction_plot_temporal_domain)
\ No newline at end of file
diff --git a/.history/eohi1/mixed anova - DGEN_20251006153831.r b/.history/eohi1/mixed anova - DGEN_20251006153831.r
deleted file mode 100644
index 7a04c62..0000000
--- a/.history/eohi1/mixed anova - DGEN_20251006153831.r
+++ /dev/null
@@ -1,752 +0,0 @@
-# Mixed ANOVA Analysis for DGEN Variables
-# EOHI Experiment Data Analysis - DGEN Level Analysis
-# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN
-#            futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex) # For aov_ez (cleaner ANOVA output)
-library(nortest) # For normality tests
-library(emmeans) # For post-hoc comparisons
-library(purrr) # For map functions
-library(effsize) # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Verify the specific variables we need
-required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN",
-                   "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-}
-
-# Define domain mapping
-domain_mapping <- data.frame(
-  variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN",
-               "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"),
-  time = c(rep("Past", 4), rep("Future", 4)),
-  domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
-  stringsAsFactors = FALSE
-)
-
-# Efficient data pivoting using pivot_longer
-long_data <- data %>%
-  select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>%
-  pivot_longer(
-    cols = all_of(required_vars),
-    names_to = "variable",
-    values_to = "DGEN_SCORE"
-  ) %>%
-  left_join(domain_mapping, by = "variable") %>%
-  # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping)
-  mutate(
-    TIME = factor(time, levels = c("Past", "Future")),
-    DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")),
-    pID = as.factor(pID),
-    TEMPORAL_DO = as.factor(TEMPORAL_DO)
-  ) %>%
-  # Select final columns and remove any rows with missing values
-  select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>%
-  filter(!is.na(DGEN_SCORE))
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME and DOMAIN
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5),
-    variance = round(var(DGEN_SCORE, na.rm = TRUE), 5),
-    sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5),
-    median = round(median(DGEN_SCORE, na.rm = TRUE), 5),
-    q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5),
-    min = round(min(DGEN_SCORE, na.rm = TRUE), 5),
-    max = round(max(DGEN_SCORE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME and DOMAIN:")
-print(desc_stats)
-
-# Descriptive statistics by between-subjects factors
-desc_stats_by_temporal <- long_data %>%
-  group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5),
-    variance = round(var(DGEN_SCORE, na.rm = TRUE), 5),
-    sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:")
-print(desc_stats_by_temporal)
-
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing
-long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ]
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(DGEN_SCORE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME and DOMAIN:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = mean(DGEN_SCORE),
-    sd = sd(DGEN_SCORE),
-    q1 = quantile(DGEN_SCORE, 0.25),
-    q3 = quantile(DGEN_SCORE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test (streamlined)
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$DGEN_SCORE)$statistic,
-    ad_p_value = ad.test(.data$DGEN_SCORE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# Round only the numeric columns
-normality_results_rounded <- normality_results %>%
-  mutate(across(where(is.numeric), ~ round(.x, 5)))
-print(normality_results_rounded)
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # Calculate F-max ratio
-
f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections 
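# A caveat for the bootstrap helper above: when `data` arrives as a tibble (as
# it does when the helper is called through pick()), `data[rows, response_var]`
# returns a one-column tibble rather than a numeric vector, so the later
# sample()/is.na() steps can misbehave. A minimal sketch of a per-group
# extraction that always yields a plain vector, assuming the helper's own
# `groups`, `data`, `group_var`, and `response_var`:
group_data_list <- map(groups, function(g) {
  v <- data[[response_var]][data[[group_var]] == g]  # [[ ]] drops to a vector
  v[!is.na(v)]
})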
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") 
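# The corrected F-tests above keep the F statistic unchanged and rescale both
# degrees of freedom by the GG or HF epsilon, so each corrected p-value can be
# reproduced directly from the F distribution. A quick sketch with placeholder
# numbers (illustration only, not results from this data set):
F_val  <- 4.20                 # uncorrected F statistic (placeholder)
df_num <- 3; df_den <- 297     # uncorrected degrees of freedom (placeholders)
eps_gg <- 0.85                 # Greenhouse-Geisser epsilon (placeholder)
p_gg <- pf(F_val, df_num * eps_gg, df_den * eps_gg, lower.tail = FALSE)
p_gg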
-time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - 
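# One caution for this section: effsize::cohen.d() treats its two vectors as
# independent samples by default, but TIME is a within-subjects factor, so the
# Past and Future scores come from the same participants. A sketch of the
# paired variant for a single domain (both vectors must be ordered identically
# by participant for the pairing to hold):
paired_dat <- long_data_clean %>%
  filter(DOMAIN == "Preferences") %>%   # one domain, for illustration only
  arrange(pID, TIME)
past_v <- paired_dat$DGEN_SCORE[paired_dat$TIME == "Past"]
fut_v  <- paired_dat$DGEN_SCORE[paired_dat$TIME == "Future"]
d_paired <- cohen.d(past_v, fut_v, paired = TRUE)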
-print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# 
Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × DOMAIN interaction plot - clean line plot with distribution -# Recode labels for plotting -iPlot_temporal_domain <- iPlot_temporal_domain %>% - mutate(TEMPORAL_DO_label = factor( - TEMPORAL_DO, - levels = c("01PAST", "02FUT"), - labels = c("Past First", "Future First") - )) - -emmeans_temporal_domain <- emmeans_temporal_domain %>% - mutate( - TEMPORAL_DO_label = factor( - TEMPORAL_DO, - levels = c("01PAST", "02FUT"), - labels = c("Past First", "Future First") - ), - x_pos = as.numeric(TEMPORAL_DO) - ) - -interaction_plot_temporal_domain <- ggplot() + - # Distribution layer - half violins - geom_violin( - data = iPlot_temporal_domain, - aes(x = TEMPORAL_DO_label, y = DGEN_SCORE, fill = DOMAIN, group = interaction(TEMPORAL_DO_label, DOMAIN)), - position = position_dodge(width = 0.8), - alpha = 0.3, - color = NA, - trim = FALSE, - scale = "width" - ) + - # Connect emmeans with lines to show interaction - geom_line( - data = emmeans_temporal_domain, - aes(x = x_pos, y = plot_mean, color = DOMAIN, group = DOMAIN), - linewidth = 1, - alpha = 0.8 - ) + - # Confidence interval ribbons - geom_ribbon( - data = emmeans_temporal_domain, - aes(x = x_pos, ymin = ci_lower, ymax = ci_upper, fill = DOMAIN, group = DOMAIN), - alpha = 0.2, - color = NA - ) + - # Emmeans points - geom_point( - data = emmeans_temporal_domain, - aes(x = x_pos, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First") - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal_domain) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006153845.r b/.history/eohi1/mixed anova - DGEN_20251006153845.r deleted file mode 100644 index 7a04c62..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006153845.r +++ /dev/null @@ -1,752 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# 
futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# 
============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - 
f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections 
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") 
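# afex is loaded at the top of this script for aov_ez() but never called. As a
# cross-check on the ezANOVA results above, an equivalent specification would
# look roughly like this (a sketch, assuming the same long_data_clean layout;
# aov_ez applies the Greenhouse-Geisser correction by default):
anova_afex <- aov_ez(
  id = "pID", dv = "DGEN_SCORE", data = long_data_clean,
  between = "TEMPORAL_DO", within = c("TIME", "DOMAIN"),
  type = 3
)
print(anova_afex)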
-time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - 
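# effectsize is likewise loaded above but unused; its cohens_d() returns the
# estimate together with a 95% confidence interval in a tidy table, which can
# complement effsize::cohen.d()'s estimate-plus-magnitude output. A minimal
# sketch on placeholder vectors:
x <- rnorm(50); y <- rnorm(50, mean = 0.3)   # placeholders, not study data
d_tab <- effectsize::cohens_d(x, y)          # Cohen's d with 95% CI
as.data.frame(d_tab)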
-print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# 
Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × DOMAIN interaction plot - clean line plot with distribution -# Recode labels for plotting -iPlot_temporal_domain <- iPlot_temporal_domain %>% - mutate(TEMPORAL_DO_label = factor( - TEMPORAL_DO, - levels = c("01PAST", "02FUT"), - labels = c("Past First", "Future First") - )) - -emmeans_temporal_domain <- emmeans_temporal_domain %>% - mutate( - TEMPORAL_DO_label = factor( - TEMPORAL_DO, - levels = c("01PAST", "02FUT"), - labels = c("Past First", "Future First") - ), - x_pos = as.numeric(TEMPORAL_DO) - ) - -interaction_plot_temporal_domain <- ggplot() + - # Distribution layer - half violins - geom_violin( - data = iPlot_temporal_domain, - aes(x = TEMPORAL_DO_label, y = DGEN_SCORE, fill = DOMAIN, group = interaction(TEMPORAL_DO_label, DOMAIN)), - position = position_dodge(width = 0.8), - alpha = 0.3, - color = NA, - trim = FALSE, - scale = "width" - ) + - # Connect emmeans with lines to show interaction - geom_line( - data = emmeans_temporal_domain, - aes(x = x_pos, y = plot_mean, color = DOMAIN, group = DOMAIN), - linewidth = 1, - alpha = 0.8 - ) + - # Confidence interval ribbons - geom_ribbon( - data = emmeans_temporal_domain, - aes(x = x_pos, ymin = ci_lower, ymax = ci_upper, fill = DOMAIN, group = DOMAIN), - alpha = 0.2, - color = NA - ) + - # Emmeans points - geom_point( - data = emmeans_temporal_domain, - aes(x = x_pos, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First") - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal_domain) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006154345.r b/.history/eohi1/mixed anova - DGEN_20251006154345.r deleted file mode 100644 index 8d31c47..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006154345.r +++ /dev/null @@ -1,742 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# 
futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# 
============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - 
f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# Bootstrap function for Hartley's F-max critical values: group data are
-# extracted once and storage is pre-allocated, so each iteration only
-# resamples and recomputes variances
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!rlang::sym(group_var)) %>%
-    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once; [[ ]] extraction returns a plain vector even when
-  # 'data' is a tibble ([rows, col] would return a one-column tibble and
-  # break sample() below)
-  group_data_list <- map(groups, ~ {
-    group_data <- data[[response_var]][data[[group_var]] == .x]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-print("=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check that the design is complete: every participant should contribute exactly
-# one observation to each TIME × DOMAIN cell (a zero count would mean a missing cell)
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance == 1)) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN combinations")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
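# A compact illustration of the bootstrap Hartley's F-max logic from the
# preceding section, run on simulated groups (the rnorm draws and sample sizes
# are hypothetical, not the EOHI data). Each group is resampled independently,
# and the observed F-max is compared against the 95th percentile of the
# resampled F-max ratios, mirroring bootstrap_hartley_critical() above.
set.seed(42)
g1 <- rnorm(50, sd = 1)    # group with unit variance
g2 <- rnorm(50, sd = 1.6)  # group with inflated variance
observed_fmax <- max(var(g1), var(g2)) / min(var(g1), var(g2))

boot_fmax <- replicate(1000, {
  v1 <- var(sample(g1, replace = TRUE))  # resample within group 1
  v2 <- var(sample(g2, replace = TRUE))  # resample within group 2
  max(v1, v2) / min(v1, v2)
})
critical_95 <- quantile(boot_fmax, 0.95)
cat("Observed F-max:", round(observed_fmax, 3),
    "| bootstrap 95% critical value:", round(critical_95, 3), "\n")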
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") 
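# The epsilon-correction arithmetic used in the corrected-df block above, shown
# with made-up numbers (dfn, dfd, f_value, and gg_epsilon below are hypothetical).
# Greenhouse-Geisser and Huynh-Feldt corrections multiply both df terms by
# epsilon; the F statistic itself is unchanged and the p-value is re-evaluated
# on the corrected df.
dfn <- 3; dfd <- 297   # hypothetical uncorrected df for a 4-level within factor
f_value <- 4.21        # hypothetical F statistic
gg_epsilon <- 0.88     # hypothetical GG epsilon
p_gg <- pf(f_value, dfn * gg_epsilon, dfd * gg_epsilon, lower.tail = FALSE)
cat(sprintf("GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
            dfn * gg_epsilon, dfd * gg_epsilon, f_value, p_gg))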
-time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - 
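# A sketch of what the cohen.d() calls below compute, on hypothetical vectors
# (a and b are simulated, not EOHI conditions). For two independent groups,
# Cohen's d is the mean difference divided by the pooled standard deviation;
# effsize::cohen.d() returns the same estimate plus a magnitude label.
library(effsize)
set.seed(7)
a <- rnorm(40, mean = 0.0, sd = 1)
b <- rnorm(40, mean = 0.5, sd = 1)
pooled_sd <- sqrt(((length(a) - 1) * var(a) + (length(b) - 1) * var(b)) /
                    (length(a) + length(b) - 2))
d_manual <- (mean(a) - mean(b)) / pooled_sd
d_pkg <- cohen.d(a, b)$estimate
cat("manual d:", round(d_manual, 5), "| effsize d:", round(d_pkg, 5), "\n")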
-print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# 
Prepare emmeans data frame
-emmeans_temporal_domain <- emm_temporal_domain %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life"))
-  )
-
-# Prepare raw data for plotting
-iPlot_temporal_domain <- long_data_clean %>%
-  dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life"))
-  )
-
-# Create TEMPORAL_DO × DOMAIN interaction plot - clean line plot with distribution
-# Convert to numeric x-axis for all layers
-iPlot_temporal_domain <- iPlot_temporal_domain %>%
-  mutate(x_pos = as.numeric(TEMPORAL_DO))
-
-emmeans_temporal_domain <- emmeans_temporal_domain %>%
-  mutate(x_pos = as.numeric(TEMPORAL_DO))
-
-interaction_plot_temporal_domain <- ggplot() +
-  # Distribution layer - violins
-  geom_violin(
-    data = iPlot_temporal_domain,
-    aes(x = x_pos, y = DGEN_SCORE, fill = DOMAIN, group = interaction(x_pos, DOMAIN)),
-    position = position_dodge(width = 0.15),
-    alpha = 0.3,
-    color = NA,
-    trim = FALSE,
-    scale = "width"
-  ) +
-  # Connect emmeans with lines to show interaction
-  geom_line(
-    data = emmeans_temporal_domain,
-    aes(x = x_pos, y = plot_mean, color = DOMAIN, group = DOMAIN),
-    linewidth = 1,
-    alpha = 0.8
-  ) +
-  # Confidence interval ribbons
-  geom_ribbon(
-    data = emmeans_temporal_domain,
-    aes(x = x_pos, ymin = ci_lower, ymax = ci_upper, fill = DOMAIN, group = DOMAIN),
-    alpha = 0.2,
-    color = NA
-  ) +
-  # Emmeans points
-  geom_point(
-    data = emmeans_temporal_domain,
-    aes(x = x_pos, y = plot_mean, fill = DOMAIN, shape = DOMAIN),
-    size = 4,
-    stroke = 1,
-    color = "black"
-  ) +
-  labs(
-    x = "Order",
-    y = "Mean absolute difference from present",
-    title = "TEMPORAL_DO × DOMAIN Interaction"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2),
-    labels = c("Past First", "Future First"),
-    limits = c(0.5, 2.5)
-  ) +
-  scale_color_manual(name = "Domain", values = domain_colors) +
-  scale_fill_manual(name = "Domain", values = domain_colors) +
-  scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
-  )
-
-print(interaction_plot_temporal_domain)
\ No newline at end of file
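# A minimal, hypothetical demo of the design choice in the plot code above
# (toy data, not the EOHI variables): mixing a discrete x (factor) in one
# layer with a continuous x in another makes ggplot2 error with "Discrete
# value supplied to continuous scale", so every layer maps as.numeric(factor)
# and the labels are restored via scale_x_continuous().
library(ggplot2)
set.seed(1)
toy <- data.frame(
  grp   = factor(rep(c("A", "B"), each = 20)),
  value = c(rnorm(20, 1), rnorm(20, 2))
)
toy$x_pos <- as.numeric(toy$grp)                       # numeric positions 1 and 2
means <- aggregate(value ~ x_pos, data = toy, FUN = mean)

ggplot() +
  geom_violin(data = toy, aes(x = x_pos, y = value, group = x_pos), alpha = 0.3) +
  geom_line(data = means, aes(x = x_pos, y = value)) +
  geom_point(data = means, aes(x = x_pos, y = value), size = 3) +
  scale_x_continuous(breaks = c(1, 2), labels = c("A", "B"), limits = c(0.5, 2.5))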
-library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- 
long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test 
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - 
wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN 
-print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs 
<- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# 
Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × DOMAIN interaction plot - clean line plot with distribution -# Convert to numeric x-axis for all layers -iPlot_temporal_domain <- iPlot_temporal_domain %>% - mutate(x_pos = as.numeric(TEMPORAL_DO)) - -emmeans_temporal_domain <- emmeans_temporal_domain %>% - mutate(x_pos = as.numeric(TEMPORAL_DO)) - -interaction_plot_temporal_domain <- ggplot() + - # Distribution layer - violins - geom_violin( - data = iPlot_temporal_domain, - aes(x = x_pos, y = DGEN_SCORE, fill = DOMAIN, group = interaction(x_pos, DOMAIN)), - position = position_dodge(width = 0.15), - alpha = 0.3, - color = NA, - trim = FALSE, - scale = "width" - ) + - # Connect emmeans with lines to show interaction - geom_line( - data = emmeans_temporal_domain, - aes(x = x_pos, y = plot_mean, color = DOMAIN, group = DOMAIN), - linewidth = 1, - alpha = 0.8 - ) + - # Confidence interval ribbons - geom_ribbon( - data = emmeans_temporal_domain, - aes(x = x_pos, ymin = ci_lower, ymax = ci_upper, fill = DOMAIN, group = DOMAIN), - alpha = 0.2, - color = NA - ) + - # Emmeans points - geom_point( - data = emmeans_temporal_domain, - aes(x = x_pos, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal_domain) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006154403.r b/.history/eohi1/mixed anova - DGEN_20251006154403.r deleted file mode 100644 index 8d31c47..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006154403.r +++ /dev/null @@ -1,742 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests 
-library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- 
long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test 
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!rlang::sym(group_var)) %>%
-    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once, extracting the response column as a plain vector
-  # (robust whether `data` arrives as a data.frame or as a tibble from pick())
-  group_data_list <- map(groups, ~ {
-    group_data <- data[[response_var]][data[[group_var]] == .x]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-print("=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check if design is balanced (balanced = exactly one observation per pID × TIME × DOMAIN cell)
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance == 1)) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN combinations")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = DGEN_SCORE,
-
wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN 
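# --- Illustrative sketch (not part of the original script) ---
# The corrected degrees of freedom computed above are just the uncorrected df
# scaled by the sphericity epsilon; F itself is unchanged, so only the df and
# the resulting p-value shift. With made-up numbers for illustration:
eps_gg <- 0.90                               # hypothetical Greenhouse-Geisser epsilon
c(DFn = 3 * eps_gg, DFd = 891 * eps_gg)      # hypothetical effect with df = 3, 891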
-print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs 
<- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# 
Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × DOMAIN interaction plot - clean line plot with distribution -# Convert to numeric x-axis for all layers -iPlot_temporal_domain <- iPlot_temporal_domain %>% - mutate(x_pos = as.numeric(TEMPORAL_DO)) - -emmeans_temporal_domain <- emmeans_temporal_domain %>% - mutate(x_pos = as.numeric(TEMPORAL_DO)) - -interaction_plot_temporal_domain <- ggplot() + - # Distribution layer - violins - geom_violin( - data = iPlot_temporal_domain, - aes(x = x_pos, y = DGEN_SCORE, fill = DOMAIN, group = interaction(x_pos, DOMAIN)), - position = position_dodge(width = 0.15), - alpha = 0.3, - color = NA, - trim = FALSE, - scale = "width" - ) + - # Connect emmeans with lines to show interaction - geom_line( - data = emmeans_temporal_domain, - aes(x = x_pos, y = plot_mean, color = DOMAIN, group = DOMAIN), - linewidth = 1, - alpha = 0.8 - ) + - # Confidence interval ribbons - geom_ribbon( - data = emmeans_temporal_domain, - aes(x = x_pos, ymin = ci_lower, ymax = ci_upper, fill = DOMAIN, group = DOMAIN), - alpha = 0.2, - color = NA - ) + - # Emmeans points - geom_point( - data = emmeans_temporal_domain, - aes(x = x_pos, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal_domain) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006155006.r b/.history/eohi1/mixed anova - DGEN_20251006155006.r deleted file mode 100644 index 1a260ef..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006155006.r +++ /dev/null @@ -1,756 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests 
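# --- Illustrative sketch (not part of the original script) ---
# effsize::cohen.d, used for the pairwise effect sizes in these scripts,
# standardizes a mean difference by the pooled SD:
#   d = (m1 - m2) / s_pooled,  s_pooled^2 = ((n1-1)*s1^2 + (n2-1)*s2^2) / (n1+n2-2)
# A toy check with arbitrary values:
x <- c(5, 6, 7, 8)
y <- c(3, 4, 5, 6)
s_pooled <- sqrt(((length(x) - 1) * var(x) + (length(y) - 1) * var(y)) /
                   (length(x) + length(y) - 2))
(mean(x) - mean(y)) / s_pooled  # should agree with effsize::cohen.d(x, y)$estimate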
-library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- 
long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test 
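# --- Illustrative sketch (not part of the original script) ---
# The grouped bootstrap call further below relies on dplyr::pick() (dplyr >= 1.1)
# to hand each TIME × DOMAIN subset to the function as a small data frame.
# The same pattern in miniature, with toy names:
toy <- data.frame(g = c("a", "a", "b"), v = c(1, 2, 3))
toy %>%
  group_by(g) %>%
  summarise(sub = list(pick(v)), .groups = "drop")
# `sub` is a list-column holding one small data frame per group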
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!rlang::sym(group_var)) %>%
-    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once, extracting the response column as a plain vector
-  # (robust whether `data` arrives as a data.frame or as a tibble from pick())
-  group_data_list <- map(groups, ~ {
-    group_data <- data[[response_var]][data[[group_var]] == .x]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-print("=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check if design is balanced (balanced = exactly one observation per pID × TIME × DOMAIN cell)
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance == 1)) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN combinations")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = DGEN_SCORE,
-
wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN 
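# --- Illustrative sketch (not part of the original script) ---
# pairs() on an emmeans grid forms all pairwise contrasts, and
# adjust = "bonferroni" multiplies each p-value by the number of comparisons
# (capped at 1). A standalone toy example with an ordinary lm fit:
toy_df <- data.frame(y = c(1, 2, 3, 2, 3, 4, 5, 6, 7),
                     g = gl(3, 3, labels = c("a", "b", "c")))
pairs(emmeans(lm(y ~ g, data = toy_df), ~ g), adjust = "bonferroni")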
-print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs 
<- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# 
Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × DOMAIN interaction plot - clean line plot with distribution -# Convert to numeric x-axis and add position offsets for dodging -dodge_width <- 0.6 -iPlot_temporal_domain <- iPlot_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -emmeans_temporal_domain <- emmeans_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -interaction_plot_temporal_domain <- ggplot() + - # Distribution layer - violins (completely separated) - geom_violin( - data = iPlot_temporal_domain, - aes(x = x_dodged, y = DGEN_SCORE, fill = DOMAIN, group = interaction(x_pos, DOMAIN)), - alpha = 0.3, - color = "black", - linewidth = 0.3, - trim = FALSE, - scale = "width", - width = dodge_width / 4 - ) + - # Connect emmeans with lines to show interaction - geom_line( - data = emmeans_temporal_domain, - aes(x = x_dodged, y = plot_mean, color = DOMAIN, group = DOMAIN), - linewidth = 1, - alpha = 0.8 - ) + - # Confidence interval ribbons - geom_ribbon( - data = emmeans_temporal_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, fill = DOMAIN, group = DOMAIN), - alpha = 0.2, - color = NA - ) + - # Emmeans points - geom_point( - data = emmeans_temporal_domain, - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(0, 10), - breaks = seq(0, 10, 2) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal_domain) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006155018.r b/.history/eohi1/mixed anova - DGEN_20251006155018.r deleted file mode 100644 index 1a260ef..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006155018.r +++ /dev/null @@ -1,756 +0,0 @@ -# Mixed ANOVA 
Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' 
- ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = 
TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS 
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check if design is balanced: every participant should contribute exactly one
-# observation to each TIME × DOMAIN cell
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance == 1)) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN combinations")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = DGEN_SCORE,
-                             wid = pID,
-                             between = TEMPORAL_DO,
-                             within = .(TIME, DOMAIN),
-                             type = 3,
-                             detailed = TRUE)
-
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL # Reset row numbers to be sequential
-print(anova_output)
-
-# Show Mauchly's test for sphericity
-print("Mauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print("Greenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  # Match each corrected effect to its row in the ANOVA table once
-  idx <- match(sphericity_corr$Effect, anova_table$Effect)
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[idx],
-    Original_DFd = anova_table$DFd[idx],
-    GG_DFn = anova_table$DFn[idx] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[idx] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[idx] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[idx] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  cat("\n=== CORRECTED F-TESTS ===\n")
-  for(i in seq_len(nrow(corrected_df))) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[idx[i]]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original:     F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  print("\nNote: Sphericity corrections not needed (sphericity assumption met)")
-}
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Extract generalized eta squared from ezANOVA (already calculated)
-effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")]
-effect_sizes$ges <- round(effect_sizes$ges, 5)
-print("Generalized Eta Squared:")
-print(effect_sizes)
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Create aov model for emmeans (emmeans requires an aov object, not ezANOVA output)
-aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN +
Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# 
============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. 
TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - 
cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × DOMAIN interaction plot - clean line plot with distribution -# Convert to numeric x-axis and add position offsets for dodging -dodge_width <- 0.6 -iPlot_temporal_domain <- iPlot_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -emmeans_temporal_domain <- emmeans_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -interaction_plot_temporal_domain <- ggplot() + - # Distribution layer - violins (completely separated) - geom_violin( - data = iPlot_temporal_domain, - aes(x = x_dodged, y = DGEN_SCORE, fill = DOMAIN, group = interaction(x_pos, DOMAIN)), - alpha = 0.3, - color = "black", - linewidth = 0.3, - trim = FALSE, - scale = "width", - width = dodge_width / 4 - ) + - # Connect emmeans with lines to show interaction - geom_line( - data = emmeans_temporal_domain, - aes(x = x_dodged, y = plot_mean, color = DOMAIN, group = DOMAIN), - linewidth = 1, - alpha = 0.8 - ) + - # Confidence interval ribbons - geom_ribbon( - data = emmeans_temporal_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, fill = DOMAIN, group = DOMAIN), - alpha = 0.2, - color = NA - ) + - # Emmeans points - geom_point( - data = emmeans_temporal_domain, - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(0, 10), - breaks = seq(0, 10, 2) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = 
element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal_domain) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006155049.r b/.history/eohi1/mixed anova - DGEN_20251006155049.r deleted file mode 100644 index 1a260ef..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006155049.r +++ /dev/null @@ -1,756 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = 
round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & 
!is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - 
HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION 
===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = 
%d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × 
DOMAIN interaction plot - clean line plot with distribution -# Convert to numeric x-axis and add position offsets for dodging -dodge_width <- 0.6 -iPlot_temporal_domain <- iPlot_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -emmeans_temporal_domain <- emmeans_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -interaction_plot_temporal_domain <- ggplot() + - # Distribution layer - violins (completely separated) - geom_violin( - data = iPlot_temporal_domain, - aes(x = x_dodged, y = DGEN_SCORE, fill = DOMAIN, group = interaction(x_pos, DOMAIN)), - alpha = 0.3, - color = "black", - linewidth = 0.3, - trim = FALSE, - scale = "width", - width = dodge_width / 4 - ) + - # Connect emmeans with lines to show interaction - geom_line( - data = emmeans_temporal_domain, - aes(x = x_dodged, y = plot_mean, color = DOMAIN, group = DOMAIN), - linewidth = 1, - alpha = 0.8 - ) + - # Confidence interval ribbons - geom_ribbon( - data = emmeans_temporal_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, fill = DOMAIN, group = DOMAIN), - alpha = 0.2, - color = NA - ) + - # Emmeans points - geom_point( - data = emmeans_temporal_domain, - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(0, 10), - breaks = seq(0, 10, 2) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal_domain) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006155243.r b/.history/eohi1/mixed anova - DGEN_20251006155243.r deleted file mode 100644 index e1bc255..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006155243.r +++ /dev/null @@ -1,748 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", 
"contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), 
.groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show 
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-print("=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check the design: each participant should have exactly one observation
-# per TIME × DOMAIN cell
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance == 1)) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN combinations")
-} else if(all(design_balance %in% c(0, 1))) {
-  print("Warning: Design is unbalanced (some participants are missing TIME × DOMAIN cells)")
-  print(summary(as.vector(design_balance)))
-} else {
-  print("Warning: duplicate observations within at least one TIME × DOMAIN cell")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = DGEN_SCORE,
-                             wid = pID,
-                             between = TEMPORAL_DO,
-                             within = .(TIME, DOMAIN),
-                             type = 3,
-                             detailed = TRUE)
-
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL # Reset row numbers to be sequential
-print(anova_output)
-
-# Show Mauchly's test for sphericity
-print("Mauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print("Greenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  cat("\n=== CORRECTED F-TESTS ===\n")
-  for(i in seq_len(nrow(corrected_df))) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original:     F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  print("Note: Sphericity corrections not needed (sphericity assumption met)")
-}
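The corrected degrees of freedom above are simply the uncorrected ones multiplied by the relevant epsilon (the GGe or HFe column from ezANOVA). A toy calculation with made-up numbers, for intuition only:

# Illustrative only: epsilon-scaling of df for a hypothetical within-subjects effect
gg_epsilon <- 0.82          # hypothetical Greenhouse-Geisser epsilon
df_num <- 3; df_den <- 297  # hypothetical uncorrected df
c(df_num, df_den) * gg_epsilon  # corrected df: 2.46 and 243.54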
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Extract generalized eta squared from ezANOVA (already calculated)
-effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")]
-effect_sizes$ges <- round(effect_sizes$ges, 5)
-print("Generalized Eta Squared:")
-print(effect_sizes)
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Create aov model for emmeans (emmeans requires an aov object, not ezANOVA output)
-aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                 data = long_data_clean)
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print("Estimated Marginal Means:")
-print(time_emmeans)
-print("\nPairwise Contrasts:")
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
-print("Estimated Marginal Means:")
-print(domain_emmeans)
-print("\nPairwise Contrasts:")
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
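For reference, adjust = "bonferroni" in these pairs() calls multiplies each raw p-value by the number of contrasts in the family and caps it at 1. A toy check with made-up p-values (m = 6 is the number of pairwise contrasts among the four DOMAIN levels):

# Illustrative only: Bonferroni adjustment for m = 6 contrasts
p_raw <- c(0.004, 0.020, 0.300)
pmin(1, p_raw * 6)                             # 0.024, 0.120, 1.000
p.adjust(p_raw, method = "bonferroni", n = 6)  # same result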
-
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TEMPORAL_DO × TIME Interaction
-print("\n=== TEMPORAL_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO:")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each TIME:")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# TIME × DOMAIN Interaction
-print("\n=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TIME:")
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-
-print("\nSimple Effects of TIME within each DOMAIN:")
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(time_domain_simple2)
-
-# TEMPORAL_DO × DOMAIN Interaction
-print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===")
-temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN)
-print("Estimated Marginal Means:")
-print(temporal_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:")
-temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:")
-temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(temporal_domain_simple2)
-
-# =============================================================================
-# THREE-WAY INTERACTION ANALYSIS
-# =============================================================================
-
-print("\n=== THREE-WAY INTERACTION ANALYSIS ===")
-three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(three_way_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:")
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni")
-print(three_way_contrasts)
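The by = argument in pairs() splits the contrasts into separate families, one per level (or level combination) of the conditioning factor, and the Bonferroni family size shrinks accordingly. A self-contained toy illustration with made-up data, not the study variables:

# Illustrative only: `by` slices contrasts into families per level of t
library(emmeans)
set.seed(7)
toy <- expand.grid(g = c("a", "b"), t = c("x", "y"), rep = 1:10)
toy$y <- rnorm(nrow(toy))
fit <- lm(y ~ g * t, data = toy)
emm <- emmeans(fit, ~ g * t)
pairs(emm, by = "t")  # g comparisons within each level of t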
-
-# =============================================================================
-# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS
-# =============================================================================
-
-# Cohen's d calculations (effsize library already loaded)
-
-print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===")
-
-# Function to calculate Cohen's d for pairwise comparisons
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if(nrow(significant_pairs) > 0) {
-    cat("Significant pairwise comparisons (p < 0.05):\n")
-    print(significant_pairs)
-
-    cat("\nCohen's d calculated from raw data:\n")
-
-    for(i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- as.character(comparison$contrast)
-
-      # Parse the contrast label, e.g. "Past - Future"
-      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-      if(length(contrast_parts) == 2) {
-        level1 <- trimws(contrast_parts[1])
-        level2 <- trimws(contrast_parts[2])
-
-        # Get raw data for both conditions
-        if(group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][
-            data[[group1_var]] == level1 &
-            data[[group2_var]] == group2_level]
-
-          data2 <- data[[response_var]][
-            data[[group1_var]] == level2 &
-            data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if(length(data1) > 0 && length(data2) > 0) {
-          # Calculate Cohen's d using the effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          cat(sprintf("Comparison: %s", contrast_name))
-          if(group2_var %in% colnames(comparison)) {
-            cat(sprintf(" | %s", group2_level))
-          }
-          cat(sprintf("\n  n1 = %d, n2 = %d\n", length(data1), length(data2)))
-          cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-          cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-          cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-          cat("\n")
-        }
-      }
-    }
-  } else {
-    cat("No significant pairwise comparisons found.\n")
-  }
-}
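Cohen's d as computed here is the mean difference scaled by the pooled standard deviation. A toy check of effsize::cohen.d against the hand-rolled formula, using made-up vectors (effsize's default is the pooled-SD d without Hedges correction, if I recall its defaults correctly):

# Illustrative only: d = (mean1 - mean2) / pooled SD
library(effsize)
set.seed(1)
a <- rnorm(40, mean = 5.0); b <- rnorm(40, mean = 4.5)
cohen.d(a, b)$estimate
s_pooled <- sqrt((39 * var(a) + 39 * var(b)) / 78)  # (n1 - 1) + (n2 - 1) = 78
(mean(a) - mean(b)) / s_pooled                      # should match the line above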
-
-# =============================================================================
-# 1. TEMPORAL_DO × TIME INTERACTION
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===")
-
-# Get simple effects of TIME within each TEMPORAL_DO
-temporal_time_simple_df <- as.data.frame(temporal_time_simple)
-calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE")
-
-# Get simple effects of TEMPORAL_DO within each TIME
-temporal_time_simple2_df <- as.data.frame(temporal_time_simple2)
-calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE")
-
-# =============================================================================
-# 2. TIME × DOMAIN INTERACTION
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===")
-
-# Get simple effects of TIME within each DOMAIN
-time_domain_simple2_df <- as.data.frame(time_domain_simple2)
-calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE")
-
-# Get simple effects of DOMAIN within each TIME
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE")
-
-# =============================================================================
-# 3. TEMPORAL_DO × DOMAIN INTERACTION
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===")
-
-# Get simple effects of TEMPORAL_DO within each DOMAIN
-temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2)
-calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE")
-
-# Get simple effects of DOMAIN within each TEMPORAL_DO
-temporal_domain_simple_df <- as.data.frame(temporal_domain_simple)
-calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE")
-
-# =============================================================================
-# 4. THREE-WAY INTERACTION COHEN'S D
-# =============================================================================
-
-print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===")
-
-# Get pairwise comparisons for the three-way interaction
-three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-print(three_way_contrasts_df)
-
-# Calculate Cohen's d for significant three-way interaction effects
-print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-
-# Extract significant comparisons (p < 0.05)
-significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ]
-
-if(nrow(significant_three_way) > 0) {
-  for(i in seq_len(nrow(significant_three_way))) {
-    comparison <- significant_three_way[i, ]
-
-    # Extract the grouping variables
-    temporal_do_level <- as.character(comparison$TEMPORAL_DO)
-    domain_level <- as.character(comparison$DOMAIN)
-
-    # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination
-    past_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$TEMPORAL_DO == temporal_do_level &
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$TIME == "Past"
-    ]
-
-    future_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$TEMPORAL_DO == temporal_do_level &
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$TIME == "Future"
-    ]
-
-    if(length(past_data) > 0 && length(future_data) > 0) {
-      # Calculate Cohen's d using the effsize package
-      cohens_d_result <- cohen.d(past_data, future_data)
-
-      cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level))
-      cat("  Past vs Future comparison\n")
-      cat(sprintf("  n_Past = %d, n_Future = %d\n", length(past_data), length(future_data)))
-      cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-      cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-      cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-      cat(sprintf("  Estimated difference: %.5f\n", comparison$estimate))
-      cat("\n")
-    }
-  }
-} else {
-  cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n")
-}
-
-# =============================================================================
-# INTERACTION PLOTS
-# =============================================================================
-
-print("=== INTERACTION PLOTS ===")
-
-# Define color palette for DOMAIN (4 levels)
-domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F",
-                   "Values" = "#FFB000", "Life" = "#FE6100")
-
-# TEMPORAL_DO × DOMAIN INTERACTION PLOT
-# Create estimated marginal means for TEMPORAL_DO × DOMAIN
-emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN)
-
-# Prepare emmeans data frame
-emmeans_temporal_domain <- emm_temporal_domain %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life"))
-  )
-
-# Prepare raw data for plotting
-iPlot_temporal_domain <- long_data_clean %>%
-  dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life"))
-  )
-
-# Create TEMPORAL_DO × DOMAIN interaction plot: violins for the raw scores,
-# with emmeans points and confidence-interval ribbons layered on top
-# Convert to numeric x-axis and add position offsets for dodging
-dodge_width <- 0.6
-iPlot_temporal_domain <- iPlot_temporal_domain %>%
-  mutate(
-    x_pos = as.numeric(TEMPORAL_DO),
-    domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4),
-    x_dodged = x_pos + domain_offset
-  )
-
-emmeans_temporal_domain <- emmeans_temporal_domain %>%
-  mutate(
-    x_pos = as.numeric(TEMPORAL_DO),
-    domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4),
-    x_dodged = x_pos + domain_offset
-  )
-
-interaction_plot_temporal_domain <- ggplot() +
-  # Distribution layer - violins (completely separated)
-  geom_violin(
-    data = iPlot_temporal_domain,
-    aes(x = x_dodged, y = DGEN_SCORE, fill = DOMAIN, group = interaction(x_pos, DOMAIN)),
-    alpha = 0.4,
-    color = NA,
-    trim = FALSE,
-    scale = "width",
-    width = dodge_width / 4
-  ) +
-  # Confidence interval ribbons
-  geom_ribbon(
-    data = emmeans_temporal_domain,
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, fill = DOMAIN, group = DOMAIN),
-    alpha = 0.2,
-    color = NA
-  ) +
-  # Emmeans points
-  geom_point(
-    data = emmeans_temporal_domain,
-    aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN),
-    size = 4,
-    stroke = 1,
-    color = "black"
-  ) +
-  labs(
-    x = "Order",
-    y = "Mean absolute difference from present",
-    title = "TEMPORAL_DO × DOMAIN Interaction"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2),
-    labels = c("Past First", "Future First"),
-    limits = c(0.4, 2.6)
-  ) +
-  scale_y_continuous(
-    limits = c(0, 10),
-    breaks = seq(0, 10, 2)
-  ) +
-  scale_color_manual(name = "Domain", values = domain_colors) +
-  scale_fill_manual(name = "Domain", values = domain_colors) +
-  scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
-  )
-
-print(interaction_plot_temporal_domain)
\ No newline at end of file
diff --git a/.history/eohi1/mixed anova - DGEN_20251006155250.r b/.history/eohi1/mixed anova - DGEN_20251006155250.r
deleted file mode 100644
index e1bc255..0000000
--- a/.history/eohi1/mixed anova - DGEN_20251006155250.r
+++ /dev/null
@@ -1,748 +0,0 @@
diff --git a/.history/eohi1/mixed anova - DGEN_20251006155253.r b/.history/eohi1/mixed anova - DGEN_20251006155253.r
deleted file mode 100644
index e1bc255..0000000
--- a/.history/eohi1/mixed anova - DGEN_20251006155253.r
+++ /dev/null
@@ -1,748 +0,0 @@
for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# 
Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × DOMAIN interaction plot - clean line plot with distribution -# Convert to numeric x-axis and add position offsets for dodging -dodge_width <- 0.6 -iPlot_temporal_domain <- iPlot_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -emmeans_temporal_domain <- emmeans_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -interaction_plot_temporal_domain <- ggplot() + - # Distribution layer - violins (completely separated) - geom_violin( - data = iPlot_temporal_domain, - aes(x = x_dodged, y = DGEN_SCORE, fill = DOMAIN, group = interaction(x_pos, DOMAIN)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width / 4 - ) + - # Confidence interval ribbons - geom_ribbon( - data = emmeans_temporal_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, fill = DOMAIN, group = DOMAIN), - alpha = 0.2, - color = NA - ) + - # Emmeans points - geom_point( - data = emmeans_temporal_domain, - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(0, 10), - breaks = seq(0, 10, 2) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal_domain) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006155346.r b/.history/eohi1/mixed anova - DGEN_20251006155346.r deleted file mode 100644 index 628d1d4..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006155346.r +++ /dev/null @@ -1,749 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required 
libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# 
============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, 
past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once, extracting the response column as a plain vector - # ([[ keeps a vector even when `data` is a tibble produced by pick()) - group_data_list <- map(groups, ~ { - group_data <- data[[response_var]][data[[group_var]] == .x] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in seq_len(n_iter)) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check that the design is complete and balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance == 1)) { - print("Design is balanced: each participant has exactly one observation for every TIME × DOMAIN combination") -} else { - print("Warning: Design is unbalanced (missing or duplicated cells)") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × 
DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) 
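# A minimal sketch of the emmeans workflow this script relies on, run on toy
# data (the data frame and the names 'g' and 'y' are hypothetical, not study
# variables): fit an aov, request marginal means, then ask for
# Bonferroni-adjusted pairwise contrasts, the same pattern as the calls below.
set.seed(1)
toy <- data.frame(g = factor(rep(c("a", "b", "c"), each = 4)), y = rnorm(12))
toy_emm <- emmeans(aov(y ~ g, data = toy), ~ g)  # emmeans is loaded above
pairs(toy_emm, adjust = "bonferroni")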
-print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d 
for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# 
Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × DOMAIN interaction plot - clean line plot with distribution -# Convert to numeric x-axis and add position offsets for dodging -dodge_width <- 0.6 -iPlot_temporal_domain <- iPlot_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -emmeans_temporal_domain <- emmeans_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -interaction_plot_temporal_domain <- ggplot() + - # Distribution layer - violins (completely separated) - geom_violin( - data = iPlot_temporal_domain, - aes(x = x_dodged, y = DGEN_SCORE, fill = DOMAIN, group = interaction(x_pos, DOMAIN)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width / 4 - ) + - # Emmeans error bars - geom_errorbar( - data = emmeans_temporal_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - # Emmeans points - geom_point( - data = emmeans_temporal_domain, - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(0, 10), - breaks = seq(0, 10, 2) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal_domain) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006155349.r b/.history/eohi1/mixed anova - DGEN_20251006155349.r deleted file mode 100644 index 628d1d4..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006155349.r +++ /dev/null @@ -1,749 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries 
-library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# 
============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, 
past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once, extracting the response column as a plain vector - # ([[ keeps a vector even when `data` is a tibble produced by pick()) - group_data_list <- map(groups, ~ { - group_data <- data[[response_var]][data[[group_var]] == .x] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in seq_len(n_iter)) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check that the design is complete and balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance == 1)) { - print("Design is balanced: each participant has exactly one observation for every TIME × DOMAIN combination") -} else { - print("Warning: Design is unbalanced (missing or duplicated cells)") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × 
DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) 
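# A minimal sketch of the Bonferroni adjustment requested via
# adjust = "bonferroni" in the contrasts below: each raw p-value is multiplied
# by the number of comparisons and capped at 1. The p-values here are made up
# for illustration.
raw_p <- c(0.010, 0.030, 0.200)
pmin(raw_p * length(raw_p), 1)           # by hand: 0.03 0.09 0.60
p.adjust(raw_p, method = "bonferroni")   # same result with base R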
diff --git a/.history/eohi1/mixed anova - DGEN_20251006155353.r b/.history/eohi1/mixed anova - DGEN_20251006155353.r deleted file mode 100644 index 628d1d4..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006155353.r +++ /dev/null @@ -1,749 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries
-library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# 
============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, 
past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Bootstrap critical values for Hartley's F-max test (resamples each group independently) -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once, extracted as plain vectors (tibble[rows, col] subsetting would keep a one-column tibble and break sample()) - group_data_list <- map(groups, ~ { - group_data <- data[[response_var]][data[[group_var]] == .x] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced (every participant contributes exactly one observation per TIME × DOMAIN cell) -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance == 1)) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × 
DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) 
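-# Note on the TIME contrasts below: TIME has only two levels, so pairs() yields a single Past - Future contrast and the Bonferroni correction is a no-op there (p_adj = min(1, m * p) with m = 1); it only bites for factors with more contrasts, e.g. DOMAIN's 6 pairwise comparisons. A minimal cross-check, assuming time_emmeans as computed above (unadjusted_time_contrasts is an illustrative name, not from the original script): -unadjusted_time_contrasts <- summary(pairs(time_emmeans), adjust = "none") -print(unadjusted_time_contrasts)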
-print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d 
for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# 
Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × DOMAIN interaction plot - clean line plot with distribution -# Convert to numeric x-axis and add position offsets for dodging -dodge_width <- 0.6 -iPlot_temporal_domain <- iPlot_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -emmeans_temporal_domain <- emmeans_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -interaction_plot_temporal_domain <- ggplot() + - # Distribution layer - violins (completely separated) - geom_violin( - data = iPlot_temporal_domain, - aes(x = x_dodged, y = DGEN_SCORE, fill = DOMAIN, group = interaction(x_pos, DOMAIN)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width / 4 - ) + - # Emmeans error bars - geom_errorbar( - data = emmeans_temporal_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - # Emmeans points - geom_point( - data = emmeans_temporal_domain, - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(0, 10), - breaks = seq(0, 10, 2) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal_domain) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006155709.r b/.history/eohi1/mixed anova - DGEN_20251006155709.r deleted file mode 100644 index 628d1d4..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006155709.r +++ /dev/null @@ -1,749 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries 
diff --git a/.history/eohi1/mixed anova - DGEN_20251006172448.r b/.history/eohi1/mixed anova - DGEN_20251006172448.r deleted file mode 100644 index a719016..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006172448.r +++ /dev/null @@ -1,818 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries
-library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# 
============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, 
past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × 
DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) 
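# Aside: a minimal, self-contained sketch (hypothetical p-values, not output
# from this script) of what adjust = "bonferroni" does inside pairs(): each
# raw p-value is multiplied by the number of pairwise comparisons and capped
# at 1, so k factor levels yield k*(k-1)/2 comparisons.
k <- 4                                     # e.g., the four DOMAIN levels
n_comparisons <- k * (k - 1) / 2           # 6 pairwise contrasts
raw_p <- c(0.004, 0.02, 0.3)               # hypothetical unadjusted p-values
pmin(raw_p * n_comparisons, 1)             # manual Bonferroni: 0.024 0.120 1.000
p.adjust(raw_p, method = "bonferroni", n = n_comparisons)  # same adjustment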
-print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d 
for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# 
Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × DOMAIN interaction plot - clean line plot with distribution -# Convert to numeric x-axis and add position offsets for dodging -dodge_width <- 0.6 -iPlot_temporal_domain <- iPlot_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -emmeans_temporal_domain <- emmeans_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -interaction_plot_temporal_domain <- ggplot() + - # Distribution layer - violins (completely separated) - geom_violin( - data = iPlot_temporal_domain, - aes(x = x_dodged, y = DGEN_SCORE, fill = DOMAIN, group = interaction(x_pos, DOMAIN)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width / 4 - ) + - # Emmeans error bars - geom_errorbar( - data = emmeans_temporal_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - # Emmeans points - geom_point( - data = emmeans_temporal_domain, - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(0, 10), - breaks = seq(0, 10, 2) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal_domain) - -# ============================================================================= -# EMMEANS-ONLY PLOT: TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -# Create fresh emmeans data for emmeans-only plot -emm_temporal_domain_simple <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -emmeans_temporal_domain_simple <- emm_temporal_domain_simple %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - 
ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -# Create emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_domain_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = DOMAIN), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(0, 10), - breaks = seq(0, 10, 2) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006172451.r b/.history/eohi1/mixed anova - DGEN_20251006172451.r deleted file mode 100644 index a719016..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006172451.r +++ /dev/null @@ -1,818 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - 
domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == 
.x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = 
sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) 
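# Aside: a compact, self-contained sketch (hypothetical numbers, not results
# from exp1.csv) of the arithmetic behind the sphericity-corrected F-tests
# printed above: the correction rescales both degrees of freedom by the
# estimated epsilon, and the corrected p-value then follows from pf().
f_value <- 5.20                    # hypothetical uncorrected F statistic
dfn <- 3; dfd <- 297               # hypothetical uncorrected df (e.g., DOMAIN)
gg_epsilon <- 0.85                 # hypothetical Greenhouse-Geisser epsilon
pf(f_value, dfn * gg_epsilon, dfd * gg_epsilon, lower.tail = FALSE)  # GG-corrected p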
-print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - 
data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × 
DOMAIN interaction plot - clean line plot with distribution -# Convert to numeric x-axis and add position offsets for dodging -dodge_width <- 0.6 -iPlot_temporal_domain <- iPlot_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -emmeans_temporal_domain <- emmeans_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -interaction_plot_temporal_domain <- ggplot() + - # Distribution layer - violins (completely separated) - geom_violin( - data = iPlot_temporal_domain, - aes(x = x_dodged, y = DGEN_SCORE, fill = DOMAIN, group = interaction(x_pos, DOMAIN)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width / 4 - ) + - # Emmeans error bars - geom_errorbar( - data = emmeans_temporal_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - # Emmeans points - geom_point( - data = emmeans_temporal_domain, - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(0, 10), - breaks = seq(0, 10, 2) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal_domain) - -# ============================================================================= -# EMMEANS-ONLY PLOT: TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -# Create fresh emmeans data for emmeans-only plot -emm_temporal_domain_simple <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -emmeans_temporal_domain_simple <- emm_temporal_domain_simple %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -# Create emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_domain_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = DOMAIN), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - 
y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(0, 10), - breaks = seq(0, 10, 2) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006172503.r b/.history/eohi1/mixed anova - DGEN_20251006172503.r deleted file mode 100644 index a719016..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006172503.r +++ /dev/null @@ -1,818 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with 
missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & 
!is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - 
HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION 
===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = 
%d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × 
DOMAIN interaction plot - clean line plot with distribution -# Convert to numeric x-axis and add position offsets for dodging -dodge_width <- 0.6 -iPlot_temporal_domain <- iPlot_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -emmeans_temporal_domain <- emmeans_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -interaction_plot_temporal_domain <- ggplot() + - # Distribution layer - violins (completely separated) - geom_violin( - data = iPlot_temporal_domain, - aes(x = x_dodged, y = DGEN_SCORE, fill = DOMAIN, group = interaction(x_pos, DOMAIN)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width / 4 - ) + - # Emmeans error bars - geom_errorbar( - data = emmeans_temporal_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - # Emmeans points - geom_point( - data = emmeans_temporal_domain, - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(0, 10), - breaks = seq(0, 10, 2) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal_domain) - -# ============================================================================= -# EMMEANS-ONLY PLOT: TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -# Create fresh emmeans data for emmeans-only plot -emm_temporal_domain_simple <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -emmeans_temporal_domain_simple <- emm_temporal_domain_simple %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -# Create emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_domain_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = DOMAIN), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - 
y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(0, 10), - breaks = seq(0, 10, 2) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006172600.r b/.history/eohi1/mixed anova - DGEN_20251006172600.r deleted file mode 100644 index 15f16ea..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006172600.r +++ /dev/null @@ -1,818 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with 
missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & 
!is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - 
HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION 
===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = 
%d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. 
-# =============================================================================
-# 1. TEMPORAL_DO × TIME INTERACTION
-# =============================================================================
-
-cat("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===\n")
-
-# Get simple effects of TIME within each TEMPORAL_DO
-temporal_time_simple_df <- as.data.frame(temporal_time_simple)
-calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE")
-
-# Get simple effects of TEMPORAL_DO within each TIME
-temporal_time_simple2_df <- as.data.frame(temporal_time_simple2)
-calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE")
-
-# =============================================================================
-# 2. TIME × DOMAIN INTERACTION
-# =============================================================================
-
-cat("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===\n")
-
-# Get simple effects of TIME within each DOMAIN
-time_domain_simple2_df <- as.data.frame(time_domain_simple2)
-calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE")
-
-# Get simple effects of DOMAIN within each TIME
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE")
-
-# =============================================================================
-# 3. TEMPORAL_DO × DOMAIN INTERACTION
-# =============================================================================
-
-cat("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===\n")
-
-# Get simple effects of TEMPORAL_DO within each DOMAIN
-temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2)
-calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE")
-
-# Get simple effects of DOMAIN within each TEMPORAL_DO
-temporal_domain_simple_df <- as.data.frame(temporal_domain_simple)
-calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE")
-
-# =============================================================================
-# 4. THREE-WAY INTERACTION COHEN'S D
-# =============================================================================
-
-cat("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===\n")
-
-# Get pairwise comparisons for the three-way interaction
-three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-print(three_way_contrasts_df)
-
-# Calculate Cohen's d for significant three-way interaction effects
-cat("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:\n")
-
-# Extract significant comparisons (p < 0.05)
-significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ]
-
-if(nrow(significant_three_way) > 0) {
-  for(i in seq_len(nrow(significant_three_way))) {
-    comparison <- significant_three_way[i, ]
-
-    # Extract the grouping variables
-    temporal_do_level <- as.character(comparison$TEMPORAL_DO)
-    domain_level <- as.character(comparison$DOMAIN)
-
-    # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination
-    past_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$TEMPORAL_DO == temporal_do_level &
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$TIME == "Past"
-    ]
-
-    future_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$TEMPORAL_DO == temporal_do_level &
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$TIME == "Future"
-    ]
-
-    if(length(past_data) > 0 && length(future_data) > 0) {
-      # Calculate Cohen's d using effsize package
-      cohens_d_result <- cohen.d(past_data, future_data)
-
-      cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level))
-      cat(sprintf("  Past vs Future comparison\n"))
-      cat(sprintf("  n_Past = %d, n_Future = %d\n", length(past_data), length(future_data)))
-      cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-      cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-      cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-      cat(sprintf("  Estimated difference: %.5f\n", comparison$estimate))
-      cat("\n")
-    }
-  }
-} else {
-  cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n")
-}
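## [Editorial sketch, not part of the original script] A quick consistency
## check: for an independent-groups contrast, Cohen's d can be recovered from
## the t-ratio reported by emmeans as d = t * sqrt(1/n1 + 1/n2). The helper
## below is illustrative only; note this approximation does not apply to the
## paired (within-subject) TIME contrasts above.
d_from_t <- function(t_ratio, n1, n2) t_ratio * sqrt(1 / n1 + 1 / n2)
# e.g. a t-ratio of 2.5 with two groups of 60 gives d of roughly 0.46:
print(d_from_t(2.5, 60, 60))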
-# =============================================================================
-# INTERACTION PLOTS
-# =============================================================================
-
-print("=== INTERACTION PLOTS ===")
-
-# Define color palette for DOMAIN (4 levels)
-domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F",
-                   "Values" = "#FFB000", "Life" = "#FE6100")
-
-# TEMPORAL_DO × DOMAIN INTERACTION PLOT
-# Create estimated marginal means for TEMPORAL_DO × DOMAIN
-emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN)
-
-# Prepare emmeans data frame
-emmeans_temporal_domain <- emm_temporal_domain %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life"))
-  )
-
-# Prepare raw data for plotting
-iPlot_temporal_domain <- long_data_clean %>%
-  dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life"))
-  )
-
-# Create TEMPORAL_DO × DOMAIN interaction plot - clean line plot with distribution
-# Convert to numeric x-axis and add position offsets for dodging
-dodge_width <- 0.6
-iPlot_temporal_domain <- iPlot_temporal_domain %>%
-  mutate(
-    x_pos = as.numeric(TEMPORAL_DO),
-    domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4),
-    x_dodged = x_pos + domain_offset
-  )
-
-emmeans_temporal_domain <- emmeans_temporal_domain %>%
-  mutate(
-    x_pos = as.numeric(TEMPORAL_DO),
-    domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4),
-    x_dodged = x_pos + domain_offset
-  )
-
-interaction_plot_temporal_domain <- ggplot() +
-  # Distribution layer - violins (completely separated)
-  geom_violin(
-    data = iPlot_temporal_domain,
-    aes(x = x_dodged, y = DGEN_SCORE, fill = DOMAIN, group = interaction(x_pos, DOMAIN)),
-    alpha = 0.4,
-    color = NA,
-    trim = FALSE,
-    scale = "width",
-    width = dodge_width / 4
-  ) +
-  # Emmeans error bars
-  geom_errorbar(
-    data = emmeans_temporal_domain,
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper),
-    width = 0.08,
-    linewidth = 0.8,
-    color = "black"
-  ) +
-  # Emmeans points
-  geom_point(
-    data = emmeans_temporal_domain,
-    aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN),
-    size = 4,
-    stroke = 1,
-    color = "black"
-  ) +
-  labs(
-    x = "Order",
-    y = "Mean absolute difference from present",
-    title = "TEMPORAL_DO × DOMAIN Interaction"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2),
-    labels = c("Past First", "Future First"),
-    limits = c(0.4, 2.6)
-  ) +
-  scale_y_continuous(
-    limits = c(0, 10),
-    breaks = seq(0, 10, 2)
-  ) +
-  scale_color_manual(name = "Domain", values = domain_colors) +
-  scale_fill_manual(name = "Domain", values = domain_colors) +
-  scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
-  )
-
-print(interaction_plot_temporal_domain)
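## [Editorial sketch, not part of the original script] Neither plot is
## written to disk in this script; a minimal export step with
## ggplot2::ggsave (the file name and dimensions are placeholders, and the
## emmeans-only plot below could be saved the same way):
ggsave("temporal_domain_violin.png", interaction_plot_temporal_domain,
       width = 8, height = 5, dpi = 300)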
Direction", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(0, 10), - breaks = seq(0, 10, 2) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006172604.r b/.history/eohi1/mixed anova - DGEN_20251006172604.r deleted file mode 100644 index 15f16ea..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006172604.r +++ /dev/null @@ -1,818 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any 
Direction", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(0, 10), - breaks = seq(0, 10, 2) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006172636.r b/.history/eohi1/mixed anova - DGEN_20251006172636.r deleted file mode 100644 index a719016..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006172636.r +++ /dev/null @@ -1,818 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any 
[Editor's note: the remainder of this hunk duplicates the preceding snapshot line for line; the repeated code (descriptive statistics, assumption tests, bootstrap Hartley's F-max, mixed ANOVA with sphericity corrections, effect sizes, post-hoc comparisons, and Cohen's d sections 1-3) is omitted, and the original text resumes below at section 4.]
-# =============================================================================
-# 4.
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × 
DOMAIN interaction plot - clean line plot with distribution -# Convert to numeric x-axis and add position offsets for dodging -dodge_width <- 0.6 -iPlot_temporal_domain <- iPlot_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -emmeans_temporal_domain <- emmeans_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -interaction_plot_temporal_domain <- ggplot() + - # Distribution layer - violins (completely separated) - geom_violin( - data = iPlot_temporal_domain, - aes(x = x_dodged, y = DGEN_SCORE, fill = DOMAIN, group = interaction(x_pos, DOMAIN)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width / 4 - ) + - # Emmeans error bars - geom_errorbar( - data = emmeans_temporal_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - # Emmeans points - geom_point( - data = emmeans_temporal_domain, - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(0, 10), - breaks = seq(0, 10, 2) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal_domain) - -# ============================================================================= -# EMMEANS-ONLY PLOT: TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -# Create fresh emmeans data for emmeans-only plot -emm_temporal_domain_simple <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -emmeans_temporal_domain_simple <- emm_temporal_domain_simple %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -# Create emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_domain_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = DOMAIN), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - 
y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(0, 10), - breaks = seq(0, 10, 2) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006181939.r b/.history/eohi1/mixed anova - DGEN_20251006181939.r deleted file mode 100644 index 83e455e..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006181939.r +++ /dev/null @@ -1,818 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with 
y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(2, 6), - breaks = seq(0, 10, 2) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006191125.r b/.history/eohi1/mixed anova - DGEN_20251006191125.r deleted file mode 100644 index faa9bbc..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006191125.r +++ /dev/null @@ -1,818 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with 
missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & 
!is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - 
HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION 
===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = 
%d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × 
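# -----------------------------------------------------------------------------
# NOTE (reviewer sketch): emmeans objects also carry a built-in ggplot method
# that draws the same means and confidence intervals without hand-building a
# data frame, e.g. plot(emm_temporal_domain) + ggplot2::theme_minimal().
# The manual route below is kept for control over dodging and violin layers.
# -----------------------------------------------------------------------------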
DOMAIN interaction plot - clean line plot with distribution -# Convert to numeric x-axis and add position offsets for dodging -dodge_width <- 0.6 -iPlot_temporal_domain <- iPlot_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -emmeans_temporal_domain <- emmeans_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -interaction_plot_temporal_domain <- ggplot() + - # Distribution layer - violins (completely separated) - geom_violin( - data = iPlot_temporal_domain, - aes(x = x_dodged, y = DGEN_SCORE, fill = DOMAIN, group = interaction(x_pos, DOMAIN)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width / 4 - ) + - # Emmeans error bars - geom_errorbar( - data = emmeans_temporal_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - # Emmeans points - geom_point( - data = emmeans_temporal_domain, - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(0, 10), - breaks = seq(0, 10, 2) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal_domain) - -# ============================================================================= -# EMMEANS-ONLY PLOT: TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -# Create fresh emmeans data for emmeans-only plot -emm_temporal_domain_simple <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -emmeans_temporal_domain_simple <- emm_temporal_domain_simple %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -# Create emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_domain_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = DOMAIN), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - 
y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(3, 6), - breaks = seq(0, 10, 1) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006200121.r b/.history/eohi1/mixed anova - DGEN_20251006200121.r deleted file mode 100644 index 5d999c0..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006200121.r +++ /dev/null @@ -1,892 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with 
missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & 
!is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - 
HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION 
===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = 
%d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × 
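# -----------------------------------------------------------------------------
# NOTE (reviewer sketch): the manual numeric x-offset arithmetic in the
# plotting code below reproduces by hand what ggplot2's position_dodge()
# gives for free on a factor axis, e.g.
#   ggplot(iPlot_temporal_domain,
#          aes(x = TEMPORAL_DO, y = DGEN_SCORE, fill = DOMAIN)) +
#     geom_violin(position = position_dodge(width = 0.6), alpha = 0.4)
# The explicit offsets are retained because the emmeans point and errorbar
# layers come from a second data frame and must align exactly with the
# violins.
# -----------------------------------------------------------------------------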
DOMAIN interaction plot - clean line plot with distribution -# Convert to numeric x-axis and add position offsets for dodging -dodge_width <- 0.6 -iPlot_temporal_domain <- iPlot_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -emmeans_temporal_domain <- emmeans_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -interaction_plot_temporal_domain <- ggplot() + - # Distribution layer - violins (completely separated) - geom_violin( - data = iPlot_temporal_domain, - aes(x = x_dodged, y = DGEN_SCORE, fill = DOMAIN, group = interaction(x_pos, DOMAIN)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width / 4 - ) + - # Emmeans error bars - geom_errorbar( - data = emmeans_temporal_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - # Emmeans points - geom_point( - data = emmeans_temporal_domain, - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(0, 10), - breaks = seq(0, 10, 2) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal_domain) - -# ============================================================================= -# EMMEANS-ONLY PLOT: TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -# Create fresh emmeans data for emmeans-only plot -emm_temporal_domain_simple <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -emmeans_temporal_domain_simple <- emm_temporal_domain_simple %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -# Create emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_domain_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = DOMAIN), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - 
y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(3, 6), - breaks = seq(0, 10, 1) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================================= -# 3-WAY INTERACTION PLOT: TEMPORAL_DO × TIME × DOMAIN (Emmeans only) -# ============================================================================= - -print("=== 3-WAY INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for TEMPORAL_DO × TIME × DOMAIN -emm_3way <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_3way <- emm_3way %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~DOMAIN, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - 
DGEN_20251006200138.r b/.history/eohi1/mixed anova - DGEN_20251006200138.r deleted file mode 100644 index 5d999c0..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006200138.r +++ /dev/null @@ -1,892 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - 
group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
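# -----------------------------------------------------------------------------
# NOTE (reviewer sketch): the leveneTest() calls in the section below pass a
# bare formula inside summarise() and rely on dplyr's data mask to resolve
# DGEN_SCORE and TIME; passing the group's data explicitly via pick()
# (dplyr >= 1.1) is more robust and halves the number of calls:
levene_by_domain <- long_data_clean %>%
  group_by(DOMAIN) %>%
  summarise(
    lev = list(car::leveneTest(DGEN_SCORE ~ TIME, data = pick(everything()))),
    .groups = "drop"
  ) %>%
  mutate(
    levene_F = map_dbl(lev, ~ .x$`F value`[1]),
    levene_p = map_dbl(lev, ~ .x$`Pr(>F)`[1])
  ) %>%
  select(-lev)
# -----------------------------------------------------------------------------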
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & 
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check the design: all counts in {0, 1} means no participant contributes more
-# than one observation per TIME × DOMAIN cell (zeros would still indicate
-# missing cells, so this is a duplicate check as much as a balance check)
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance %in% c(0, 1))) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN combinations")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = DGEN_SCORE,
-                             wid = pID,
-                             between = TEMPORAL_DO,
-                             within = .(TIME, DOMAIN),
-                             type = 3,
-                             detailed = TRUE)
-
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL # Reset row numbers to be sequential
-print(anova_output)
-
-# Show Mauchly's test for sphericity
-print("Mauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print("Greenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION 
===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = 
%d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. 
-# =============================================================================
-# 4. THREE-WAY INTERACTION COHEN'S D
-# =============================================================================
-
-print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===")
-
-# Get pairwise comparisons for the three-way interaction
-three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-print(three_way_contrasts_df)
-
-# Calculate Cohen's d for significant three-way interaction effects
-print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-
-# Extract significant comparisons (p < 0.05)
-significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ]
-
-if(nrow(significant_three_way) > 0) {
-  for(i in seq_len(nrow(significant_three_way))) {
-    comparison <- significant_three_way[i, ]
-
-    # Extract the grouping variables
-    temporal_do_level <- as.character(comparison$TEMPORAL_DO)
-    domain_level <- as.character(comparison$DOMAIN)
-
-    # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination
-    past_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$TEMPORAL_DO == temporal_do_level &
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$TIME == "Past"
-    ]
-
-    future_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$TEMPORAL_DO == temporal_do_level &
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$TIME == "Future"
-    ]
-
-    if(length(past_data) > 0 && length(future_data) > 0) {
-      # Calculate Cohen's d using effsize package
-      cohens_d_result <- cohen.d(past_data, future_data)
-
-      cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level))
-      cat("  Past vs Future comparison\n")
-      cat(sprintf("  n_Past = %d, n_Future = %d\n", length(past_data), length(future_data)))
-      cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-      cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-      cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-      cat(sprintf("  Estimated difference: %.5f\n", comparison$estimate))
-      cat("\n")
-    }
-  }
-} else {
-  cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n")
-}
-
-# =============================================================================
-# INTERACTION PLOTS
-# =============================================================================
-
-print("=== INTERACTION PLOTS ===")
-
-# Define color palette for DOMAIN (4 levels)
-domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F",
-                   "Values" = "#FFB000", "Life" = "#FE6100")
-
-# TEMPORAL_DO × DOMAIN INTERACTION PLOT
-# Create estimated marginal means for TEMPORAL_DO × DOMAIN
-emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN)
-
-# Prepare emmeans data frame
-emmeans_temporal_domain <- emm_temporal_domain %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life"))
-  )
-
-# Prepare raw data for plotting
-iPlot_temporal_domain <- long_data_clean %>%
-  dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life"))
-  )
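
The plotting code that follows dodges violins and points by hand (a numeric x axis plus per-domain offsets), which gives exact control over the alignment of raw-data and emmeans layers. For comparison, a sketch of the same layout using ggplot2's built-in dodging on a discrete axis (ggplot2 >= 3.3 assumed); note that stat_summary() here would show raw cell means, not the model's estimated marginal means, which is presumably why the snapshot builds the emmeans layer manually:

pd <- position_dodge(width = 0.6)
ggplot(iPlot_temporal_domain, aes(x = TEMPORAL_DO, y = DGEN_SCORE, fill = DOMAIN)) +
  geom_violin(position = pd, width = 0.55, alpha = 0.4, color = NA, trim = FALSE) +
  stat_summary(fun = mean, geom = "point", position = pd,
               shape = 21, size = 3, color = "black") +
  scale_fill_manual(name = "Domain", values = domain_colors) +
  theme_minimal(base_size = 13)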
-# Create TEMPORAL_DO × DOMAIN interaction plot - clean line plot with distribution
-# Convert to numeric x-axis and add position offsets for dodging
-dodge_width <- 0.6
-iPlot_temporal_domain <- iPlot_temporal_domain %>%
-  mutate(
-    x_pos = as.numeric(TEMPORAL_DO),
-    domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4),
-    x_dodged = x_pos + domain_offset
-  )
-
-emmeans_temporal_domain <- emmeans_temporal_domain %>%
-  mutate(
-    x_pos = as.numeric(TEMPORAL_DO),
-    domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4),
-    x_dodged = x_pos + domain_offset
-  )
-
-interaction_plot_temporal_domain <- ggplot() +
-  # Distribution layer - violins (completely separated)
-  geom_violin(
-    data = iPlot_temporal_domain,
-    aes(x = x_dodged, y = DGEN_SCORE, fill = DOMAIN, group = interaction(x_pos, DOMAIN)),
-    alpha = 0.4,
-    color = NA,
-    trim = FALSE,
-    scale = "width",
-    width = dodge_width / 4
-  ) +
-  # Emmeans error bars
-  geom_errorbar(
-    data = emmeans_temporal_domain,
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper),
-    width = 0.08,
-    linewidth = 0.8,
-    color = "black"
-  ) +
-  # Emmeans points
-  geom_point(
-    data = emmeans_temporal_domain,
-    aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN),
-    size = 4,
-    stroke = 1,
-    color = "black"
-  ) +
-  labs(
-    x = "Order",
-    y = "Mean absolute difference from present",
-    title = "TEMPORAL_DO × DOMAIN Interaction"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2),
-    labels = c("Past First", "Future First"),
-    limits = c(0.4, 2.6)
-  ) +
-  scale_y_continuous(
-    limits = c(0, 10),
-    breaks = seq(0, 10, 2)
-  ) +
-  scale_color_manual(name = "Domain", values = domain_colors) +
-  scale_fill_manual(name = "Domain", values = domain_colors) +
-  scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
-  )
-
-print(interaction_plot_temporal_domain)
-
-# =============================================================================
-# EMMEANS-ONLY PLOT: TEMPORAL_DO × DOMAIN INTERACTION
-# =============================================================================
-
-# Create fresh emmeans data for emmeans-only plot
-emm_temporal_domain_simple <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN)
-
-emmeans_temporal_domain_simple <- emm_temporal_domain_simple %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")),
-    x_pos = as.numeric(TEMPORAL_DO),
-    domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4),
-    x_dodged = x_pos + domain_offset
-  )
-
-# Create emmeans-only plot
-interaction_plot_emmeans_only <- ggplot(emmeans_temporal_domain_simple) +
-  geom_errorbar(
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = DOMAIN),
-    width = 0.1,
-    linewidth = 1,
-    alpha = 0.8
-  ) +
-  geom_point(
-    aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN),
-    size = 5,
-    stroke = 1.2,
-    color = "black"
-  ) +
-  labs(
-    x = "Order",
y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(3, 6), - breaks = seq(0, 10, 1) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================================= -# 3-WAY INTERACTION PLOT: TEMPORAL_DO × TIME × DOMAIN (Emmeans only) -# ============================================================================= - -print("=== 3-WAY INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for TEMPORAL_DO × TIME × DOMAIN -emm_3way <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_3way <- emm_3way %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~DOMAIN, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - 
DGEN_20251006200320.r b/.history/eohi1/mixed anova - DGEN_20251006200320.r deleted file mode 100644 index faa9bbc..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006200320.r +++ /dev/null @@ -1,818 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - 
group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & 
!is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - 
HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION 
===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = 
%d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × 
DOMAIN interaction plot - clean line plot with distribution -# Convert to numeric x-axis and add position offsets for dodging -dodge_width <- 0.6 -iPlot_temporal_domain <- iPlot_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -emmeans_temporal_domain <- emmeans_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -interaction_plot_temporal_domain <- ggplot() + - # Distribution layer - violins (completely separated) - geom_violin( - data = iPlot_temporal_domain, - aes(x = x_dodged, y = DGEN_SCORE, fill = DOMAIN, group = interaction(x_pos, DOMAIN)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width / 4 - ) + - # Emmeans error bars - geom_errorbar( - data = emmeans_temporal_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - # Emmeans points - geom_point( - data = emmeans_temporal_domain, - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(0, 10), - breaks = seq(0, 10, 2) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal_domain) - -# ============================================================================= -# EMMEANS-ONLY PLOT: TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -# Create fresh emmeans data for emmeans-only plot -emm_temporal_domain_simple <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -emmeans_temporal_domain_simple <- emm_temporal_domain_simple %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -# Create emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_domain_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = DOMAIN), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - 
y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(3, 6), - breaks = seq(0, 10, 1) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006201108.r b/.history/eohi1/mixed anova - DGEN_20251006201108.r deleted file mode 100644 index faa9bbc..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006201108.r +++ /dev/null @@ -1,818 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with 
missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & 
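# Note on the non-finite filter spanning this point: a bootstrap resample can
# land on a group with zero variance, making the F-max ratio Inf or NaN (the
# 1e-10 floor above guards only the observed variances, not resampled ones),
# so invalid ratios are dropped before the 95th percentile is taken.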
!is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - 
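# Note on the corrected df being assembled here: both corrections rescale the
# uncorrected degrees of freedom by the sphericity epsilon, so that
# df_corrected = epsilon * df_original for numerator and denominator alike;
# the F statistic itself is unchanged, only its reference distribution shifts.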
HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION 
===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = 
%d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × 
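# Note on the manual dodging set up below: TEMPORAL_DO is mapped to x = 1, 2
# and the four domains are offset by (as.numeric(DOMAIN) - 2.5) *
# (dodge_width / 4), giving -0.225, -0.075, +0.075, +0.225 at dodge_width =
# 0.6. Computing positions by hand keeps violins, error bars, and points
# (three layers fed from two different data frames) on identical x values,
# which per-layer position_dodge() would not guarantee.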
DOMAIN interaction plot - clean line plot with distribution -# Convert to numeric x-axis and add position offsets for dodging -dodge_width <- 0.6 -iPlot_temporal_domain <- iPlot_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -emmeans_temporal_domain <- emmeans_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -interaction_plot_temporal_domain <- ggplot() + - # Distribution layer - violins (completely separated) - geom_violin( - data = iPlot_temporal_domain, - aes(x = x_dodged, y = DGEN_SCORE, fill = DOMAIN, group = interaction(x_pos, DOMAIN)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width / 4 - ) + - # Emmeans error bars - geom_errorbar( - data = emmeans_temporal_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - # Emmeans points - geom_point( - data = emmeans_temporal_domain, - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(0, 10), - breaks = seq(0, 10, 2) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal_domain) - -# ============================================================================= -# EMMEANS-ONLY PLOT: TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -# Create fresh emmeans data for emmeans-only plot -emm_temporal_domain_simple <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -emmeans_temporal_domain_simple <- emm_temporal_domain_simple %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -# Create emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_domain_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = DOMAIN), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - 
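# Note on the y scale a few lines below: ggplot2 silently drops breaks that
# fall outside the limits, so with limits = c(3, 6) the seq(0, 10, 1) breaks
# reduce to 3, 4, 5, 6; it is the zoomed limits, not the breaks, that
# visually separate the marginal means.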
y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(3, 6), - breaks = seq(0, 10, 1) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251006201257.r b/.history/eohi1/mixed anova - DGEN_20251006201257.r deleted file mode 100644 index 291c500..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251006201257.r +++ /dev/null @@ -1,818 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with 
missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
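# Aside, a hedged robustness note on the Levene's tests below: leveneTest()
# receives a formula but no data argument from inside summarise(), leaving
# model.frame() to locate DGEN_SCORE and TIME through dplyr's data mask.
# Passing the group's rows explicitly (dplyr >= 1.1) is sturdier:
#   summarise(lev = list(leveneTest(DGEN_SCORE ~ TIME,
#                                   data = pick(everything()))), ...)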
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & 
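# Caution on the bootstrap helper above: when `data` arrives as a tibble, as
# it does via pick() further down, data[rows, response_var] returns a
# one-column tibble, and sample() applied to that samples columns rather than
# values. Extracting the vector first avoids the problem:
#   data[[response_var]][data[[group_var]] == .x]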
!is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - 
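# Worked example of the rescaling tabulated here: an effect with uncorrected
# df (3, 297) and GG epsilon = 0.80 gets corrected df (0.80 * 3, 0.80 * 297)
# = (2.4, 237.6), and the corrected p-value is read from F(2.4, 237.6) at the
# unchanged F statistic.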
HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION 
===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = 
%d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × 
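# Note on the violin settings below: trim = FALSE lets the estimated density
# extend beyond the observed range, and scale = "width" draws every violin at
# the same maximum width regardless of group size; scale = "count" would let
# width reflect n instead, which matters if the TEMPORAL_DO arms differ in
# size.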
DOMAIN interaction plot - clean line plot with distribution -# Convert to numeric x-axis and add position offsets for dodging -dodge_width <- 0.6 -iPlot_temporal_domain <- iPlot_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -emmeans_temporal_domain <- emmeans_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -interaction_plot_temporal_domain <- ggplot() + - # Distribution layer - violins (completely separated) - geom_violin( - data = iPlot_temporal_domain, - aes(x = x_dodged, y = DGEN_SCORE, fill = DOMAIN, group = interaction(x_pos, DOMAIN)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width / 4 - ) + - # Emmeans error bars - geom_errorbar( - data = emmeans_temporal_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - # Emmeans points - geom_point( - data = emmeans_temporal_domain, - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(0, 10), - breaks = seq(0, 10, 2) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal_domain) - -# ============================================================================= -# EMMEANS-ONLY PLOT: TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -# Create fresh emmeans data for emmeans-only plot -emm_temporal_domain_simple <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -emmeans_temporal_domain_simple <- emm_temporal_domain_simple %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -# Create emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_domain_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = DOMAIN), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - 
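# Note on the shape scale below: 21 through 24 are the fillable point shapes,
# so fill carries the domain colour while color = "black" draws the outline;
# non-fillable shapes such as 16 through 18 would ignore scale_fill_manual().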
y = "Absolute difference from the present", - title = "TEMPORAL_DO × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(3, 6), - breaks = seq(0, 10, 1) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251007162848.r b/.history/eohi1/mixed anova - DGEN_20251007162848.r deleted file mode 100644 index 291c500..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251007162848.r +++ /dev/null @@ -1,818 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing 
values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
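# Worked example of the 1.5 * IQR fence used in the outlier summary above
# (illustrative numbers only, not taken from the data): with q1 = 3 and
# q3 = 6, iqr = 3, so the fences are 3 - 1.5 * 3 = -1.5 and
# 6 + 1.5 * 3 = 10.5; any DGEN_SCORE outside [-1.5, 10.5] would be flagged.
q1_demo <- 3; q3_demo <- 6; iqr_demo <- q3_demo - q1_demo
c(lower = q1_demo - 1.5 * iqr_demo, upper = q3_demo + 1.5 * iqr_demo)
#> lower upper
#>  -1.5  10.5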
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & 
!is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - 
HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION 
===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = 
%d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × 
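# The manual dodging used in the plot code below maps each of the 4 DOMAIN
# levels to a fixed horizontal offset around its TEMPORAL_DO position.
# Quick check of the arithmetic with dodge_width = 0.6:
# (k - 2.5) * (0.6 / 4) for k = 1..4 gives symmetric offsets around 0.
sapply(1:4, function(k) (k - 2.5) * (0.6 / 4))
#> [1] -0.225 -0.075  0.075  0.225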
DOMAIN interaction plot - clean line plot with distribution -# Convert to numeric x-axis and add position offsets for dodging -dodge_width <- 0.6 -iPlot_temporal_domain <- iPlot_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -emmeans_temporal_domain <- emmeans_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -interaction_plot_temporal_domain <- ggplot() + - # Distribution layer - violins (completely separated) - geom_violin( - data = iPlot_temporal_domain, - aes(x = x_dodged, y = DGEN_SCORE, fill = DOMAIN, group = interaction(x_pos, DOMAIN)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width / 4 - ) + - # Emmeans error bars - geom_errorbar( - data = emmeans_temporal_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - # Emmeans points - geom_point( - data = emmeans_temporal_domain, - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(0, 10), - breaks = seq(0, 10, 2) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal_domain) - -# ============================================================================= -# EMMEANS-ONLY PLOT: TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -# Create fresh emmeans data for emmeans-only plot -emm_temporal_domain_simple <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -emmeans_temporal_domain_simple <- emm_temporal_domain_simple %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -# Create emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_domain_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = DOMAIN), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - 
y = "Absolute difference from the present", - title = "TEMPORAL_DO × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(3, 6), - breaks = seq(0, 10, 1) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251007162926.r b/.history/eohi1/mixed anova - DGEN_20251007162926.r deleted file mode 100644 index e27e36e..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251007162926.r +++ /dev/null @@ -1,875 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing 
values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
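# Interpretation sketch for the Anderson-Darling results above: p < .05
# indicates a detectable departure from normality. Toy illustration with
# simulated data (nortest is already loaded; ad.test requires n >= 8):
set.seed(1)
ad.test(rnorm(200))  # roughly normal sample: large p expected
ad.test(rexp(200))   # strongly skewed sample: small p expected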
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & 
!is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - 
HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION 
===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = 
%d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × 
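# Sketch of the emmeans-to-data-frame step used in the plotting code here,
# shown on a self-contained toy model (illustrative only): as.data.frame()
# on an emmGrid exposes the emmean, SE, df, lower.CL and upper.CL columns
# that the plot code renames to plot_mean / ci_lower / ci_upper.
set.seed(42)
toy <- data.frame(y = rnorm(40), g = gl(2, 20, labels = c("a", "b")))
as.data.frame(emmeans(lm(y ~ g, data = toy), ~ g))
#>  g  emmean  SE  df  lower.CL  upper.CL   (values vary with the seed)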
DOMAIN interaction plot - clean line plot with distribution -# Convert to numeric x-axis and add position offsets for dodging -dodge_width <- 0.6 -iPlot_temporal_domain <- iPlot_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -emmeans_temporal_domain <- emmeans_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -interaction_plot_temporal_domain <- ggplot() + - # Distribution layer - violins (completely separated) - geom_violin( - data = iPlot_temporal_domain, - aes(x = x_dodged, y = DGEN_SCORE, fill = DOMAIN, group = interaction(x_pos, DOMAIN)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width / 4 - ) + - # Emmeans error bars - geom_errorbar( - data = emmeans_temporal_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - # Emmeans points - geom_point( - data = emmeans_temporal_domain, - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(0, 10), - breaks = seq(0, 10, 2) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal_domain) - -# ============================================================================= -# EMMEANS-ONLY PLOT: TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -# Create fresh emmeans data for emmeans-only plot -emm_temporal_domain_simple <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -emmeans_temporal_domain_simple <- emm_temporal_domain_simple %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -# Create emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_domain_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = DOMAIN), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - 
y = "Absolute difference from the present", - title = "TEMPORAL_DO × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(3, 6), - breaks = seq(0, 10, 1) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Emmeans + Error Bars Only) -# ============================================================================= - -# Prepare emmeans data frame for TIME main effect -time_main_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create TIME main-effect plot (style aligned with existing emmeans-only plot) -time_main_plot <- ggplot(time_main_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Absolute difference from the present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251007162939.r b/.history/eohi1/mixed anova - DGEN_20251007162939.r deleted file mode 100644 index e27e36e..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251007162939.r +++ /dev/null @@ -1,875 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc 
comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. 
Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, 
-response_var, n_iter = 1000) {
-  # data: one row per observation (data frame or tibble); group_var and
-  # response_var: column names as strings; n_iter: number of bootstrap resamples
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!rlang::sym(group_var)) %>%
-    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once, extracting with [[ ]] so the result is a plain vector
-  # whether `data` is a base data.frame or a tibble (pick() supplies a tibble)
-  group_data_list <- map(groups, ~ {
-    group_data <- data[[response_var]][data[[group_var]] == .x]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-print("=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check that no participant contributes duplicate observations per cell
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance %in% c(0, 1))) {
-  print("No duplicated cells: each participant has at most one observation per TIME × DOMAIN combination")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = DGEN_SCORE,
-                             wid = pID,
-                             between = TEMPORAL_DO,
-                             within = .(TIME,
DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- 
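-# (Bonferroni multiplies each pairwise p-value by the number of tests in the
-#  family; with 4 DOMAIN levels below, that is choose(4, 2) = 6 comparisons)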
emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - 
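-  # Assumption: emmeans contrast labels have the form "A - B", so the split on
-  # " - " below recovers the two levels only if no factor level itself
-  # contains " - " as a substring.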
if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# 
Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × DOMAIN interaction plot - clean line plot with distribution -# Convert to numeric x-axis and add position offsets for dodging -dodge_width <- 0.6 -iPlot_temporal_domain <- iPlot_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -emmeans_temporal_domain <- emmeans_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -interaction_plot_temporal_domain <- ggplot() + - # Distribution layer - violins (completely separated) - geom_violin( - data = iPlot_temporal_domain, - aes(x = x_dodged, y = DGEN_SCORE, fill = DOMAIN, group = interaction(x_pos, DOMAIN)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width / 4 - ) + - # Emmeans error bars - geom_errorbar( - data = emmeans_temporal_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - # Emmeans points - geom_point( - data = emmeans_temporal_domain, - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(0, 10), - breaks = seq(0, 10, 2) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal_domain) - -# ============================================================================= -# EMMEANS-ONLY PLOT: TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -# Create fresh emmeans data for emmeans-only plot -emm_temporal_domain_simple <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -emmeans_temporal_domain_simple <- emm_temporal_domain_simple %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - 
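-    # (in the mutate below, as.numeric(DOMAIN) - 2.5 maps the four domains to
-    #  -1.5, -0.5, 0.5, 1.5; multiplied by dodge_width / 4 = 0.15, each domain
-    #  sits up to ±0.225 from its order position on the numeric x-axis)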
ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -# Create emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_domain_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = DOMAIN), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Absolute difference from the present", - title = "TEMPORAL_DO × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(3, 6), - breaks = seq(0, 10, 1) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Emmeans + Error Bars Only) -# ============================================================================= - -# Prepare emmeans data frame for TIME main effect -time_main_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create TIME main-effect plot (style aligned with existing emmeans-only plot) -time_main_plot <- ggplot(time_main_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Absolute difference from the present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = 
NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251007162949.r b/.history/eohi1/mixed anova - DGEN_20251007162949.r deleted file mode 100644 index e27e36e..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251007162949.r +++ /dev/null @@ -1,875 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") 
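-# Quick sanity check on the reshaped data (a minimal sketch; `row_counts` is an
-# added helper name, not part of the original snapshot): each participant
-# should contribute at most 2 TIME × 4 DOMAIN = 8 rows after NA removal.
-row_counts <- long_data %>% count(pID)
-stopifnot(all(row_counts$n <= 8))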
-print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
-Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # Calculate F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# More efficient bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # data: one row per observation (data frame or tibble); group_var and
-  # response_var: column names as strings; n_iter: number of bootstrap resamples
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!rlang::sym(group_var)) %>%
-    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once, extracting with [[ ]] so the result is a plain vector
-  # whether `data` is a base data.frame or a tibble (pick() supplies a tibble)
-  group_data_list <- map(groups, ~ {
-    group_data <- data[[response_var]][data[[group_var]] == .x]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) &
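-                                   # (non-finite ratios occur when a resample
-                                   #  is constant, making its variance 0)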
!is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - 
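-    # (corrected df are the original df scaled by epsilon; e.g. a hypothetical
-    #  DFn = 3 with GG epsilon = 0.90 yields 2.70; the F value is unchanged,
-    #  only its reference distribution shifts)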
HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION 
===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = 
%d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × 
DOMAIN interaction plot - clean line plot with distribution -# Convert to numeric x-axis and add position offsets for dodging -dodge_width <- 0.6 -iPlot_temporal_domain <- iPlot_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -emmeans_temporal_domain <- emmeans_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -interaction_plot_temporal_domain <- ggplot() + - # Distribution layer - violins (completely separated) - geom_violin( - data = iPlot_temporal_domain, - aes(x = x_dodged, y = DGEN_SCORE, fill = DOMAIN, group = interaction(x_pos, DOMAIN)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width / 4 - ) + - # Emmeans error bars - geom_errorbar( - data = emmeans_temporal_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - # Emmeans points - geom_point( - data = emmeans_temporal_domain, - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(0, 10), - breaks = seq(0, 10, 2) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal_domain) - -# ============================================================================= -# EMMEANS-ONLY PLOT: TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -# Create fresh emmeans data for emmeans-only plot -emm_temporal_domain_simple <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -emmeans_temporal_domain_simple <- emm_temporal_domain_simple %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -# Create emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_domain_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = DOMAIN), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - 
y = "Absolute difference from the present", - title = "TEMPORAL_DO × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(3, 6), - breaks = seq(0, 10, 1) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Emmeans + Error Bars Only) -# ============================================================================= - -# Prepare emmeans data frame for TIME main effect -time_main_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create TIME main-effect plot (style aligned with existing emmeans-only plot) -time_main_plot <- ggplot(time_main_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Absolute difference from the present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251007162951.r b/.history/eohi1/mixed anova - DGEN_20251007162951.r deleted file mode 100644 index e27e36e..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251007162951.r +++ /dev/null @@ -1,875 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc 
comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. 
Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, 
response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, 
DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- 
emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - 
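-  # Note: pairs() was called with adjust = "bonferroni", so the p.value
-  # column screened here already holds Bonferroni-adjusted p-values; the
-  # 0.05 cutoff therefore applies to adjusted, not raw, p-values.
-  # Also note that cohen.d() below uses its default paired = FALSE, so
-  # even contrasts on the within-subject factors (TIME, DOMAIN) are
-  # standardized as an independent-groups d rather than a paired d_z.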
if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# 
Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × DOMAIN interaction plot - clean line plot with distribution -# Convert to numeric x-axis and add position offsets for dodging -dodge_width <- 0.6 -iPlot_temporal_domain <- iPlot_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -emmeans_temporal_domain <- emmeans_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -interaction_plot_temporal_domain <- ggplot() + - # Distribution layer - violins (completely separated) - geom_violin( - data = iPlot_temporal_domain, - aes(x = x_dodged, y = DGEN_SCORE, fill = DOMAIN, group = interaction(x_pos, DOMAIN)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width / 4 - ) + - # Emmeans error bars - geom_errorbar( - data = emmeans_temporal_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - # Emmeans points - geom_point( - data = emmeans_temporal_domain, - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(0, 10), - breaks = seq(0, 10, 2) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal_domain) - -# ============================================================================= -# EMMEANS-ONLY PLOT: TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -# Create fresh emmeans data for emmeans-only plot -emm_temporal_domain_simple <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -emmeans_temporal_domain_simple <- emm_temporal_domain_simple %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - 
ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -# Create emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_domain_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = DOMAIN), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Absolute difference from the present", - title = "TEMPORAL_DO × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(3, 6), - breaks = seq(0, 10, 1) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Emmeans + Error Bars Only) -# ============================================================================= - -# Prepare emmeans data frame for TIME main effect -time_main_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create TIME main-effect plot (style aligned with existing emmeans-only plot) -time_main_plot <- ggplot(time_main_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Absolute difference from the present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = 
NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251007180951.r b/.history/eohi1/mixed anova - DGEN_20251007180951.r deleted file mode 100644 index e27e36e..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251007180951.r +++ /dev/null @@ -1,875 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") 
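-# Note: long_data is built with filter(!is.na(DGEN_SCORE)), so the
-# na.rm = TRUE guards in these summaries are redundant (though harmless)
-# and the missing-values check further down will always report zero.
-# Illustrative sketch only (desc_stats_across is not used elsewhere,
-# assumes dplyr >= 1.0): the same per-cell table can be built more
-# compactly with across(), e.g.
-# desc_stats_across <- long_data %>%
-#   group_by(TIME, DOMAIN) %>%
-#   summarise(
-#     n = n(),
-#     across(DGEN_SCORE, list(mean = mean, sd = sd, median = median),
-#            .names = "{.fn}"),
-#     .groups = "drop"
-#   )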
-print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & 
!is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - 
HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION 
===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = 
%d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × 
DOMAIN interaction plot - clean line plot with distribution -# Convert to numeric x-axis and add position offsets for dodging -dodge_width <- 0.6 -iPlot_temporal_domain <- iPlot_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -emmeans_temporal_domain <- emmeans_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -interaction_plot_temporal_domain <- ggplot() + - # Distribution layer - violins (completely separated) - geom_violin( - data = iPlot_temporal_domain, - aes(x = x_dodged, y = DGEN_SCORE, fill = DOMAIN, group = interaction(x_pos, DOMAIN)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width / 4 - ) + - # Emmeans error bars - geom_errorbar( - data = emmeans_temporal_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - # Emmeans points - geom_point( - data = emmeans_temporal_domain, - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(0, 10), - breaks = seq(0, 10, 2) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal_domain) - -# ============================================================================= -# EMMEANS-ONLY PLOT: TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -# Create fresh emmeans data for emmeans-only plot -emm_temporal_domain_simple <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -emmeans_temporal_domain_simple <- emm_temporal_domain_simple %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -# Create emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_domain_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = DOMAIN), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - 
y = "Absolute difference from the present", - title = "TEMPORAL_DO × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(3, 6), - breaks = seq(0, 10, 1) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Emmeans + Error Bars Only) -# ============================================================================= - -# Prepare emmeans data frame for TIME main effect -time_main_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create TIME main-effect plot (style aligned with existing emmeans-only plot) -time_main_plot <- ggplot(time_main_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Absolute difference from the present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251010152730.r b/.history/eohi1/mixed anova - DGEN_20251010152730.r deleted file mode 100644 index e27e36e..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251010152730.r +++ /dev/null @@ -1,875 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc 
comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. 
Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, 
response_var, n_iter = 1000) {
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!rlang::sym(group_var)) %>%
-    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once
-  group_data_list <- map(groups, ~ {
-    group_data <- data[data[[group_var]] == .x, response_var]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-print("=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123)  # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check whether the design is complete/balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-# all(design_balance == 1) is the complete-data condition; counts in {0, 1}
-# only rule out duplicate rows, not missing cells
-if(all(design_balance == 1)) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN combinations")
-} else if(all(design_balance %in% c(0, 1))) {
-  print("Warning: no duplicate observations, but some participants are missing TIME × DOMAIN cells")
-  print(summary(as.vector(design_balance)))
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
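-
-# --- Added illustration (not in the original script) -------------------------
-# afex is loaded at the top of this script "for aov_ez" but never called. For
-# reference, a sketch of what is assumed to be the equivalent afex call
-# (type-3 sums of squares, sphericity corrections reported automatically). It
-# is left commented out so the ezANOVA pipeline below stays the single source
-# of results:
-# afex_model <- afex::aov_ez(id = "pID", dv = "DGEN_SCORE",
-#                            data = long_data_clean,
-#                            between = "TEMPORAL_DO",
-#                            within = c("TIME", "DOMAIN"), type = 3)
-# ------------------------------------------------------------------------------
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = DGEN_SCORE,
-                             wid = pID,
-                             between = TEMPORAL_DO,
-                             within = .(TIME, 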
DOMAIN),
-                             type = 3,
-                             detailed = TRUE)
-
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL  # Reset row numbers to be sequential
-print(anova_output)
-
-# Show Mauchly's test for sphericity
-print("Mauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print("Greenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  cat("\n=== CORRECTED F-TESTS ===\n")
-  for(i in seq_len(nrow(corrected_df))) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    # %.0f rather than %d: ezANOVA stores the DFs as doubles, and sprintf("%d", <double>) is an error in R
-    cat(sprintf(" Original: F(%.0f, %.0f) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  cat("\nNote: no sphericity corrections returned (ezANOVA only reports them when a within-subjects effect has more than two levels)\n")
-}
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-cat("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===\n")
-
-# Extract generalized eta squared from ezANOVA (already calculated)
-effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")]
-effect_sizes$ges <- round(effect_sizes$ges, 5)
-print("Generalized Eta Squared:")
-print(effect_sizes)
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-cat("\n=== POST-HOC COMPARISONS ===\n")
-
-# Refit as an aov model for emmeans (emmeans cannot work from ezANOVA's list output)
-aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                 data = long_data_clean)
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print("Estimated Marginal Means:")
-print(time_emmeans)
-cat("\nPairwise Contrasts:\n")
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
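-
-# --- Added illustration (not in the original script) -------------------------
-# confint() on a pairs() result gives (adjusted) confidence intervals for the
-# same contrasts; shown here for the TIME contrast defined just above.
-print(confint(time_contrasts, adjust = "bonferroni"))
-# ------------------------------------------------------------------------------
-
-# Main effect of DOMAIN
-cat("\nMain Effect of DOMAIN:\n")
-domain_emmeans <- 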
emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - 
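# (Descriptive comments added; logic unchanged.) This helper works on the
-  # data-frame version of an emmeans pairs() result: it keeps rows with
-  # adjusted p < .05, parses each contrast label of the form "A - B" back into
-  # factor levels, pulls the matching raw scores out of `data`, and passes the
-  # two vectors to effsize::cohen.d(). Note that cohen.d() treats the vectors
-  # as independent samples, so for contrasts on the within-subject factors this
-  # yields a between-style d rather than a paired d_z.
- 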
if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# 
Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × DOMAIN interaction plot - clean line plot with distribution -# Convert to numeric x-axis and add position offsets for dodging -dodge_width <- 0.6 -iPlot_temporal_domain <- iPlot_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -emmeans_temporal_domain <- emmeans_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -interaction_plot_temporal_domain <- ggplot() + - # Distribution layer - violins (completely separated) - geom_violin( - data = iPlot_temporal_domain, - aes(x = x_dodged, y = DGEN_SCORE, fill = DOMAIN, group = interaction(x_pos, DOMAIN)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width / 4 - ) + - # Emmeans error bars - geom_errorbar( - data = emmeans_temporal_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - # Emmeans points - geom_point( - data = emmeans_temporal_domain, - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(0, 10), - breaks = seq(0, 10, 2) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal_domain) - -# ============================================================================= -# EMMEANS-ONLY PLOT: TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -# Create fresh emmeans data for emmeans-only plot -emm_temporal_domain_simple <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -emmeans_temporal_domain_simple <- emm_temporal_domain_simple %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - 
ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -# Create emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_domain_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = DOMAIN), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Absolute difference from the present", - title = "TEMPORAL_DO × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(3, 6), - breaks = seq(0, 10, 1) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Emmeans + Error Bars Only) -# ============================================================================= - -# Prepare emmeans data frame for TIME main effect -time_main_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create TIME main-effect plot (style aligned with existing emmeans-only plot) -time_main_plot <- ggplot(time_main_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Absolute difference from the present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = 
NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - DGEN_20251010165036.r b/.history/eohi1/mixed anova - DGEN_20251010165036.r deleted file mode 100644 index e27e36e..0000000 --- a/.history/eohi1/mixed anova - DGEN_20251010165036.r +++ /dev/null @@ -1,875 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment Data Analysis - DGEN Level Analysis -# Variables: pastPref_DGEN, pastPers_DGEN, pastVal_DGEN, pastLife_DGEN -# futPref_DGEN, futPers_DGEN, futVal_DGEN, futLife_DGEN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Verify the specific variables we need -required_vars <- c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("pastPref_DGEN", "pastPers_DGEN", "pastVal_DGEN", "pastLife_DGEN", - "futPref_DGEN", "futPers_DGEN", "futVal_DGEN", "futLife_DGEN"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") 
-print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(DGEN_SCORE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(DGEN_SCORE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & 
!is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, DGEN_SCORE), "TEMPORAL_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - 
HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION 
===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = 
%d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "DGEN_SCORE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "DGEN_SCORE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "DGEN_SCORE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# ============================================================================= -# INTERACTION PLOTS -# ============================================================================= - -print("=== INTERACTION PLOTS ===") - -# Define color palette for DOMAIN (4 levels) -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# TEMPORAL_DO × DOMAIN INTERACTION PLOT -# Create estimated marginal means for TEMPORAL_DO × DOMAIN -emm_temporal_domain <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -# Prepare emmeans data frame -emmeans_temporal_domain <- emm_temporal_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Prepare raw data for plotting -iPlot_temporal_domain <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, DOMAIN, DGEN_SCORE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - ) - -# Create TEMPORAL_DO × 
DOMAIN interaction plot - clean line plot with distribution -# Convert to numeric x-axis and add position offsets for dodging -dodge_width <- 0.6 -iPlot_temporal_domain <- iPlot_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -emmeans_temporal_domain <- emmeans_temporal_domain %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -interaction_plot_temporal_domain <- ggplot() + - # Distribution layer - violins (completely separated) - geom_violin( - data = iPlot_temporal_domain, - aes(x = x_dodged, y = DGEN_SCORE, fill = DOMAIN, group = interaction(x_pos, DOMAIN)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width / 4 - ) + - # Emmeans error bars - geom_errorbar( - data = emmeans_temporal_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - # Emmeans points - geom_point( - data = emmeans_temporal_domain, - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(0, 10), - breaks = seq(0, 10, 2) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal_domain) - -# ============================================================================= -# EMMEANS-ONLY PLOT: TEMPORAL_DO × DOMAIN INTERACTION -# ============================================================================= - -# Create fresh emmeans data for emmeans-only plot -emm_temporal_domain_simple <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - -emmeans_temporal_domain_simple <- emm_temporal_domain_simple %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - x_pos = as.numeric(TEMPORAL_DO), - domain_offset = (as.numeric(DOMAIN) - 2.5) * (dodge_width / 4), - x_dodged = x_pos + domain_offset - ) - -# Create emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_domain_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = DOMAIN), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = DOMAIN, shape = DOMAIN), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - 
y = "Absolute difference from the present", - title = "TEMPORAL_DO × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.4, 2.6) - ) + - scale_y_continuous( - limits = c(3, 6), - breaks = seq(0, 10, 1) - ) + - scale_color_manual(name = "Domain", values = domain_colors) + - scale_fill_manual(name = "Domain", values = domain_colors) + - scale_shape_manual(name = "Domain", values = c(21, 22, 23, 24)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Emmeans + Error Bars Only) -# ============================================================================= - -# Prepare emmeans data frame for TIME main effect -time_main_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create TIME main-effect plot (style aligned with existing emmeans-only plot) -time_main_plot <- ggplot(time_main_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Absolute difference from the present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - domain means SIMPLE_20251003125947.r b/.history/eohi1/mixed anova - domain means SIMPLE_20251003125947.r deleted file mode 100644 index 4d85d77..0000000 --- a/.history/eohi1/mixed anova - domain means SIMPLE_20251003125947.r +++ /dev/null @@ -1,320 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - SIMPLIFIED VERSION -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA 
output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = 
round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, 
anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# SUMMARY OF KEY FINDINGS -# ============================================================================= - -print("\n=== SUMMARY OF KEY FINDINGS ===") -cat("SIGNIFICANT EFFECTS (p < 0.05):\n") -cat("• TIME main effect: F(1,1061) = 78.80, p < 0.001, η²G = 0.008\n") -cat("• DOMAIN main effect: F(3,3183) = 206.90, p < 0.001, η²G = 0.059\n") -cat("• TEMPORAL_DO × TIME interaction: F(1,1061) = 10.97, p = 0.001, η²G = 0.001\n") -cat("• TIME × DOMAIN interaction: F(3,3183) = 3.65, p = 0.012, η²G = 0.001\n") -cat("\nMARGINAL EFFECTS (p < 0.10):\n") -cat("• TEMPORAL_DO × DOMAIN interaction: F(3,3183) = 2.50, p = 0.058, η²G = 0.001\n") -cat("\nNON-SIGNIFICANT EFFECTS:\n") -cat("• TEMPORAL_DO main effect: F(1,1061) = 0.41, p = 0.524, η²G = 0.00015\n") -cat("• Three-way interaction: F(3,3183) = 0.77, p = 0.511, η²G = 0.00013\n") -cat("\nEFFECT SIZE INTERPRETATION (η²G):\n") -cat("• Large: η²G > 0.14\n") -cat("• Medium: η²G > 0.06\n") -cat("• Small: η²G > 0.01\n") -cat("• Negligible: η²G ≤ 0.01\n") - -print("\n=== ANALYSIS COMPLETE ===") -print("This simplified version includes all essential results without computationally intensive post-hoc comparisons.") -print("The main ANOVA results show all significant effects with proper sphericity corrections.") diff --git a/.history/eohi1/mixed anova - domain means SIMPLE_20251003130013.r b/.history/eohi1/mixed anova - domain means SIMPLE_20251003130013.r deleted file mode 100644 index 4d85d77..0000000 --- a/.history/eohi1/mixed anova - domain means SIMPLE_20251003130013.r +++ /dev/null @@ -1,320 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - SIMPLIFIED VERSION -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: 
NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - 
) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, 
anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# SUMMARY OF KEY FINDINGS -# ============================================================================= - -print("\n=== SUMMARY OF KEY FINDINGS ===") -cat("SIGNIFICANT EFFECTS (p < 0.05):\n") -cat("• TIME main effect: F(1,1061) = 78.80, p < 0.001, η²G = 0.008\n") -cat("• DOMAIN main effect: F(3,3183) = 206.90, p < 0.001, η²G = 0.059\n") -cat("• TEMPORAL_DO × TIME interaction: F(1,1061) = 10.97, p = 0.001, η²G = 0.001\n") -cat("• TIME × DOMAIN interaction: F(3,3183) = 3.65, p = 0.012, η²G = 0.001\n") -cat("\nMARGINAL EFFECTS (p < 0.10):\n") -cat("• TEMPORAL_DO × DOMAIN interaction: F(3,3183) = 2.50, p = 0.058, η²G = 0.001\n") -cat("\nNON-SIGNIFICANT EFFECTS:\n") -cat("• TEMPORAL_DO main effect: F(1,1061) = 0.41, p = 0.524, η²G = 0.00015\n") -cat("• Three-way interaction: F(3,3183) = 0.77, p = 0.511, η²G = 0.00013\n") -cat("\nEFFECT SIZE INTERPRETATION (η²G):\n") -cat("• Large: η²G > 0.14\n") -cat("• Medium: η²G > 0.06\n") -cat("• Small: η²G > 0.01\n") -cat("• Negligible: η²G ≤ 0.01\n") - -print("\n=== ANALYSIS COMPLETE ===") -print("This simplified version includes all essential results without computationally intensive post-hoc comparisons.") -print("The main ANOVA results show all significant effects with proper sphericity corrections.") diff --git a/.history/eohi1/mixed anova - domain means_20250912153102.r b/.history/eohi1/mixed anova - domain means_20250912153102.r deleted file mode 100644 index 36c68c0..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912153102.r +++ /dev/null @@ -1,272 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, 
NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] 
-print(participant_1_data)
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME and DOMAIN
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5),
-    min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME and DOMAIN:")
-print(desc_stats)
-
-# Descriptive statistics by between-subjects factors
-# (long_data carries TEMPORAL_DO, not GROUP, so group by TEMPORAL_DO)
-desc_stats_by_temporal <- long_data %>%
-  group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:")
-print(desc_stats_by_temporal)
-
-# Overall means by TIME (collapsed across domains)
-time_means <- long_data %>%
-  group_by(TIME) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Overall means by TIME (collapsed across domains):")
-print(time_means)
-
-# Overall means by DOMAIN (collapsed across time)
-domain_means <- long_data %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Overall means by DOMAIN (collapsed across time):")
-print(domain_means)
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing
-long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ]
-print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x")))
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(MEAN_DIFFERENCE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME and DOMAIN:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
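# Aside: a minimal, more explicit form of the Levene tests run in step 4
# below -- the same call, but with an explicit data argument instead of a
# bare formula inside summarise(). A sketch for a single DOMAIN level;
# car is already loaded above, and leveneTest()'s default center = median
# gives the Brown-Forsythe variant.
pref_rows <- subset(long_data_clean, DOMAIN == "Preferences")
print(car::leveneTest(MEAN_DIFFERENCE ~ TIME, data = pref_rows))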
-# 3. Normality tests
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    shapiro_p = ifelse(n >= 3 & n <= 5000,
-                       shapiro.test(MEAN_DIFFERENCE)$p.value,
-                       NA),
-    anderson_p = ifelse(n >= 8,  # nortest::ad.test() requires n > 7
-                        ad.test(MEAN_DIFFERENCE)$p.value,
-                        NA),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    shapiro_normal = shapiro_p > 0.05,
-    anderson_normal = anderson_p > 0.05,
-    overall_normal = case_when(
-      !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal,
-      !is.na(shapiro_p) ~ shapiro_normal,
-      !is.na(anderson_p) ~ anderson_normal,
-      TRUE ~ NA
-    )
-  )
-
-print("Normality test results:")
-print(normality_results)
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    homogeneous = levene_p > 0.05,
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    homogeneous = levene_p > 0.05,
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
diff --git a/.history/eohi1/mixed anova - domain means_20250912153232.r b/.history/eohi1/mixed anova - domain means_20250912153232.r
deleted file mode 100644
index d8ee82f..0000000
--- a/.history/eohi1/mixed anova - domain means_20250912153232.r
+++ /dev/null
@@ -1,274 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(nortest)  # For normality tests
-library(ggplot2)  # For plotting
-library(emmeans)  # For post-hoc comparisons
-
-# Use forward slashes: backslashes are escape characters in R strings
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(dim(data))
-print(length(unique(data$pID)))
-
-# Check experimental conditions
-print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO))
-
-# Check what domain mean columns are available
-domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))]
-print(domain_mean_cols)
-
-# Verify the specific variables we need
-required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required domain mean variables found!")
-}
-
-# Define domain mapping
-domain_mapping <- data.frame(
-  variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-               "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
-  time = c(rep("Past", 4), rep("Future", 4)),
-  domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
-  stringsAsFactors = FALSE
-)
-
-print(domain_mapping)
-
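# Aside: the loop-plus-rbind() pivot defined just below can be expressed as a
# single tidyr::pivot_longer() call, which is the approach the later SIMPLE
# scripts in this patch take. A minimal sketch, assuming the tidyverse is
# loaded and the data / domain_mapping objects built above:
long_data_alt <- data %>%
  select(pID, ResponseId, TEMPORAL_DO, all_of(domain_mapping$variable)) %>%
  pivot_longer(
    cols = all_of(domain_mapping$variable),
    names_to = "variable",
    values_to = "MEAN_DIFFERENCE"
  ) %>%
  left_join(domain_mapping, by = "variable") %>%
  mutate(
    TIME = factor(time, levels = c("Past", "Future")),
    DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life"))
  )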
1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_group <- long_data %>% - group_by(GROUP, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by GROUP, TIME, and DOMAIN:") -print(desc_stats_by_group) - -# Overall means by TIME (collapsed across domains) -time_means <- long_data %>% - group_by(TIME) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Overall means by 
TIME (collapsed across domains):") -print(time_means) - -# Overall means by DOMAIN (collapsed across time) -domain_means <- long_data %>% - group_by(DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Overall means by DOMAIN (collapsed across time):") -print(domain_means) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -print("Normality test results:") -print(normality_results) - -# 4.
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - diff --git a/.history/eohi1/mixed anova - domain means_20250912153241.r b/.history/eohi1/mixed anova - domain means_20250912153241.r deleted file mode 100644 index d8ee82f..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912153241.r +++ /dev/null @@ -1,274 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Use forward slashes: backslash escapes like "\U" are a syntax error in R strings -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data)
- } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_group <- long_data %>% - group_by(GROUP, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by GROUP, TIME, and DOMAIN:") -print(desc_stats_by_group) - -# Overall means by TIME (collapsed across domains) -time_means <- long_data %>% - group_by(TIME) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Overall means by TIME (collapsed across domains):") -print(time_means) - -# Overall means by DOMAIN (collapsed across time) -domain_means <- long_data %>% - group_by(DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Overall means by DOMAIN (collapsed across time):") -print(domain_means) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing 
values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -print("Normality test results:") -print(normality_results) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - diff --git a/.history/eohi1/mixed anova - domain means_20250912153323.r b/.history/eohi1/mixed anova - domain means_20250912153323.r deleted file mode 100644 index 2e14e93..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912153323.r +++ /dev/null @@ -1,274 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))]
-print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = 
round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_group <- long_data %>% - group_by(GROUP, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by GROUP, TIME, and DOMAIN:") -print(desc_stats_by_group) - -# Overall means by TIME (collapsed across domains) -time_means <- long_data %>% - group_by(TIME) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Overall means by TIME (collapsed across domains):") -print(time_means) - -# Overall means by DOMAIN (collapsed across time) -domain_means <- long_data %>% - group_by(DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Overall means by DOMAIN (collapsed across time):") -print(domain_means) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -print("Normality test results:") -print(normality_results) - -# 4.
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - diff --git a/.history/eohi1/mixed anova - domain means_20250912153326.r b/.history/eohi1/mixed anova - domain means_20250912153326.r deleted file mode 100644 index 2e14e93..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912153326.r +++ /dev/null @@ -1,274 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) 
- } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_group <- long_data %>% - group_by(GROUP, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by GROUP, TIME, and DOMAIN:") -print(desc_stats_by_group) - -# Overall means by TIME (collapsed across domains) -time_means <- long_data %>% - group_by(TIME) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Overall means by TIME (collapsed across domains):") -print(time_means) - -# Overall means by DOMAIN (collapsed across time) -domain_means <- long_data %>% - group_by(DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Overall means by DOMAIN (collapsed across time):") -print(domain_means) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing 
values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -print("Normality test results:") -print(normality_results) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - diff --git a/.history/eohi1/mixed anova - domain means_20250912153327.r b/.history/eohi1/mixed anova - domain means_20250912153327.r deleted file mode 100644 index 2e14e93..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912153327.r +++ /dev/null @@ -1,274 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))]
-print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = 
round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_group <- long_data %>% - group_by(GROUP, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by GROUP, TIME, and DOMAIN:") -print(desc_stats_by_group) - -# Overall means by TIME (collapsed across domains) -time_means <- long_data %>% - group_by(TIME) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Overall means by TIME (collapsed across domains):") -print(time_means) - -# Overall means by DOMAIN (collapsed across time) -domain_means <- long_data %>% - group_by(DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Overall means by DOMAIN (collapsed across time):") -print(domain_means) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -print("Normality test results:") -print(normality_results) - -# 4.
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - diff --git a/.history/eohi1/mixed anova - domain means_20250912153352.r b/.history/eohi1/mixed anova - domain means_20250912153352.r deleted file mode 100644 index a44f1b6..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912153352.r +++ /dev/null @@ -1,274 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) 
- } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_group <- long_data %>% - group_by(GROUP, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by GROUP, TIME, and DOMAIN:") -print(desc_stats_by_group) - -# Overall means by TIME (collapsed across domains) -time_means <- long_data %>% - group_by(TIME) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Overall means by TIME (collapsed across domains):") -print(time_means) - -# Overall means by DOMAIN (collapsed across time) -domain_means <- long_data %>% - group_by(DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Overall means by DOMAIN (collapsed across time):") -print(domain_means) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing 
values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -print("Normality test results:") -print(normality_results) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - diff --git a/.history/eohi1/mixed anova - domain means_20250912153354.r b/.history/eohi1/mixed anova - domain means_20250912153354.r deleted file mode 100644 index a44f1b6..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912153354.r +++ /dev/null @@ -1,274 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] 
-print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = 
round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_group <- long_data %>% - group_by(GROUP, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by GROUP, TIME, and DOMAIN:") -print(desc_stats_by_group) - -# Overall means by TIME (collapsed across domains) -time_means <- long_data %>% - group_by(TIME) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Overall means by TIME (collapsed across domains):") -print(time_means) - -# Overall means by DOMAIN (collapsed across time) -domain_means <- long_data %>% - group_by(DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Overall means by DOMAIN (collapsed across time):") -print(domain_means) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -print("Normality test results:") -print(normality_results) - -# 4. 
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    homogeneous = levene_p > 0.05,
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    homogeneous = levene_p > 0.05,
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
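A note on the homogeneity block that closes each snapshot: car::leveneTest() is called inside summarise() with a bare formula and no data argument, so it only finds MEAN_DIFFERENCE and TIME through dplyr's data mask. A more explicit per-group version, sketched assuming R 4.1+ (for the formula method of split()) and the long_data_clean defined above:

library(car)
library(purrr)

homogeneity_time <- long_data_clean %>%
  split(~DOMAIN) %>%                      # one data frame per domain
  map_dfr(function(d) {
    lev <- leveneTest(MEAN_DIFFERENCE ~ TIME, data = d)  # explicit data
    tibble(levene_p = lev$`Pr(>F)`[1],
           homogeneous = levene_p > 0.05)
  }, .id = "DOMAIN")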
diff --git a/.history/eohi1/mixed anova - domain means_20250912154157.r b/.history/eohi1/mixed anova - domain means_20250912154157.r
deleted file mode 100644
index 2bcbe1a..0000000
--- a/.history/eohi1/mixed anova - domain means_20250912154157.r
+++ /dev/null
@@ -1,274 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(nortest)  # For normality tests
-library(ggplot2)  # For plotting
-library(emmeans)  # For post-hoc comparisons
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(dim(data))
-print(length(unique(data$pID)))
-
-# Check experimental conditions
-print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO))
-
-# Check what domain mean columns are available
-domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))]
-print(domain_mean_cols)
-
-# Verify the specific variables we need
-required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required domain mean variables found!")
-}
-
-# Define domain mapping
-domain_mapping <- data.frame(
-  variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-               "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
-  time = c(rep("Past", 4), rep("Future", 4)),
-  domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
-  stringsAsFactors = FALSE
-)
-
-print(domain_mapping)
-
-# Function to pivot data to long format
-pivot_domain_means <- function(data, domain_mapping) {
-  long_data <- data.frame()
-
-  for (i in 1:nrow(domain_mapping)) {
-    var_name <- domain_mapping$variable[i]
-    time_level <- domain_mapping$time[i]
-    domain_level <- domain_mapping$domain[i]
-
-    # Check if variable exists
-    if (!var_name %in% colnames(data)) {
-      print(paste("Warning: Variable", var_name, "not found in data"))
-      next
-    }
-
-    # Create subset for this variable
-    subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)]
-    subset_data$TIME <- time_level
-    subset_data$DOMAIN <- domain_level
-    subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]]
-    subset_data[[var_name]] <- NULL  # Remove original column
-
-    # Add to long data
-    long_data <- rbind(long_data, subset_data)
-  }
-
-  # Convert to factors with proper levels
-  long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future"))
-  long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life"))
-  long_data$pID <- as.factor(long_data$pID)
-  long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO)
-
-  return(long_data)
-}
-
-# Pivot data to long format
-tryCatch({
-  long_data <- pivot_domain_means(data, domain_mapping)
-}, error = function(e) {
-  print(paste("Error in data pivoting:", e$message))
-  stop("Cannot proceed without proper data structure")
-})
-
-print(dim(long_data))
-print(length(unique(long_data$pID)))
-print(levels(long_data$TIME))
-print(levels(long_data$DOMAIN))
-
-# Check data types
-print(is.factor(long_data$TIME))
-print(is.factor(long_data$DOMAIN))
-print(is.factor(long_data$pID))
-print(is.numeric(long_data$MEAN_DIFFERENCE))
-
-# Show first 20 rows
-print(utils::head(long_data, 20))
-
-# Display structure and sample
-str(long_data)
-
-print(utils::head(long_data, 10))
-
-# Show example data for one participant
-participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")]
-print(participant_1_data)
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME and DOMAIN
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5),
-    min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME and DOMAIN:")
-print(desc_stats)
-
-# Descriptive statistics by between-subjects factors
-desc_stats_by_temporal <- long_data %>%
-  group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:")
-print(desc_stats_by_temporal)
-
-# Overall means by TIME (collapsed across domains)
-time_means <- long_data %>%
-  group_by(TIME) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Overall means by TIME (collapsed across domains):")
-print(time_means)
-
-# Overall means by DOMAIN (collapsed across time)
-domain_means <- long_data %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Overall means by DOMAIN (collapsed across time):")
-print(domain_means)
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing
-long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ]
-print(paste("Data after removing missing values:", dim(long_data_clean)))
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(MEAN_DIFFERENCE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME and DOMAIN:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Normality tests
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    shapiro_p = ifelse(n >= 3 & n <= 5000,
-                       shapiro.test(MEAN_DIFFERENCE)$p.value,
-                       NA),
-    anderson_p = ifelse(n >= 7,
-                        ad.test(MEAN_DIFFERENCE)$p.value,
-                        NA),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    shapiro_normal = shapiro_p > 0.05,
-    anderson_normal = anderson_p > 0.05,
-    overall_normal = case_when(
-      !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal,
-      !is.na(shapiro_p) ~ shapiro_normal,
-      !is.na(anderson_p) ~ anderson_normal,
-      TRUE ~ NA
-    )
-  )
-
-print("Normality test results:")
-print(normality_results)
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    homogeneous = levene_p > 0.05,
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    homogeneous = levene_p > 0.05,
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
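None of the snapshots in this range gets past the assumption checks, although ez is loaded at the top of every one. A sketch of the mixed ANOVA the filename announces, with TIME and DOMAIN within subjects and pID as the case identifier; treating TEMPORAL_DO as the between-subjects factor is an assumption read off the columns the pivot keeps, not something these files state:

library(ez)

# ezANOVA expects one observation per subject x TIME x DOMAIN cell
anova_results <- ezANOVA(
  data     = long_data_clean,
  dv       = MEAN_DIFFERENCE,
  wid      = pID,
  within   = .(TIME, DOMAIN),
  between  = .(TEMPORAL_DO),
  type     = 3,
  detailed = TRUE
)
print(anova_results)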
diff --git a/.history/eohi1/mixed anova - domain means_20250912154303.r b/.history/eohi1/mixed anova - domain means_20250912154303.r
deleted file mode 100644
index 3d5f420..0000000
--- a/.history/eohi1/mixed anova - domain means_20250912154303.r
+++ /dev/null
@@ -1,247 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(nortest)  # For normality tests
-library(ggplot2)  # For plotting
-library(emmeans)  # For post-hoc comparisons
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(dim(data))
-print(length(unique(data$pID)))
-
-# Check experimental conditions
-print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO))
-
-# Check what domain mean columns are available
-domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))]
-print(domain_mean_cols)
-
-# Verify the specific variables we need
-required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required domain mean variables found!")
-}
-
-# Define domain mapping
-domain_mapping <- data.frame(
-  variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-               "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
-  time = c(rep("Past", 4), rep("Future", 4)),
-  domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
-  stringsAsFactors = FALSE
-)
-
-print(domain_mapping)
-
-# Function to pivot data to long format
-pivot_domain_means <- function(data, domain_mapping) {
-  long_data <- data.frame()
-
-  for (i in 1:nrow(domain_mapping)) {
-    var_name <- domain_mapping$variable[i]
-    time_level <- domain_mapping$time[i]
-    domain_level <- domain_mapping$domain[i]
-
-    # Check if variable exists
-    if (!var_name %in% colnames(data)) {
-      print(paste("Warning: Variable", var_name, "not found in data"))
-      next
-    }
-
-    # Create subset for this variable
-    subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)]
-    subset_data$TIME <- time_level
-    subset_data$DOMAIN <- domain_level
-    subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]]
-    subset_data[[var_name]] <- NULL  # Remove original column
-
-    # Add to long data
-    long_data <- rbind(long_data, subset_data)
-  }
-
-  # Convert to factors with proper levels
-  long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future"))
-  long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life"))
-  long_data$pID <- as.factor(long_data$pID)
-  long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO)
-
-  return(long_data)
-}
-
-# Pivot data to long format
-tryCatch({
-  long_data <- pivot_domain_means(data, domain_mapping)
-}, error = function(e) {
-  print(paste("Error in data pivoting:", e$message))
-  stop("Cannot proceed without proper data structure")
-})
-
-print(dim(long_data))
-print(length(unique(long_data$pID)))
-print(levels(long_data$TIME))
-print(levels(long_data$DOMAIN))
-
-# Check data types
-print(is.factor(long_data$TIME))
-print(is.factor(long_data$DOMAIN))
-print(is.factor(long_data$pID))
-print(is.numeric(long_data$MEAN_DIFFERENCE))
-
-# Show first 20 rows
-print(utils::head(long_data, 20))
-
-# Display structure and sample
-str(long_data)
-
-print(utils::head(long_data, 10))
-
-# Show example data for one participant
-participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")]
-print(participant_1_data)
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME and DOMAIN
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5),
-    min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME and DOMAIN:")
-print(desc_stats)
-
-# Descriptive statistics by between-subjects factors
-desc_stats_by_temporal <- long_data %>%
-  group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:")
-print(desc_stats_by_temporal)
-
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing
-long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ]
-print(paste("Data after removing missing values:", dim(long_data_clean)))
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(MEAN_DIFFERENCE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME and DOMAIN:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Normality tests
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    shapiro_p = ifelse(n >= 3 & n <= 5000,
-                       shapiro.test(MEAN_DIFFERENCE)$p.value,
-                       NA),
-    anderson_p = ifelse(n >= 7,
-                        ad.test(MEAN_DIFFERENCE)$p.value,
-                        NA),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    shapiro_normal = shapiro_p > 0.05,
-    anderson_normal = anderson_p > 0.05,
-    overall_normal = case_when(
-      !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal,
-      !is.na(shapiro_p) ~ shapiro_normal,
-      !is.na(anderson_p) ~ anderson_normal,
-      TRUE ~ NA
-    )
-  )
-
-print("Normality test results:")
-print(normality_results)
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    homogeneous = levene_p > 0.05,
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    homogeneous = levene_p > 0.05,
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
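emmeans is likewise loaded "for post-hoc comparisons" in every snapshot but never called. One plausible follow-up, assuming a repeated-measures aov fit of the same design (the model formula is illustrative, not taken from these files):

library(emmeans)

fit <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN +
             Error(pID / (TIME * DOMAIN)),
           data = long_data_clean)

# Past vs. Future within each domain, Bonferroni-adjusted
emm <- emmeans(fit, ~ TIME | DOMAIN)
print(pairs(emm, adjust = "bonferroni"))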
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -print("Normality test results:") -print(normality_results) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - diff --git a/.history/eohi1/mixed anova - domain means_20250912154904.r b/.history/eohi1/mixed anova - domain means_20250912154904.r deleted file mode 100644 index 4c9cfa3..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912154904.r +++ /dev/null @@ -1,247 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain 
mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = 
TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -print("Normality test results:") -print(normality_results) - -# 4. 
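# In cells of any appreciable size, Shapiro-Wilk and Anderson-Darling flag even
# mild departures from normality; a faceted QQ plot (ggplot2 is loaded above)
# shows whether the departures the tests report are actually severe:
ggplot(long_data_clean, aes(sample = MEAN_DIFFERENCE)) +
  stat_qq() +
  stat_qq_line() +
  facet_grid(TIME ~ DOMAIN)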
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - diff --git a/.history/eohi1/mixed anova - domain means_20250912154909.r b/.history/eohi1/mixed anova - domain means_20250912154909.r deleted file mode 100644 index 4c9cfa3..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912154909.r +++ /dev/null @@ -1,247 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # 
Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
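# A mixed (split-plot) ANOVA needs each participant observed in all
# 2 x 4 = 8 TIME x DOMAIN cells, so a completeness check pairs naturally with
# the missing-value summary above. A sketch, assuming one row per participant
# per cell; long_data_complete is an illustrative name:
cells_per_id <- long_data_clean %>% count(pID, name = "n_cells")
table(cells_per_id$n_cells)  # any value below 8 marks an incomplete participant

long_data_complete <- long_data_clean %>%
  group_by(pID) %>%
  filter(n() == 8) %>%
  ungroup()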
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -print("Normality test results:") -print(normality_results) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - diff --git a/.history/eohi1/mixed anova - domain means_20250912155010.r b/.history/eohi1/mixed anova - domain means_20250912155010.r deleted file mode 100644 index 019c325..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912155010.r +++ /dev/null @@ -1,246 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing 
variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, 
DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# 4. 
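# The block above runs shapiro.test() and ad.test() twice per cell (once for
# the statistic, once for the p-value). A helper returning a one-row tibble
# runs each test once and keeps the n guards in one place; norm_row and
# normality_results2 are illustrative names:
norm_row <- function(x) {
  n <- sum(!is.na(x))
  s <- if (n >= 3 && n <= 5000) shapiro.test(x) else NULL
  a <- if (n >= 7) ad.test(x) else NULL
  tibble(
    shapiro_W  = if (is.null(s)) NA_real_ else unname(s$statistic),
    shapiro_p  = if (is.null(s)) NA_real_ else s$p.value,
    anderson_A = if (is.null(a)) NA_real_ else unname(a$statistic),
    anderson_p = if (is.null(a)) NA_real_ else a$p.value
  )
}

normality_results2 <- long_data_clean %>%
  group_by(TIME, DOMAIN) %>%
  summarise(n = n(), norm_row(MEAN_DIFFERENCE), .groups = "drop")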
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - diff --git a/.history/eohi1/mixed anova - domain means_20250912155017.r b/.history/eohi1/mixed anova - domain means_20250912155017.r deleted file mode 100644 index 019c325..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912155017.r +++ /dev/null @@ -1,246 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- 
subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
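# desc_stats above already holds the TIME x DOMAIN cell means, so plotting them
# previews the interaction the ANOVA will test (ggplot2 is loaded above):
ggplot(desc_stats, aes(x = DOMAIN, y = mean, colour = TIME, group = TIME)) +
  geom_point(size = 2) +
  geom_line() +
  labs(y = "Cell mean of MEAN_DIFFERENCE", title = "TIME x DOMAIN cell means")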
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - diff --git a/.history/eohi1/mixed anova - domain means_20250912155100.r b/.history/eohi1/mixed anova - domain means_20250912155100.r deleted file mode 100644 index 019c325..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912155100.r +++ /dev/null @@ -1,246 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain 
mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = 
TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# 4. 
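# The Levene tests below build their formula inside summarise() and rely on
# the data mask to supply MEAN_DIFFERENCE and TIME; passing the group's rows
# explicitly is sturdier across dplyr versions. A sketch, assuming dplyr >= 1.1
# for pick(); homogeneity_time2 is an illustrative name:
homogeneity_time2 <- long_data_clean %>%
  group_by(DOMAIN) %>%
  summarise(
    levene   = list(leveneTest(MEAN_DIFFERENCE ~ TIME, data = pick(everything()))),
    levene_F = levene[[1]]$`F value`[1],
    levene_p = levene[[1]]$`Pr(>F)`[1],
    .groups  = "drop"
  ) %>%
  select(-levene)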
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - diff --git a/.history/eohi1/mixed anova - domain means_20250912155158.r b/.history/eohi1/mixed anova - domain means_20250912155158.r deleted file mode 100644 index 49e13c0..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912155158.r +++ /dev/null @@ -1,246 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- 
subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
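# The IQR summary below counts outliers per cell but does not say which rows
# they are; a grouped mutate() flags them for inspection. A sketch, assuming
# long_data_clean as built above; outlier_rows is an illustrative name:
outlier_rows <- long_data_clean %>%
  group_by(TIME, DOMAIN) %>%
  mutate(
    q1  = quantile(MEAN_DIFFERENCE, 0.25),
    q3  = quantile(MEAN_DIFFERENCE, 0.75),
    iqr = q3 - q1,
    is_outlier = MEAN_DIFFERENCE < q1 - 1.5 * iqr |
      MEAN_DIFFERENCE > q3 + 1.5 * iqr
  ) %>%
  ungroup() %>%
  filter(is_outlier) %>%
  select(pID, TIME, DOMAIN, MEAN_DIFFERENCE)
print(outlier_rows)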
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - round(ad.test(MEAN_DIFFERENCE)$p.value, 20), - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - diff --git a/.history/eohi1/mixed anova - domain means_20250912155205.r b/.history/eohi1/mixed anova - domain means_20250912155205.r deleted file mode 100644 index 49e13c0..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912155205.r +++ /dev/null @@ -1,246 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# 
Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = 
round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - round(ad.test(MEAN_DIFFERENCE)$p.value, 20), - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# 4. 
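# The model this long format is built for is a 2 (TIME) x 4 (DOMAIN) within by
# TEMPORAL_DO between mixed ANOVA; ez is loaded at the top of these scripts.
# A minimal sketch, assuming every participant has complete cells:
anova_res <- ezANOVA(
  data     = long_data_clean,
  dv       = MEAN_DIFFERENCE,
  wid      = pID,
  within   = .(TIME, DOMAIN),
  between  = .(TEMPORAL_DO),
  type     = 3,
  detailed = TRUE
)
print(anova_res)  # reports Mauchly's test and GG/HF corrections for DOMAIN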
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
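None of these snapshots reaches the ANOVA call itself within this hunk, even though every one of them loads ez at the top for exactly that. A minimal sketch of the 2 (TIME) x 4 (DOMAIN) within, TEMPORAL_DO between design the scripts set up, assuming every participant contributes all eight TIME x DOMAIN cells; with a four-level within factor, ezANOVA also reports Mauchly's sphericity test, the one assumption the section above does not check:

anova_results <- ezANOVA(
  data = long_data_clean,
  dv = .(MEAN_DIFFERENCE),
  wid = .(pID),
  within = .(TIME, DOMAIN),
  between = .(TEMPORAL_DO),
  type = 3,
  detailed = TRUE,
  return_aov = TRUE   # keeps a standard aov object for follow-up contrasts
)
print(anova_results)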
diff --git a/.history/eohi1/mixed anova - domain means_20250912155217.r b/.history/eohi1/mixed anova - domain means_20250912155217.r
deleted file mode 100644
index 067fbb0..0000000
--- a/.history/eohi1/mixed anova - domain means_20250912155217.r
+++ /dev/null
diff --git a/.history/eohi1/mixed anova - domain means_20250912155223.r b/.history/eohi1/mixed anova - domain means_20250912155223.r
deleted file mode 100644
index 067fbb0..0000000
--- a/.history/eohi1/mixed anova - domain means_20250912155223.r
+++ /dev/null
diff --git a/.history/eohi1/mixed anova - domain means_20250912155224.r b/.history/eohi1/mixed anova - domain means_20250912155224.r
deleted file mode 100644
index 067fbb0..0000000
--- a/.history/eohi1/mixed anova - domain means_20250912155224.r
+++ /dev/null
diff --git a/.history/eohi1/mixed anova - domain means_20250912155246.r b/.history/eohi1/mixed anova - domain means_20250912155246.r
deleted file mode 100644
index 7a33961..0000000
--- a/.history/eohi1/mixed anova - domain means_20250912155246.r
+++ /dev/null
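The leveneTest() calls inside summarise() in the scripts above rely on car finding MEAN_DIFFERENCE and the grouping factor through dplyr's data mask, which is fragile across car/dplyr versions. A plainer per-subgroup form with an explicit data argument computes the same statistics, assuming the same long_data_clean as above:

for (d in levels(long_data_clean$DOMAIN)) {
  sub <- subset(long_data_clean, DOMAIN == d)
  cat("DOMAIN:", d, "\n")
  print(car::leveneTest(MEAN_DIFFERENCE ~ TIME, data = sub))
}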
diff --git a/.history/eohi1/mixed anova - domain means_20250912155247.r b/.history/eohi1/mixed anova - domain means_20250912155247.r
deleted file mode 100644
index 7a33961..0000000
--- a/.history/eohi1/mixed anova - domain means_20250912155247.r
+++ /dev/null
diff --git a/.history/eohi1/mixed anova - domain means_20250912155253.r b/.history/eohi1/mixed anova - domain means_20250912155253.r
deleted file mode 100644
index 7a33961..0000000
--- a/.history/eohi1/mixed anova - domain means_20250912155253.r
+++ /dev/null
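The later snapshots in this run grow a debug section that prints the full Anderson-Darling result for each TIME x DOMAIN cell rather than only the tabled statistic. Condensed into one loop with the same logic (the guard is lifted to n > 7, since ad.test() stops for smaller samples):

unique_combos <- dplyr::distinct(long_data_clean, TIME, DOMAIN)
for (i in seq_len(nrow(unique_combos))) {
  cell <- subset(long_data_clean,
                 TIME == unique_combos$TIME[i] & DOMAIN == unique_combos$DOMAIN[i])
  cat("TIME:", as.character(unique_combos$TIME[i]),
      "DOMAIN:", as.character(unique_combos$DOMAIN[i]),
      "n =", nrow(cell), "\n")
  if (nrow(cell) > 7) {
    print(nortest::ad.test(cell$MEAN_DIFFERENCE))
  } else {
    cat("Sample size too small for Anderson-Darling test\n")
  }
  cat("\n")
}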
subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - round(ad.test(MEAN_DIFFERENCE)$p.value, 20), - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - diff --git a/.history/eohi1/mixed anova - domain means_20250912155329.r b/.history/eohi1/mixed anova - domain means_20250912155329.r deleted file mode 100644 index 4c7c829..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912155329.r +++ /dev/null @@ -1,275 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- 
subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
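# --- Ahead of the homogeneity block below: the grouped leveneTest() calls
# reach MEAN_DIFFERENCE and TIME through dplyr's data mask; outside a pipe the
# data frame must be passed explicitly. A standalone sketch on toy data,
# assuming the car package. Note that car::leveneTest() defaults to
# center = median (the Brown-Forsythe variant), which is therefore what the
# script's calls compute; center = mean gives the classical Levene statistic.
library(car)
set.seed(2)
demo <- data.frame(
  TIME            = factor(rep(c("Past", "Future"), each = 40)),
  MEAN_DIFFERENCE = c(rnorm(40, sd = 1), rnorm(40, sd = 2))
)
leveneTest(MEAN_DIFFERENCE ~ TIME, data = demo)                 # Brown-Forsythe
leveneTest(MEAN_DIFFERENCE ~ TIME, data = demo, center = mean)  # classical Levene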
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - diff --git a/.history/eohi1/mixed anova - domain means_20250912155333.r b/.history/eohi1/mixed anova - domain means_20250912155333.r deleted file mode 100644 index 5a459a3..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912155333.r +++ /dev/null @@ -1,279 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- 
subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
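# --- Stepping back to the reshape step above before check 2 continues: the
# row-binding loop in pivot_domain_means() can be written declaratively with
# tidyr::pivot_longer(). A hedged sketch on toy columns named like the
# script's variables (only the pref/pers suffixes shown); DOMAIN keeps the raw
# suffixes, so recode them to the full labels as needed.
library(dplyr)
library(tidyr)
wide <- data.frame(
  pID             = 1:3,
  NPast_mean_pref = rnorm(3), NPast_mean_pers = rnorm(3),
  NFut_mean_pref  = rnorm(3), NFut_mean_pers  = rnorm(3)
)
long <- wide %>%
  pivot_longer(
    cols          = matches("^N(Past|Fut)_mean_"),
    names_pattern = "N(Past|Fut)_mean_(.*)",
    names_to      = c("TIME", "DOMAIN"),
    values_to     = "MEAN_DIFFERENCE"
  ) %>%
  mutate(TIME = factor(if_else(TIME == "Fut", "Future", "Past"),
                       levels = c("Past", "Future")))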
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
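# --- Two notes on the normality block above. First, ifelse() only works there
# because each summarise() group collapses n to a scalar; a plain if () guard
# reads clearer. Second, nortest::ad.test() requires more than 7 observations,
# so the n >= 7 guard above would still error on a cell of exactly 7; n >= 8
# is the safe bound. A self-contained sketch of the guarded pattern on toy
# groups:
library(nortest)
set.seed(3)
groups <- split(rnorm(300), rep(c("a", "b", "c"), each = 100))
norm_tests <- lapply(groups, function(x) {
  data.frame(
    n         = length(x),
    shapiro_p = if (length(x) >= 3 && length(x) <= 5000)
                  shapiro.test(x)$p.value else NA_real_,
    ad_p      = if (length(x) >= 8) ad.test(x)$p.value else NA_real_
  )
})
do.call(rbind, norm_tests)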
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - diff --git a/.history/eohi1/mixed anova - domain means_20250912155338.r b/.history/eohi1/mixed anova - domain means_20250912155338.r deleted file mode 100644 index 5a459a3..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912155338.r +++ /dev/null @@ -1,279 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- 
subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
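# --- Before the homogeneity check repeats below: the assumption tests in this
# script feed a 2 (TIME) x 4 (DOMAIN) within-subjects design with TEMPORAL_DO
# between subjects, but the ANOVA call itself is not part of this excerpt.
# Purely as a hedged illustration of what such a call could look like with the
# ez package (loaded at the top of the script), on a balanced toy dataset
# whose names mirror the script's:
library(ez)
set.seed(4)
demo <- expand.grid(
  pID    = factor(1:20),
  TIME   = factor(c("Past", "Future")),
  DOMAIN = factor(c("Preferences", "Personality", "Values", "Life"))
)
demo$TEMPORAL_DO     <- factor(ifelse(as.integer(demo$pID) %% 2 == 0,
                                      "PastFirst", "FutureFirst"))
demo$MEAN_DIFFERENCE <- rnorm(nrow(demo))
ezANOVA(
  data     = demo,
  dv       = MEAN_DIFFERENCE,
  wid      = pID,
  within   = .(TIME, DOMAIN),
  between  = TEMPORAL_DO,
  type     = 3,
  detailed = TRUE
)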
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - diff --git a/.history/eohi1/mixed anova - domain means_20250912155352.r b/.history/eohi1/mixed anova - domain means_20250912155352.r deleted file mode 100644 index 5a459a3..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912155352.r +++ /dev/null @@ -1,279 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- 
subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
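# --- emmeans is loaded at the top of the script for post-hoc comparisons, but
# no post-hoc step appears in this excerpt. A hedged sketch of pairwise domain
# contrasts within each time point, fitted with a repeated-measures aov() on
# balanced toy data (emmeans supports aovlist objects, with the usual caveat
# that the design be balanced):
library(emmeans)
set.seed(5)
demo <- expand.grid(
  pID    = factor(1:20),
  TIME   = factor(c("Past", "Future")),
  DOMAIN = factor(c("Preferences", "Personality", "Values", "Life"))
)
demo$MEAN_DIFFERENCE <- rnorm(nrow(demo))
fit <- aov(MEAN_DIFFERENCE ~ TIME * DOMAIN + Error(pID / (TIME * DOMAIN)),
           data = demo)
emmeans(fit, pairwise ~ DOMAIN | TIME, adjust = "bonferroni")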
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - diff --git a/.history/eohi1/mixed anova - domain means_20250912155830.r b/.history/eohi1/mixed anova - domain means_20250912155830.r deleted file mode 100644 index 454c709..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912155830.r +++ /dev/null @@ -1,367 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- 
subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
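# --- Check 4 below is Levene's test; two base-R complements, sketched on toy
# data with a real variance difference. Bartlett's test is more powerful when
# the data are normal but is sensitive to heavy tails; the Fligner-Killeen
# test is rank-based and more robust, which is relevant given the strong
# non-normality reported above.
set.seed(6)
demo <- data.frame(
  g = factor(rep(letters[1:4], each = 30)),
  y = rnorm(120, sd = rep(c(1, 1, 2, 2), each = 30))
)
bartlett.test(y ~ g, data = demo)
fligner.test(y ~ g, data = demo)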
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Pool all data and resample to create equal variance scenario - pooled_data <- data[[response_var]] - pooled_mean <- mean(pooled_data, na.rm = TRUE) - pooled_var <- var(pooled_data, na.rm = TRUE) - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Generate samples from normal distribution with pooled variance - sample_vars <- map_dbl(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - n_group <- length(group_data) - var(rnorm(n = n_group, mean = pooled_mean, sd = sqrt(pooled_var))) - }) - calculate_hartley_ratio(sample_vars) - }) - - # Calculate critical value (95th percentile) - critical_95 <- quantile(bootstrap_ratios, 0.95) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Test 1: Hartley's F-max across TIME within each DOMAIN -print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===") -set.seed(123) # For reproducibility - -hartley_time_results <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "TIME", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_time_results) - -# Test 2: Hartley's F-max across DOMAIN within each TIME -print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===") - -hartley_domain_results <- long_data_clean %>% - group_by(TIME) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "DOMAIN", 
"MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, observed_ratio, critical_95, significant) - -print(hartley_domain_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912155832.r b/.history/eohi1/mixed anova - domain means_20250912155832.r deleted file mode 100644 index 454c709..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912155832.r +++ /dev/null @@ -1,367 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- 
as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Pool all data and resample to create equal variance scenario - pooled_data <- data[[response_var]] - pooled_mean <- mean(pooled_data, na.rm = 
TRUE) - pooled_var <- var(pooled_data, na.rm = TRUE) - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Generate samples from normal distribution with pooled variance - sample_vars <- map_dbl(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - n_group <- length(group_data) - var(rnorm(n = n_group, mean = pooled_mean, sd = sqrt(pooled_var))) - }) - calculate_hartley_ratio(sample_vars) - }) - - # Calculate critical value (95th percentile) - critical_95 <- quantile(bootstrap_ratios, 0.95) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Test 1: Hartley's F-max across TIME within each DOMAIN -print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===") -set.seed(123) # For reproducibility - -hartley_time_results <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "TIME", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_time_results) - -# Test 2: Hartley's F-max across DOMAIN within each TIME -print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===") - -hartley_domain_results <- long_data_clean %>% - group_by(TIME) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "DOMAIN", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, observed_ratio, critical_95, significant) - -print(hartley_domain_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912155837.r b/.history/eohi1/mixed anova - domain means_20250912155837.r deleted file mode 100644 index 454c709..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912155837.r +++ /dev/null @@ -1,367 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", 
paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - 
summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Pool all data and resample to create equal variance scenario - pooled_data <- data[[response_var]] - pooled_mean <- mean(pooled_data, na.rm = TRUE) - pooled_var <- var(pooled_data, na.rm = TRUE) - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Generate samples from normal distribution with pooled variance - sample_vars <- map_dbl(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - n_group <- length(group_data) - var(rnorm(n = n_group, mean = pooled_mean, sd = sqrt(pooled_var))) - }) - calculate_hartley_ratio(sample_vars) - }) - - # Calculate critical value (95th percentile) - critical_95 <- quantile(bootstrap_ratios, 0.95) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Test 1: Hartley's F-max across TIME within each DOMAIN -print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===") -set.seed(123) # For reproducibility - -hartley_time_results <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "TIME", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_time_results) - -# Test 2: Hartley's F-max across DOMAIN within each TIME -print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===") - -hartley_domain_results <- long_data_clean %>% - group_by(TIME) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "DOMAIN", 
"MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, observed_ratio, critical_95, significant) - -print(hartley_domain_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912155915.r b/.history/eohi1/mixed anova - domain means_20250912155915.r deleted file mode 100644 index bbb4b9d..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912155915.r +++ /dev/null @@ -1,374 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- 
as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Pool all data and resample to create equal variance scenario - pooled_data <- data[[response_var]] - pooled_mean <- mean(pooled_data, na.rm = 
TRUE) - pooled_var <- var(pooled_data, na.rm = TRUE) - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Generate samples from normal distribution with pooled variance - sample_vars <- map_dbl(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - n_group <- length(group_data) - var(rnorm(n = n_group, mean = pooled_mean, sd = sqrt(pooled_var))) - }) - calculate_hartley_ratio(sample_vars) - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Test 1: Hartley's F-max across TIME within each DOMAIN -print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===") -set.seed(123) # For reproducibility - -hartley_time_results <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "TIME", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_time_results) - -# Test 2: Hartley's F-max across DOMAIN within each TIME -print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===") - -hartley_domain_results <- long_data_clean %>% - group_by(TIME) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "DOMAIN", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, observed_ratio, critical_95, significant) - -print(hartley_domain_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912155919.r b/.history/eohi1/mixed anova - domain means_20250912155919.r deleted file mode 100644 index bbb4b9d..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912155919.r +++ /dev/null @@ -1,374 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - 
"NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) 
- -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Pool all data and resample to create equal variance scenario - pooled_data <- data[[response_var]] - pooled_mean <- mean(pooled_data, na.rm = TRUE) - pooled_var <- var(pooled_data, na.rm = TRUE) - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Generate samples from normal distribution with pooled variance - sample_vars <- map_dbl(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - n_group <- length(group_data) - var(rnorm(n = n_group, mean = pooled_mean, sd = sqrt(pooled_var))) - }) - calculate_hartley_ratio(sample_vars) - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Test 1: Hartley's F-max across TIME within each DOMAIN -print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===") -set.seed(123) # For reproducibility - -hartley_time_results <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "TIME", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_time_results) - -# Test 2: Hartley's F-max across DOMAIN within each TIME 
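# Note: cur_data(), used in the grouped summarise() calls in this script, was
# deprecated in dplyr 1.1.0 in favour of pick(). A minimal sketch of the same
# Test 2 call using pick(everything()), which likewise hands the current
# group's columns to the helper; the result name is chosen here to avoid
# clobbering the original:
hartley_domain_alt <- long_data_clean %>%
  group_by(TIME) %>%
  summarise(
    hartley_result = list(
      bootstrap_hartley_critical(pick(everything()), "DOMAIN", "MEAN_DIFFERENCE")
    ),
    .groups = "drop"
  ) %>%
  mutate(
    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
    critical_95    = map_dbl(hartley_result, ~ .x$critical_95),
    significant    = observed_ratio > critical_95
  ) %>%
  select(TIME, observed_ratio, critical_95, significant)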
-print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===") - -hartley_domain_results <- long_data_clean %>% - group_by(TIME) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "DOMAIN", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, observed_ratio, critical_95, significant) - -print(hartley_domain_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912155922.r b/.history/eohi1/mixed anova - domain means_20250912155922.r deleted file mode 100644 index bbb4b9d..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912155922.r +++ /dev/null @@ -1,374 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = 
c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Pool all data and resample to create equal variance scenario - pooled_data <- data[[response_var]] - pooled_mean <- mean(pooled_data, na.rm = TRUE) - pooled_var <- var(pooled_data, na.rm = TRUE) - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Generate samples from normal distribution with pooled variance - sample_vars <- map_dbl(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - n_group <- length(group_data) - var(rnorm(n = n_group, mean = pooled_mean, sd = sqrt(pooled_var))) - }) - calculate_hartley_ratio(sample_vars) - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Test 1: Hartley's F-max across TIME within each DOMAIN -print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===") -set.seed(123) # For reproducibility - -hartley_time_results <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "TIME", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_time_results) - -# Test 2: Hartley's F-max across DOMAIN within each TIME 
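# Note: the "bootstrap" in bootstrap_hartley_critical() is a parametric Monte
# Carlo null: group-sized samples are drawn from a single normal distribution
# with the pooled mean and variance. A label-permutation alternative that
# drops that normality assumption is sketched below; it is not the script's
# method, and it assumes the calculate_hartley_ratio() helper defined above.
# Each iteration shuffles group membership (so equal variances hold by
# construction) and recomputes the F-max ratio:
permute_hartley_critical <- function(data, group_var, response_var,
                                     n_iter = 1000, alpha = 0.05) {
  y <- data[[response_var]]
  g <- data[[group_var]]
  ok <- !is.na(y)   # drop missing responses before permuting
  y <- y[ok]
  g <- g[ok]
  perm_ratios <- replicate(n_iter, {
    g_perm <- sample(g)  # permute group labels under the null
    calculate_hartley_ratio(tapply(y, g_perm, var))
  })
  # critical value at the requested level, e.g. the 95th percentile
  quantile(perm_ratios, 1 - alpha, na.rm = TRUE)
}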
-print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===") - -hartley_domain_results <- long_data_clean %>% - group_by(TIME) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "DOMAIN", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, observed_ratio, critical_95, significant) - -print(hartley_domain_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912155944.r b/.history/eohi1/mixed anova - domain means_20250912155944.r deleted file mode 100644 index 5c9b71c..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912155944.r +++ /dev/null @@ -1,378 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = 
c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Pool all data and resample to create equal variance scenario - pooled_data <- data[[response_var]] - pooled_mean <- mean(pooled_data, na.rm = TRUE) - pooled_var <- var(pooled_data, na.rm = TRUE) - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Generate samples from normal distribution with pooled variance - sample_vars <- map_dbl(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - n_group <- length(group_data) - # Ensure we have a valid variance (avoid zero variance) - sample_var <- var(rnorm(n = n_group, mean = pooled_mean, sd = sqrt(pooled_var))) - # If variance is zero or very small, add small epsilon - if(sample_var <= 0) sample_var <- 1e-10 - sample_var - }) - calculate_hartley_ratio(sample_vars) - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Test 1: Hartley's F-max across TIME within each DOMAIN -print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===") -set.seed(123) # For reproducibility - -hartley_time_results <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "TIME", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = 
observed_ratio > critical_95 - ) %>% - select(DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_time_results) - -# Test 2: Hartley's F-max across DOMAIN within each TIME -print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===") - -hartley_domain_results <- long_data_clean %>% - group_by(TIME) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "DOMAIN", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, observed_ratio, critical_95, significant) - -print(hartley_domain_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912155948.r b/.history/eohi1/mixed anova - domain means_20250912155948.r deleted file mode 100644 index a26930b..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912155948.r +++ /dev/null @@ -1,384 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # 
Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
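
A note on the outlier step that follows in these deleted snapshots: it flags values outside Tukey's fences, i.e. below Q1 - 1.5*IQR or above Q3 + 1.5*IQR. A standalone sketch of the same rule (the helper name is ours, not part of the snapshots):

    # 1.5*IQR (Tukey fence) rule, as used in the outlier summary below
    iqr_outliers <- function(x) {
      q <- quantile(x, c(0.25, 0.75), na.rm = TRUE)
      fence <- 1.5 * (q[2] - q[1])
      x < q[1] - fence | x > q[2] + fence   # TRUE where x is an outlier
    }
    sum(iqr_outliers(c(1, 2, 2, 3, 50)))    # 1 -- only the value 50 is flagged
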
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
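
A note on the homogeneity step that follows: leveneTest() is called inside summarise() with no data argument, so it finds MEAN_DIFFERENCE and TIME through dplyr's data mask, and it runs twice per group (once for the F value, once for the p value). A hedged sketch that passes data explicitly and runs each test once, using only packages the snapshots already load (the helper name is ours):

    # One leveneTest() call per DOMAIN, with an explicit data argument
    levene_by_domain <- function(df) {
      df %>%
        group_by(DOMAIN) %>%
        group_modify(~ {
          lt <- leveneTest(MEAN_DIFFERENCE ~ TIME, data = .x)  # single call, reused
          tibble(levene_F = lt$`F value`[1], levene_p = lt$`Pr(>F)`[1])
        }) %>%
        ungroup()
    }
    # usage: levene_by_domain(long_data_clean)
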
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Pool all data and resample to create equal variance scenario - pooled_data <- data[[response_var]] - pooled_mean <- mean(pooled_data, na.rm = TRUE) - pooled_var <- var(pooled_data, na.rm = TRUE) - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Generate samples from normal distribution with pooled variance - sample_vars <- map_dbl(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - n_group <- length(group_data) - # Ensure we have a valid variance (avoid zero variance) - sample_var <- var(rnorm(n = n_group, mean = pooled_mean, sd = sqrt(pooled_var))) - # If variance is zero or very small, add small epsilon - if(sample_var <= 0) sample_var <- 1e-10 - sample_var - }) - calculate_hartley_ratio(sample_vars) - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Test 1: Hartley's F-max across TIME within each DOMAIN -print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===") -set.seed(123) # For reproducibility - -hartley_time_results <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - hartley_result = 
list(bootstrap_hartley_critical(cur_data(), "TIME", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_time_results) - -# Test 2: Hartley's F-max across DOMAIN within each TIME -print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===") - -hartley_domain_results <- long_data_clean %>% - group_by(TIME) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "DOMAIN", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, observed_ratio, critical_95, significant) - -print(hartley_domain_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912155953.r b/.history/eohi1/mixed anova - domain means_20250912155953.r deleted file mode 100644 index a26930b..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912155953.r +++ /dev/null @@ -1,384 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this 
variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. 
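
A note on the pivot helper above: the rbind() loop can be collapsed into a single tidyr::pivot_longer() call. A hedged equivalent, assuming the same NPast_/NFut_ column naming as in these snapshots (the factor conversion of pID and TEMPORAL_DO from the loop version is omitted for brevity):

    # One-step long-format pivot, roughly equivalent to pivot_domain_means()
    long_data <- data %>%
      select(pID, ResponseId, TEMPORAL_DO, matches("^N(Past|Fut)_mean_")) %>%
      pivot_longer(
        matches("^N(Past|Fut)_mean_"),
        names_pattern = "N(Past|Fut)_mean_(.*)",
        names_to = c("TIME", "DOMAIN"),
        values_to = "MEAN_DIFFERENCE"
      ) %>%
      mutate(
        TIME = factor(if_else(TIME == "Fut", "Future", "Past"),
                      levels = c("Past", "Future")),
        DOMAIN = factor(recode(DOMAIN, pref = "Preferences", pers = "Personality",
                               val = "Values", life = "Life"),
                        levels = c("Preferences", "Personality", "Values", "Life"))
      )
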
Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
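
A note on the normality step above: R's shapiro.test() is defined only for 3 <= n <= 5000, which the guard matches, but nortest::ad.test() stops with "sample size must be greater than 7" when n < 8, so the n >= 7 guard still admits one failing case (n = 7). If we read the nortest implementation correctly, the safer guard would be:

    # ad.test() needs n >= 8, not n >= 7
    anderson_A = ifelse(n >= 8, ad.test(MEAN_DIFFERENCE)$statistic, NA),
    anderson_p = ifelse(n >= 8, ad.test(MEAN_DIFFERENCE)$p.value, NA),
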
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Pool all data and resample to create equal variance scenario - pooled_data <- data[[response_var]] - pooled_mean <- mean(pooled_data, na.rm = TRUE) - pooled_var <- var(pooled_data, na.rm = TRUE) - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Generate samples from normal distribution with pooled variance - sample_vars <- map_dbl(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - n_group <- length(group_data) - # Ensure we have a valid variance (avoid zero variance) - sample_var <- var(rnorm(n = n_group, mean = pooled_mean, sd = sqrt(pooled_var))) - # If variance is zero or very small, add small epsilon - if(sample_var <= 0) sample_var <- 1e-10 - sample_var - }) - calculate_hartley_ratio(sample_vars) - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Test 1: Hartley's F-max across TIME within each DOMAIN -print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===") -set.seed(123) # For reproducibility - -hartley_time_results <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - hartley_result = 
list(bootstrap_hartley_critical(cur_data(), "TIME", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_time_results) - -# Test 2: Hartley's F-max across DOMAIN within each TIME -print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===") - -hartley_domain_results <- long_data_clean %>% - group_by(TIME) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "DOMAIN", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, observed_ratio, critical_95, significant) - -print(hartley_domain_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912160029.r b/.history/eohi1/mixed anova - domain means_20250912160029.r deleted file mode 100644 index df6b53c..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912160029.r +++ /dev/null @@ -1,380 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this 
variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. 
Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
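
A note on the Hartley's F-max section that follows: instead of fixed table lookups, it simulates the null hypothesis of equal variances and takes the 95th percentile of the simulated max/min variance ratios as the critical value. A minimal self-contained illustration of that idea (the group sizes here are hypothetical):

    # Null distribution of F-max under equal variances, by simulation
    set.seed(1)
    n_per_group <- c(50, 50, 50, 50)
    ratios <- replicate(1000, {
      vars <- sapply(n_per_group, function(n) var(rnorm(n)))
      max(vars) / min(vars)
    })
    quantile(ratios, 0.95)   # observed ratios above this are flagged
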
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Resample from actual data to create equal variance scenario - pooled_data <- data[[response_var]] - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Resample from pooled data for each group - sample_vars <- map_dbl(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - n_group <- length(group_data) - # Bootstrap sample from pooled data - bootstrap_sample <- sample(pooled_data, size = n_group, replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - calculate_hartley_ratio(sample_vars) - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Test 1: Hartley's F-max across TIME within each DOMAIN -print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===") -set.seed(123) # For reproducibility - -hartley_time_results <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "TIME", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ 
.x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_time_results) - -# Test 2: Hartley's F-max across DOMAIN within each TIME -print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===") - -hartley_domain_results <- long_data_clean %>% - group_by(TIME) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "DOMAIN", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, observed_ratio, critical_95, significant) - -print(hartley_domain_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912160033.r b/.history/eohi1/mixed anova - domain means_20250912160033.r deleted file mode 100644 index df6b53c..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912160033.r +++ /dev/null @@ -1,380 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - 
subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
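
A note on the null-generation change visible across these snapshots: the earlier versions draw each group's null sample from rnorm() with the pooled mean and variance (a parametric, normal-theory null), while the versions from 20250912160029 onward resample the pooled data with replacement, which preserves the shape of the pooled distribution; that matters here because the normality tests above reject strongly. The two strategies side by side (the pooled data below is a deliberately skewed stand-in, not the study data):

    # Parametric vs. resampling draw of one group's null variance
    pooled  <- rexp(200)
    n_group <- 50
    var(rnorm(n_group, mean(pooled), sd(pooled)))         # normal-theory null
    var(sample(pooled, size = n_group, replace = TRUE))   # resampling null
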
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Resample from actual data to create equal variance scenario - pooled_data <- data[[response_var]] - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Resample from pooled data for each group - sample_vars <- map_dbl(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - n_group <- length(group_data) - # Bootstrap sample from pooled data - bootstrap_sample <- sample(pooled_data, size = n_group, replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - calculate_hartley_ratio(sample_vars) - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Test 1: Hartley's F-max across TIME within each DOMAIN -print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===") -set.seed(123) # For reproducibility - -hartley_time_results <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "TIME", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ 
.x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_time_results) - -# Test 2: Hartley's F-max across DOMAIN within each TIME -print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===") - -hartley_domain_results <- long_data_clean %>% - group_by(TIME) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "DOMAIN", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, observed_ratio, critical_95, significant) - -print(hartley_domain_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912160053.r b/.history/eohi1/mixed anova - domain means_20250912160053.r deleted file mode 100644 index df6b53c..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912160053.r +++ /dev/null @@ -1,380 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - 
subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
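
A note on bootstrap_hartley_critical() in the section that follows: it receives cur_data(), which is a tibble, and single-bracket column selection on a tibble stays a data frame, so length(data[data[[group_var]] == .x, response_var]) counts columns (always 1), not rows. Each null sample then has size 1, its variance is NA, and the run can end in the "No valid bootstrap ratios generated" stop. A small demonstration plus the fix we would suggest:

    # Single-bracket tibble subsetting keeps a data frame
    tb <- tibble(g = c("a", "a", "b"), y = c(1, 2, 3))
    length(tb[tb$g == "a", "y"])   # 1 -- a one-column tibble, not the group size
    length(tb$y[tb$g == "a"])      # 2 -- the actual group size
    # inside the helper, a safer line would be:
    # group_data <- data[[response_var]][data[[group_var]] == .x]
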
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Resample from actual data to create equal variance scenario - pooled_data <- data[[response_var]] - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Resample from pooled data for each group - sample_vars <- map_dbl(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - n_group <- length(group_data) - # Bootstrap sample from pooled data - bootstrap_sample <- sample(pooled_data, size = n_group, replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - calculate_hartley_ratio(sample_vars) - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Test 1: Hartley's F-max across TIME within each DOMAIN -print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===") -set.seed(123) # For reproducibility - -hartley_time_results <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "TIME", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ 
.x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_time_results) - -# Test 2: Hartley's F-max across DOMAIN within each TIME -print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===") - -hartley_domain_results <- long_data_clean %>% - group_by(TIME) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "DOMAIN", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, observed_ratio, critical_95, significant) - -print(hartley_domain_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912160118.r b/.history/eohi1/mixed anova - domain means_20250912160118.r deleted file mode 100644 index 42e21be..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912160118.r +++ /dev/null @@ -1,387 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - 
subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
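# The pivot_domain_means() loop above grows long_data with rbind() once per
# variable; tidyr::pivot_longer() expresses the same reshape in one call. A
# sketch assuming the N{Past|Fut}_mean_{pref|pers|val|life} naming verified
# above; long_data2 is an illustrative name, and the regex and recoding simply
# mirror the domain_mapping table.
long_data2 <- data %>%
  select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>%
  pivot_longer(
    all_of(required_vars),
    names_pattern = "N(Past|Fut)_mean_(pref|pers|val|life)",
    names_to = c("TIME", "DOMAIN"),
    values_to = "MEAN_DIFFERENCE"
  ) %>%
  mutate(
    TIME   = factor(if_else(TIME == "Past", "Past", "Future"),
                    levels = c("Past", "Future")),
    DOMAIN = factor(recode(DOMAIN, pref = "Preferences", pers = "Personality",
                           val = "Values", life = "Life"),
                    levels = c("Preferences", "Personality", "Values", "Life")),
    pID = as.factor(pID),
    TEMPORAL_DO = as.factor(TEMPORAL_DO)
  )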
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
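# Making the machine-precision note above concrete: R's conventional display
# floor for p-values is .Machine$double.eps (about 2.2e-16), which is where
# the familiar "< 2.2e-16" label comes from, so cells that all fail normality
# decisively can print indistinguishable p-values even though their A
# statistics differ. A small labelling sketch over the normality_results table
# built above; normality_labeled is an illustrative name.
print(.Machine$double.eps)  # 2.220446e-16
normality_labeled <- normality_results %>%
  mutate(
    anderson_p_label = if_else(
      !is.na(anderson_p) & anderson_p < .Machine$double.eps,
      "< 2.2e-16",
      format.pval(anderson_p, digits = 3)
    )
  )
print(normality_labeled)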
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Resample from actual data to create equal variance scenario - pooled_data <- data[[response_var]] - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Resample from pooled data for each group - sample_vars <- map_dbl(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - n_group <- length(group_data) - # Bootstrap sample from pooled data - bootstrap_sample <- sample(pooled_data, size = n_group, replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - calculate_hartley_ratio(sample_vars) - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - # Debug: Check what's happening - cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n") - cat("Valid bootstrap ratios:", length(valid_ratios), "\n") - cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n") - cat("Sample of valid ratios:", head(valid_ratios, 5), "\n") - - if(length(valid_ratios) == 0) { - cat("All bootstrap ratios were invalid. 
Sample ratios:", head(bootstrap_ratios, 10), "\n") - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Test 1: Hartley's F-max across TIME within each DOMAIN -print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===") -set.seed(123) # For reproducibility - -hartley_time_results <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "TIME", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_time_results) - -# Test 2: Hartley's F-max across DOMAIN within each TIME -print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===") - -hartley_domain_results <- long_data_clean %>% - group_by(TIME) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "DOMAIN", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, observed_ratio, critical_95, significant) - -print(hartley_domain_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912160125.r b/.history/eohi1/mixed anova - domain means_20250912160125.r deleted file mode 100644 index 4cfd45f..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912160125.r +++ /dev/null @@ -1,398 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", 
"Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# 
ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
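# One default worth spelling out before the homogeneity checks that follow:
# car::leveneTest() centers on the group medians by default, i.e. the
# Brown-Forsythe variant, which is the more robust choice given the
# non-normality flagged above; center = mean gives the classical Levene test.
# A sketch restating that choice explicitly, on the same data and factors.
leveneTest(MEAN_DIFFERENCE ~ TIME, data = long_data_clean, center = median)
leveneTest(MEAN_DIFFERENCE ~ TIME, data = long_data_clean, center = mean)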
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Resample from actual data to create equal variance scenario - pooled_data <- data[[response_var]] - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Resample from pooled data for each group - sample_vars <- map_dbl(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - n_group <- length(group_data) - # Bootstrap sample from pooled data - bootstrap_sample <- sample(pooled_data, size = n_group, replace = TRUE) - sample_var <- var(bootstrap_sample, na.rm = TRUE) - # Debug: Check for zero variance - if(sample_var == 0) { - cat("Warning: Zero variance detected in bootstrap sample\n") - sample_var <- 1e-10 - } - sample_var - }) - hartley_ratio <- calculate_hartley_ratio(sample_vars) - # Debug: Check for invalid ratios - if(!is.finite(hartley_ratio)) { - cat("Warning: Invalid Hartley ratio:", hartley_ratio, "Sample vars:", sample_vars, "\n") - } - hartley_ratio - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - # Debug: Check what's happening - cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n") - cat("Valid bootstrap ratios:", length(valid_ratios), "\n") - cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n") - cat("Sample of valid ratios:", head(valid_ratios, 5), "\n") - - if(length(valid_ratios) == 0) { - cat("All bootstrap ratios were invalid. 
Sample ratios:", head(bootstrap_ratios, 10), "\n") - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Test 1: Hartley's F-max across TIME within each DOMAIN -print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===") -set.seed(123) # For reproducibility - -hartley_time_results <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "TIME", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_time_results) - -# Test 2: Hartley's F-max across DOMAIN within each TIME -print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===") - -hartley_domain_results <- long_data_clean %>% - group_by(TIME) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "DOMAIN", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, observed_ratio, critical_95, significant) - -print(hartley_domain_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912160131.r b/.history/eohi1/mixed anova - domain means_20250912160131.r deleted file mode 100644 index 4cfd45f..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912160131.r +++ /dev/null @@ -1,398 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", 
"Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# 
ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Resample from actual data to create equal variance scenario - pooled_data <- data[[response_var]] - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Resample from pooled data for each group - sample_vars <- map_dbl(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - n_group <- length(group_data) - # Bootstrap sample from pooled data - bootstrap_sample <- sample(pooled_data, size = n_group, replace = TRUE) - sample_var <- var(bootstrap_sample, na.rm = TRUE) - # Debug: Check for zero variance - if(sample_var == 0) { - cat("Warning: Zero variance detected in bootstrap sample\n") - sample_var <- 1e-10 - } - sample_var - }) - hartley_ratio <- calculate_hartley_ratio(sample_vars) - # Debug: Check for invalid ratios - if(!is.finite(hartley_ratio)) { - cat("Warning: Invalid Hartley ratio:", hartley_ratio, "Sample vars:", sample_vars, "\n") - } - hartley_ratio - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - # Debug: Check what's happening - cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n") - cat("Valid bootstrap ratios:", length(valid_ratios), "\n") - cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n") - cat("Sample of valid ratios:", head(valid_ratios, 5), "\n") - - if(length(valid_ratios) == 0) { - cat("All bootstrap ratios were invalid. 
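# A version note on the grouped bootstrap calls in these snapshots: cur_data()
# was deprecated in dplyr 1.1.0 in favour of pick(), so the
# bootstrap_hartley_critical(cur_data(), ...) calls warn on current dplyr. A
# sketch of the TIME-within-DOMAIN test with pick(); behaviour is otherwise
# unchanged. Note also that set.seed(123) is set once up front, so the
# bootstrap draws depend on the order in which the groups are processed.
hartley_time_results <- long_data_clean %>%
  group_by(DOMAIN) %>%
  summarise(
    hartley_result = list(
      bootstrap_hartley_critical(pick(everything()), "TIME", "MEAN_DIFFERENCE")
    ),
    .groups = "drop"
  ) %>%
  mutate(
    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
    critical_95    = map_dbl(hartley_result, ~ .x$critical_95),
    significant    = observed_ratio > critical_95
  ) %>%
  select(DOMAIN, observed_ratio, critical_95, significant)
print(hartley_time_results)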
Sample ratios:", head(bootstrap_ratios, 10), "\n") - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Test 1: Hartley's F-max across TIME within each DOMAIN -print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===") -set.seed(123) # For reproducibility - -hartley_time_results <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "TIME", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_time_results) - -# Test 2: Hartley's F-max across DOMAIN within each TIME -print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===") - -hartley_domain_results <- long_data_clean %>% - group_by(TIME) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "DOMAIN", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, observed_ratio, critical_95, significant) - -print(hartley_domain_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912160133.r b/.history/eohi1/mixed anova - domain means_20250912160133.r deleted file mode 100644 index 4cfd45f..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912160133.r +++ /dev/null @@ -1,398 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", 
"Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# 
ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Resample from actual data to create equal variance scenario - pooled_data <- data[[response_var]] - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Resample from pooled data for each group - sample_vars <- map_dbl(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - n_group <- length(group_data) - # Bootstrap sample from pooled data - bootstrap_sample <- sample(pooled_data, size = n_group, replace = TRUE) - sample_var <- var(bootstrap_sample, na.rm = TRUE) - # Debug: Check for zero variance - if(sample_var == 0) { - cat("Warning: Zero variance detected in bootstrap sample\n") - sample_var <- 1e-10 - } - sample_var - }) - hartley_ratio <- calculate_hartley_ratio(sample_vars) - # Debug: Check for invalid ratios - if(!is.finite(hartley_ratio)) { - cat("Warning: Invalid Hartley ratio:", hartley_ratio, "Sample vars:", sample_vars, "\n") - } - hartley_ratio - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - # Debug: Check what's happening - cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n") - cat("Valid bootstrap ratios:", length(valid_ratios), "\n") - cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n") - cat("Sample of valid ratios:", head(valid_ratios, 5), "\n") - - if(length(valid_ratios) == 0) { - cat("All bootstrap ratios were invalid. 
Sample ratios:", head(bootstrap_ratios, 10), "\n") - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Test 1: Hartley's F-max across TIME within each DOMAIN -print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===") -set.seed(123) # For reproducibility - -hartley_time_results <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "TIME", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_time_results) - -# Test 2: Hartley's F-max across DOMAIN within each TIME -print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===") - -hartley_domain_results <- long_data_clean %>% - group_by(TIME) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "DOMAIN", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, observed_ratio, critical_95, significant) - -print(hartley_domain_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912160206.r b/.history/eohi1/mixed anova - domain means_20250912160206.r deleted file mode 100644 index db23141..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912160206.r +++ /dev/null @@ -1,438 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", 
"Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# 
ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
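# Two small fixes for the debug loop above, plus a sturdier form of the Levene
# block that follows. First, print() shows the "\n" escape literally rather
# than as a newline, and cat() prints factors as their underlying integer
# codes, so the TIME/DOMAIN labels come out as numbers; cat() for the banner
# and as.character() inside the loop restore the intended output:
cat("\n=== DEBUG: Anderson-Darling Test Details ===\n")
# ...and, inside the for-loop, in place of the cat() call shown above:
cat("TIME:", as.character(time_val),
    "DOMAIN:", as.character(domain_val),
    "n =", nrow(subset_data), "\n")

# Second, the leveneTest() calls below resolve their formula through dplyr's
# data mask inside summarise(); a sketch that passes the data explicitly
# instead (homogeneity_time_explicit is a new name, not the script's own):
homogeneity_time_explicit <- long_data_clean %>%
  group_split(DOMAIN) %>%
  map_dfr(function(df) {
    lt <- leveneTest(MEAN_DIFFERENCE ~ TIME, data = df)
    tibble(DOMAIN = df$DOMAIN[1],
           levene_F = lt[["F value"]][1],
           levene_p = lt[["Pr(>F)"]][1])
  })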
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS -# ============================================================================= - -# Test 1: Observed F-max across TIME within each DOMAIN -print("=== OBSERVED F-MAX RATIOS: TIME within each DOMAIN ===") - -observed_time_ratios <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - # Calculate variances for each TIME level within this domain - past_var = var(MEAN_DIFFERENCE[TIME == "Past"], na.rm = TRUE), - future_var = var(MEAN_DIFFERENCE[TIME == "Future"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, future_var) / min(past_var, future_var), - .groups = 'drop' - ) %>% - select(DOMAIN, past_var, future_var, f_max_ratio) - -print(observed_time_ratios) - -# Test 2: Observed F-max across DOMAIN within each TIME -print("\n=== OBSERVED F-MAX RATIOS: DOMAIN within each TIME ===") - -observed_domain_ratios <- long_data_clean %>% - group_by(TIME) %>% - summarise( - # Calculate variances for each DOMAIN level within this time - pref_var = var(MEAN_DIFFERENCE[DOMAIN == "Preferences"], na.rm = TRUE), - pers_var = var(MEAN_DIFFERENCE[DOMAIN == "Personality"], na.rm = TRUE), - val_var = var(MEAN_DIFFERENCE[DOMAIN == "Values"], na.rm = TRUE), - life_var = var(MEAN_DIFFERENCE[DOMAIN == "Life"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(pref_var, pers_var, val_var, life_var) / min(pref_var, pers_var, val_var, life_var), - .groups = 'drop' - ) %>% - select(TIME, pref_var, pers_var, val_var, life_var, f_max_ratio) - -print(observed_domain_ratios) - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - 
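# A note on the function being defined here: Hartley's F-max statistic is
# simply the largest group variance over the smallest. For example, group
# variances of 0.8, 1.2 and 2.0 give F-max = 2.0 / 0.8 = 2.5. The classical
# critical-value tables for this test assume normal data and equal group
# sizes, both of which are shaky here given the normality results above,
# which is why this function bootstraps its own 95th-percentile cutoff from
# the pooled data instead of consulting a table. Separately, the cur_data()
# that the two test blocks further down pass into this function is deprecated
# as of dplyr 1.1.0; pick(everything()) is the documented drop-in replacement.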
-  observed_ratio <- calculate_hartley_ratio(observed_vars)
-  
-  # Bootstrap under null hypothesis (equal variances)
-  # Resample from actual data to create equal variance scenario
-  pooled_data <- data[[response_var]]
-  
-  # Bootstrap F-max ratios under null hypothesis
-  bootstrap_ratios <- replicate(n_iter, {
-    # Resample from pooled data for each group
-    sample_vars <- map_dbl(groups, ~ {
-      group_data <- data[data[[group_var]] == .x, response_var]
-      n_group <- length(group_data)
-      # Bootstrap sample from pooled data
-      bootstrap_sample <- sample(pooled_data, size = n_group, replace = TRUE)
-      sample_var <- var(bootstrap_sample, na.rm = TRUE)
-      # Debug: Check for zero variance
-      if(sample_var == 0) {
-        cat("Warning: Zero variance detected in bootstrap sample\n")
-        sample_var <- 1e-10
-      }
-      sample_var
-    })
-    hartley_ratio <- calculate_hartley_ratio(sample_vars)
-    # Debug: Check for invalid ratios
-    if(!is.finite(hartley_ratio)) {
-      cat("Warning: Invalid Hartley ratio:", hartley_ratio, "Sample vars:", sample_vars, "\n")
-    }
-    hartley_ratio
-  })
-  
-  # Calculate critical value (95th percentile)
-  # Remove any NaN or infinite values
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-  
-  # Debug: Check what's happening
-  cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n")
-  cat("Valid bootstrap ratios:", length(valid_ratios), "\n")
-  cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n")
-  cat("Sample of valid ratios:", head(valid_ratios, 5), "\n")
-  
-  if(length(valid_ratios) == 0) {
-    cat("All bootstrap ratios were invalid. Sample ratios:", head(bootstrap_ratios, 10), "\n")
-    stop("No valid bootstrap ratios generated")
-  }
-  
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-  
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    bootstrap_ratios = bootstrap_ratios
-  ))
-}
-
-# Test 1: Hartley's F-max across TIME within each DOMAIN
-print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===")
-set.seed(123) # For reproducibility
-
-hartley_time_results <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(cur_data(), "TIME", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_time_results)
-
-# Test 2: Hartley's F-max across DOMAIN within each TIME
-print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===")
-
-hartley_domain_results <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(cur_data(), "DOMAIN", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, observed_ratio, critical_95, significant)
-
-print(hartley_domain_results)
-
diff --git a/.history/eohi1/mixed anova - domain means_20250912160213.r b/.history/eohi1/mixed anova - domain means_20250912160213.r
deleted file mode 100644
index db23141..0000000
diff --git a/.history/eohi1/mixed anova - domain means_20250912160217.r b/.history/eohi1/mixed anova - domain means_20250912160217.r
deleted file mode 100644
index db23141..0000000
diff --git a/.history/eohi1/mixed anova - domain means_20250912160416.r b/.history/eohi1/mixed anova - domain means_20250912160416.r
deleted file mode 100644
index c46be07..0000000
diff --git a/.history/eohi1/mixed anova - domain means_20250912160420.r b/.history/eohi1/mixed anova - domain means_20250912160420.r
deleted file mode 100644
index c46be07..0000000
--- a/.history/eohi1/mixed anova - domain means_20250912160420.r
+++ /dev/null
@@ -1,438 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(nortest) # For normality tests
-library(ggplot2) # For plotting
-library(emmeans) # For post-hoc comparisons
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(dim(data))
-print(length(unique(data$pID)))
-
-# Check experimental conditions
-print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO))
-
-# Check what domain mean columns are available
-domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))]
-print(domain_mean_cols)
-
-# Verify the specific variables we need
-required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required domain mean variables found!")
-}
-
-# Define domain mapping
-domain_mapping <- data.frame(
-  variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-               "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
-  time = c(rep("Past", 4), rep("Future", 4)),
-  domain = rep(c("Preferences",
"Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# 
ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS -# ============================================================================= - -# Test 1: Observed F-max across TIME within each DOMAIN -print("=== OBSERVED F-MAX RATIOS: TIME within each DOMAIN ===") - -observed_time_ratios <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - # Calculate variances for each TIME level within this domain - past_var = var(MEAN_DIFFERENCE[TIME == "Past"], na.rm = TRUE), - future_var = var(MEAN_DIFFERENCE[TIME == "Future"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, future_var) / min(past_var, future_var), - .groups = 'drop' - ) %>% - select(DOMAIN, past_var, future_var, f_max_ratio) - -print(observed_time_ratios) - -# Test 2: Observed F-max across DOMAIN within each TIME -print("\n=== OBSERVED F-MAX RATIOS: DOMAIN within each TIME ===") - -observed_domain_ratios <- long_data_clean %>% - group_by(TIME) %>% - summarise( - # Calculate variances for each DOMAIN level within this time - pref_var = var(MEAN_DIFFERENCE[DOMAIN == "Preferences"], na.rm = TRUE), - pers_var = var(MEAN_DIFFERENCE[DOMAIN == "Personality"], na.rm = TRUE), - val_var = var(MEAN_DIFFERENCE[DOMAIN == "Values"], na.rm = TRUE), - life_var = var(MEAN_DIFFERENCE[DOMAIN == "Life"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(pref_var, pers_var, val_var, life_var) / min(pref_var, pers_var, val_var, life_var), - .groups = 'drop' - ) %>% - select(TIME, pref_var, pers_var, val_var, life_var, f_max_ratio) - -print(observed_domain_ratios) - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - 
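# Aside (editorial note, not part of the original script): Hartley's F-max
# statistic is simply the largest group variance divided by the smallest, so
# values near 1 indicate roughly homogeneous variances while large values
# indicate heteroscedasticity. For example, group variances of c(2, 8, 4)
# give calculate_hartley_ratio(c(2, 8, 4)) == 4.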
observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Resample from actual data to create equal variance scenario - pooled_data <- data[[response_var]] - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Resample from pooled data for each group - sample_vars <- map_dbl(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - n_group <- length(group_data) - # Bootstrap sample from pooled data - bootstrap_sample <- sample(pooled_data, size = n_group, replace = TRUE) - sample_var <- var(bootstrap_sample, na.rm = TRUE) - # Debug: Check for zero variance or NA - if(is.na(sample_var) || sample_var == 0) { - cat("Warning: Zero or NA variance detected in bootstrap sample\n") - sample_var <- 1e-10 - } - sample_var - }) - hartley_ratio <- calculate_hartley_ratio(sample_vars) - # Debug: Check for invalid ratios - if(!is.finite(hartley_ratio)) { - cat("Warning: Invalid Hartley ratio:", hartley_ratio, "Sample vars:", sample_vars, "\n") - } - hartley_ratio - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - # Debug: Check what's happening - cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n") - cat("Valid bootstrap ratios:", length(valid_ratios), "\n") - cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n") - cat("Sample of valid ratios:", head(valid_ratios, 5), "\n") - - if(length(valid_ratios) == 0) { - cat("All bootstrap ratios were invalid. Sample ratios:", head(bootstrap_ratios, 10), "\n") - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Test 1: Hartley's F-max across TIME within each DOMAIN -print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===") -set.seed(123) # For reproducibility - -hartley_time_results <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "TIME", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_time_results) - -# Test 2: Hartley's F-max across DOMAIN within each TIME -print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===") - -hartley_domain_results <- long_data_clean %>% - group_by(TIME) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "DOMAIN", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, observed_ratio, critical_95, significant) - -print(hartley_domain_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912160432.r b/.history/eohi1/mixed anova - domain means_20250912160432.r deleted file mode 100644 index c46be07..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912160432.r +++ /dev/null @@ -1,438 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis 
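# Aside (a hypothetical convenience, not in the original script): install any
# missing dependencies before the library() calls that follow.
pkgs <- c("tidyverse", "ez", "car", "nortest", "ggplot2", "emmeans")
missing_pkgs <- pkgs[!pkgs %in% rownames(installed.packages())]
if (length(missing_pkgs) > 0) install.packages(missing_pkgs)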
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - 
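# Aside (a sketch, not part of the original analysis): the manual
# pivot_domain_means() loop above can be expressed with tidyr::pivot_longer,
# assuming the eight measure columns follow the "N<Time>_mean_<domain>"
# naming encoded in domain_mapping. Both approaches should yield the same
# long table up to row order.
long_data_alt <- data %>%
  select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>%
  pivot_longer(
    cols = all_of(required_vars),
    names_to = c("TIME", "DOMAIN"),
    names_pattern = "N(Past|Fut)_mean_(pref|pers|val|life)",
    values_to = "MEAN_DIFFERENCE"
  ) %>%
  mutate(
    TIME = factor(ifelse(TIME == "Past", "Past", "Future"),
                  levels = c("Past", "Future")),
    DOMAIN = factor(dplyr::recode(DOMAIN,
                                  pref = "Preferences", pers = "Personality",
                                  val = "Values", life = "Life"),
                    levels = c("Preferences", "Personality", "Values", "Life")),
    pID = as.factor(pID),
    TEMPORAL_DO = as.factor(TEMPORAL_DO)
  )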
-print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS -# ============================================================================= - -# Test 1: Observed F-max across TIME within each DOMAIN -print("=== OBSERVED F-MAX RATIOS: TIME within each DOMAIN ===") - -observed_time_ratios <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - # Calculate variances for each TIME level within this domain - past_var = var(MEAN_DIFFERENCE[TIME == "Past"], na.rm = TRUE), - future_var = var(MEAN_DIFFERENCE[TIME == "Future"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, future_var) / min(past_var, future_var), - .groups = 'drop' - ) %>% - select(DOMAIN, past_var, future_var, 
f_max_ratio) - -print(observed_time_ratios) - -# Test 2: Observed F-max across DOMAIN within each TIME -print("\n=== OBSERVED F-MAX RATIOS: DOMAIN within each TIME ===") - -observed_domain_ratios <- long_data_clean %>% - group_by(TIME) %>% - summarise( - # Calculate variances for each DOMAIN level within this time - pref_var = var(MEAN_DIFFERENCE[DOMAIN == "Preferences"], na.rm = TRUE), - pers_var = var(MEAN_DIFFERENCE[DOMAIN == "Personality"], na.rm = TRUE), - val_var = var(MEAN_DIFFERENCE[DOMAIN == "Values"], na.rm = TRUE), - life_var = var(MEAN_DIFFERENCE[DOMAIN == "Life"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(pref_var, pers_var, val_var, life_var) / min(pref_var, pers_var, val_var, life_var), - .groups = 'drop' - ) %>% - select(TIME, pref_var, pers_var, val_var, life_var, f_max_ratio) - -print(observed_domain_ratios) - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Resample from actual data to create equal variance scenario - pooled_data <- data[[response_var]] - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Resample from pooled data for each group - sample_vars <- map_dbl(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - n_group <- length(group_data) - # Bootstrap sample from pooled data - bootstrap_sample <- sample(pooled_data, size = n_group, replace = TRUE) - sample_var <- var(bootstrap_sample, na.rm = TRUE) - # Debug: Check for zero variance or NA - if(is.na(sample_var) || sample_var == 0) { - cat("Warning: Zero or NA variance detected in bootstrap sample\n") - sample_var <- 1e-10 - } - sample_var - }) - hartley_ratio <- calculate_hartley_ratio(sample_vars) - # Debug: Check for invalid ratios - if(!is.finite(hartley_ratio)) { - cat("Warning: Invalid Hartley ratio:", hartley_ratio, "Sample vars:", sample_vars, "\n") - } - hartley_ratio - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - # Debug: Check what's happening - cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n") - cat("Valid bootstrap ratios:", length(valid_ratios), "\n") - cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n") - cat("Sample of valid ratios:", head(valid_ratios, 5), "\n") - - if(length(valid_ratios) == 0) { - cat("All bootstrap ratios were invalid. 
Sample ratios:", head(bootstrap_ratios, 10), "\n") - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Test 1: Hartley's F-max across TIME within each DOMAIN -print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===") -set.seed(123) # For reproducibility - -hartley_time_results <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "TIME", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_time_results) - -# Test 2: Hartley's F-max across DOMAIN within each TIME -print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===") - -hartley_domain_results <- long_data_clean %>% - group_by(TIME) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "DOMAIN", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, observed_ratio, critical_95, significant) - -print(hartley_domain_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912160551.r b/.history/eohi1/mixed anova - domain means_20250912160551.r deleted file mode 100644 index eb14a7a..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912160551.r +++ /dev/null @@ -1,438 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", 
"Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# 
ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS -# ============================================================================= - -# Test 1: Observed F-max across TIME within each DOMAIN -print("=== OBSERVED F-MAX RATIOS: TIME within each DOMAIN ===") - -observed_time_ratios <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - # Calculate variances for each TIME level within this domain - past_var = var(MEAN_DIFFERENCE[TIME == "Past"], na.rm = TRUE), - future_var = var(MEAN_DIFFERENCE[TIME == "Future"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, future_var) / min(past_var, future_var), - .groups = 'drop' - ) %>% - select(DOMAIN, past_var, future_var, f_max_ratio) - -print(observed_time_ratios) - -# Test 2: Observed F-max across DOMAIN within each TIME -print("\n=== OBSERVED F-MAX RATIOS: DOMAIN within each TIME ===") - -observed_domain_ratios <- long_data_clean %>% - group_by(TIME) %>% - summarise( - # Calculate variances for each DOMAIN level within this time - pref_var = var(MEAN_DIFFERENCE[DOMAIN == "Preferences"], na.rm = TRUE), - pers_var = var(MEAN_DIFFERENCE[DOMAIN == "Personality"], na.rm = TRUE), - val_var = var(MEAN_DIFFERENCE[DOMAIN == "Values"], na.rm = TRUE), - life_var = var(MEAN_DIFFERENCE[DOMAIN == "Life"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(pref_var, pers_var, val_var, life_var) / min(pref_var, pers_var, val_var, life_var), - .groups = 'drop' - ) %>% - select(TIME, pref_var, pers_var, val_var, life_var, f_max_ratio) - -print(observed_domain_ratios) - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - 
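# Aside (editorial note, not part of the original script): published critical
# values for Hartley's F-max are tabulated under the assumptions of normality
# and equal group sizes. Since the Shapiro-Wilk and Anderson-Darling checks
# above flagged strong non-normality, this function derives an empirical 95%
# critical value by bootstrap instead of relying on those tables.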
observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Resample from actual data to create equal variance scenario - pooled_data <- data[[response_var]] - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Resample from pooled data for each group - sample_vars <- map_dbl(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - n_group <- length(group_data) - # Bootstrap sample from pooled data - bootstrap_sample <- sample(pooled_data, size = n_group, replace = TRUE) - sample_var <- var(bootstrap_sample, na.rm = TRUE) - # Debug: Check for zero variance or NA - if(is.na(sample_var) || sample_var == 0) { - cat("Warning: Zero or NA variance detected in bootstrap sample\n") - sample_var <- 1e-10 - } - sample_var - }) - hartley_ratio <- calculate_hartley_ratio(sample_vars) - # Debug: Check for invalid ratios - if(!is.finite(hartley_ratio)) { - cat("Warning: Invalid Hartley ratio:", hartley_ratio, "Sample vars:", sample_vars, "\n") - } - hartley_ratio - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - # Debug: Check what's happening - cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n") - cat("Valid bootstrap ratios:", length(valid_ratios), "\n") - cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n") - cat("Sample of valid ratios:", head(valid_ratios, 5), "\n") - - if(length(valid_ratios) == 0) { - cat("All bootstrap ratios were invalid. Sample ratios:", head(bootstrap_ratios, 10), "\n") - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Test 1: Hartley's F-max across TIME within each DOMAIN -print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===") -set.seed(123) # For reproducibility - -hartley_time_results <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(), "TIME", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_time_results) - -# Test 2: Hartley's F-max across DOMAIN within each TIME -print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===") - -hartley_domain_results <- long_data_clean %>% - group_by(TIME) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(cur_data(), "DOMAIN", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, observed_ratio, critical_95, significant) - -print(hartley_domain_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912160604.r b/.history/eohi1/mixed anova - domain means_20250912160604.r deleted file mode 100644 index 862c149..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912160604.r +++ /dev/null @@ -1,438 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# 
Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - 
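# Aside (a hypothetical sanity check, not in the original script): every
# participant should contribute exactly 2 TIME x 4 DOMAIN = 8 rows in the
# long format; count rows per pID and flag any deviations.
rows_per_pid <- long_data %>% count(pID, name = "n_rows")
print(table(rows_per_pid$n_rows))          # expect a single value: 8
print(filter(rows_per_pid, n_rows != 8))   # participants with missing cells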
-print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS -# ============================================================================= - -# Test 1: Observed F-max across TIME within each DOMAIN -print("=== OBSERVED F-MAX RATIOS: TIME within each DOMAIN ===") - -observed_time_ratios <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - # Calculate variances for each TIME level within this domain - past_var = var(MEAN_DIFFERENCE[TIME == "Past"], na.rm = TRUE), - future_var = var(MEAN_DIFFERENCE[TIME == "Future"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, future_var) / min(past_var, future_var), - .groups = 'drop' - ) %>% - select(DOMAIN, past_var, future_var, 
f_max_ratio) - -print(observed_time_ratios) - -# Test 2: Observed F-max across DOMAIN within each TIME -print("\n=== OBSERVED F-MAX RATIOS: DOMAIN within each TIME ===") - -observed_domain_ratios <- long_data_clean %>% - group_by(TIME) %>% - summarise( - # Calculate variances for each DOMAIN level within this time - pref_var = var(MEAN_DIFFERENCE[DOMAIN == "Preferences"], na.rm = TRUE), - pers_var = var(MEAN_DIFFERENCE[DOMAIN == "Personality"], na.rm = TRUE), - val_var = var(MEAN_DIFFERENCE[DOMAIN == "Values"], na.rm = TRUE), - life_var = var(MEAN_DIFFERENCE[DOMAIN == "Life"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(pref_var, pers_var, val_var, life_var) / min(pref_var, pers_var, val_var, life_var), - .groups = 'drop' - ) %>% - select(TIME, pref_var, pers_var, val_var, life_var, f_max_ratio) - -print(observed_domain_ratios) - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Resample from actual data to create equal variance scenario - pooled_data <- data[[response_var]] - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Resample from pooled data for each group - sample_vars <- map_dbl(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - n_group <- length(group_data) - # Bootstrap sample from pooled data - bootstrap_sample <- sample(pooled_data, size = n_group, replace = TRUE) - sample_var <- var(bootstrap_sample, na.rm = TRUE) - # Debug: Check for zero variance or NA - if(is.na(sample_var) || sample_var == 0) { - cat("Warning: Zero or NA variance detected in bootstrap sample\n") - sample_var <- 1e-10 - } - sample_var - }) - hartley_ratio <- calculate_hartley_ratio(sample_vars) - # Debug: Check for invalid ratios - if(!is.finite(hartley_ratio)) { - cat("Warning: Invalid Hartley ratio:", hartley_ratio, "Sample vars:", sample_vars, "\n") - } - hartley_ratio - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - # Debug: Check what's happening - cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n") - cat("Valid bootstrap ratios:", length(valid_ratios), "\n") - cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n") - cat("Sample of valid ratios:", head(valid_ratios, 5), "\n") - - if(length(valid_ratios) == 0) { - cat("All bootstrap ratios were invalid. 
Sample ratios:", head(bootstrap_ratios, 10), "\n") - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Test 1: Hartley's F-max across TIME within each DOMAIN -print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===") -set.seed(123) # For reproducibility - -hartley_time_results <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(), "TIME", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_time_results) - -# Test 2: Hartley's F-max across DOMAIN within each TIME -print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===") - -hartley_domain_results <- long_data_clean %>% - group_by(TIME) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(), "DOMAIN", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, observed_ratio, critical_95, significant) - -print(hartley_domain_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912160607.r b/.history/eohi1/mixed anova - domain means_20250912160607.r deleted file mode 100644 index 862c149..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912160607.r +++ /dev/null @@ -1,438 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", 
"Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# 
ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS -# ============================================================================= - -# Test 1: Observed F-max across TIME within each DOMAIN -print("=== OBSERVED F-MAX RATIOS: TIME within each DOMAIN ===") - -observed_time_ratios <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - # Calculate variances for each TIME level within this domain - past_var = var(MEAN_DIFFERENCE[TIME == "Past"], na.rm = TRUE), - future_var = var(MEAN_DIFFERENCE[TIME == "Future"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, future_var) / min(past_var, future_var), - .groups = 'drop' - ) %>% - select(DOMAIN, past_var, future_var, f_max_ratio) - -print(observed_time_ratios) - -# Test 2: Observed F-max across DOMAIN within each TIME -print("\n=== OBSERVED F-MAX RATIOS: DOMAIN within each TIME ===") - -observed_domain_ratios <- long_data_clean %>% - group_by(TIME) %>% - summarise( - # Calculate variances for each DOMAIN level within this time - pref_var = var(MEAN_DIFFERENCE[DOMAIN == "Preferences"], na.rm = TRUE), - pers_var = var(MEAN_DIFFERENCE[DOMAIN == "Personality"], na.rm = TRUE), - val_var = var(MEAN_DIFFERENCE[DOMAIN == "Values"], na.rm = TRUE), - life_var = var(MEAN_DIFFERENCE[DOMAIN == "Life"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(pref_var, pers_var, val_var, life_var) / min(pref_var, pers_var, val_var, life_var), - .groups = 'drop' - ) %>% - select(TIME, pref_var, pers_var, val_var, life_var, f_max_ratio) - -print(observed_domain_ratios) - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - 
observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Resample from actual data to create equal variance scenario - pooled_data <- data[[response_var]] - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Resample from pooled data for each group - sample_vars <- map_dbl(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - n_group <- length(group_data) - # Bootstrap sample from pooled data - bootstrap_sample <- sample(pooled_data, size = n_group, replace = TRUE) - sample_var <- var(bootstrap_sample, na.rm = TRUE) - # Debug: Check for zero variance or NA - if(is.na(sample_var) || sample_var == 0) { - cat("Warning: Zero or NA variance detected in bootstrap sample\n") - sample_var <- 1e-10 - } - sample_var - }) - hartley_ratio <- calculate_hartley_ratio(sample_vars) - # Debug: Check for invalid ratios - if(!is.finite(hartley_ratio)) { - cat("Warning: Invalid Hartley ratio:", hartley_ratio, "Sample vars:", sample_vars, "\n") - } - hartley_ratio - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - # Debug: Check what's happening - cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n") - cat("Valid bootstrap ratios:", length(valid_ratios), "\n") - cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n") - cat("Sample of valid ratios:", head(valid_ratios, 5), "\n") - - if(length(valid_ratios) == 0) { - cat("All bootstrap ratios were invalid. Sample ratios:", head(bootstrap_ratios, 10), "\n") - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Test 1: Hartley's F-max across TIME within each DOMAIN -print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===") -set.seed(123) # For reproducibility - -hartley_time_results <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TIME, MEAN_DIFFERENCE), "TIME", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_time_results) - -# Test 2: Hartley's F-max across DOMAIN within each TIME -print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===") - -hartley_domain_results <- long_data_clean %>% - group_by(TIME) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(DOMAIN, MEAN_DIFFERENCE), "DOMAIN", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, observed_ratio, critical_95, significant) - -print(hartley_domain_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912160610.r b/.history/eohi1/mixed anova - domain means_20250912160610.r deleted file mode 100644 index 862c149..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912160610.r +++ /dev/null @@ -1,438 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# 
Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - 
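-# For reference: the loop-based reshape above can be done in one step with
-# tidyr::pivot_longer(). This is an illustrative sketch rather than part of
-# the original pipeline; it assumes the eight domain-mean columns follow the
-# NPast_/NFut_ naming pattern verified earlier and reproduces long_data up
-# to row order.
-long_data_tidyr <- data %>%
-  select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>%
-  pivot_longer(
-    all_of(required_vars),
-    names_pattern = "N(Past|Fut)_mean_(pref|pers|val|life)",
-    names_to = c("TIME", "DOMAIN"),
-    values_to = "MEAN_DIFFERENCE"
-  ) %>%
-  mutate(
-    TIME = factor(TIME, levels = c("Past", "Fut"), labels = c("Past", "Future")),
-    DOMAIN = factor(DOMAIN, levels = c("pref", "pers", "val", "life"),
-                    labels = c("Preferences", "Personality", "Values", "Life")),
-    across(c(pID, TEMPORAL_DO), as.factor)
-  )
-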
-print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS -# ============================================================================= - -# Test 1: Observed F-max across TIME within each DOMAIN -print("=== OBSERVED F-MAX RATIOS: TIME within each DOMAIN ===") - -observed_time_ratios <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - # Calculate variances for each TIME level within this domain - past_var = var(MEAN_DIFFERENCE[TIME == "Past"], na.rm = TRUE), - future_var = var(MEAN_DIFFERENCE[TIME == "Future"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, future_var) / min(past_var, future_var), - .groups = 'drop' - ) %>% - select(DOMAIN, past_var, future_var, 
f_max_ratio) - -print(observed_time_ratios) - -# Test 2: Observed F-max across DOMAIN within each TIME -print("\n=== OBSERVED F-MAX RATIOS: DOMAIN within each TIME ===") - -observed_domain_ratios <- long_data_clean %>% - group_by(TIME) %>% - summarise( - # Calculate variances for each DOMAIN level within this time - pref_var = var(MEAN_DIFFERENCE[DOMAIN == "Preferences"], na.rm = TRUE), - pers_var = var(MEAN_DIFFERENCE[DOMAIN == "Personality"], na.rm = TRUE), - val_var = var(MEAN_DIFFERENCE[DOMAIN == "Values"], na.rm = TRUE), - life_var = var(MEAN_DIFFERENCE[DOMAIN == "Life"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(pref_var, pers_var, val_var, life_var) / min(pref_var, pers_var, val_var, life_var), - .groups = 'drop' - ) %>% - select(TIME, pref_var, pers_var, val_var, life_var, f_max_ratio) - -print(observed_domain_ratios) - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Resample from actual data to create equal variance scenario - pooled_data <- data[[response_var]] - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Resample from pooled data for each group - sample_vars <- map_dbl(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - n_group <- length(group_data) - # Bootstrap sample from pooled data - bootstrap_sample <- sample(pooled_data, size = n_group, replace = TRUE) - sample_var <- var(bootstrap_sample, na.rm = TRUE) - # Debug: Check for zero variance or NA - if(is.na(sample_var) || sample_var == 0) { - cat("Warning: Zero or NA variance detected in bootstrap sample\n") - sample_var <- 1e-10 - } - sample_var - }) - hartley_ratio <- calculate_hartley_ratio(sample_vars) - # Debug: Check for invalid ratios - if(!is.finite(hartley_ratio)) { - cat("Warning: Invalid Hartley ratio:", hartley_ratio, "Sample vars:", sample_vars, "\n") - } - hartley_ratio - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - # Debug: Check what's happening - cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n") - cat("Valid bootstrap ratios:", length(valid_ratios), "\n") - cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n") - cat("Sample of valid ratios:", head(valid_ratios, 5), "\n") - - if(length(valid_ratios) == 0) { - cat("All bootstrap ratios were invalid. 
Sample ratios:", head(bootstrap_ratios, 10), "\n") - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Test 1: Hartley's F-max across TIME within each DOMAIN -print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===") -set.seed(123) # For reproducibility - -hartley_time_results <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(), "TIME", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_time_results) - -# Test 2: Hartley's F-max across DOMAIN within each TIME -print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===") - -hartley_domain_results <- long_data_clean %>% - group_by(TIME) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(), "DOMAIN", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, observed_ratio, critical_95, significant) - -print(hartley_domain_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912160644.r b/.history/eohi1/mixed anova - domain means_20250912160644.r deleted file mode 100644 index e1dbbde..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912160644.r +++ /dev/null @@ -1,438 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", 
"Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# 
ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS -# ============================================================================= - -# Test 1: Observed F-max across TIME within each DOMAIN -print("=== OBSERVED F-MAX RATIOS: TIME within each DOMAIN ===") - -observed_time_ratios <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - # Calculate variances for each TIME level within this domain - past_var = var(MEAN_DIFFERENCE[TIME == "Past"], na.rm = TRUE), - future_var = var(MEAN_DIFFERENCE[TIME == "Future"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, future_var) / min(past_var, future_var), - .groups = 'drop' - ) %>% - select(DOMAIN, past_var, future_var, f_max_ratio) - -print(observed_time_ratios) - -# Test 2: Observed F-max across DOMAIN within each TIME -print("\n=== OBSERVED F-MAX RATIOS: DOMAIN within each TIME ===") - -observed_domain_ratios <- long_data_clean %>% - group_by(TIME) %>% - summarise( - # Calculate variances for each DOMAIN level within this time - pref_var = var(MEAN_DIFFERENCE[DOMAIN == "Preferences"], na.rm = TRUE), - pers_var = var(MEAN_DIFFERENCE[DOMAIN == "Personality"], na.rm = TRUE), - val_var = var(MEAN_DIFFERENCE[DOMAIN == "Values"], na.rm = TRUE), - life_var = var(MEAN_DIFFERENCE[DOMAIN == "Life"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(pref_var, pers_var, val_var, life_var) / min(pref_var, pers_var, val_var, life_var), - .groups = 'drop' - ) %>% - select(TIME, pref_var, pers_var, val_var, life_var, f_max_ratio) - -print(observed_domain_ratios) - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - 
observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Resample from actual data to create equal variance scenario - pooled_data <- data[[response_var]] - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Resample from pooled data for each group - sample_vars <- map_dbl(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - n_group <- length(group_data) - # Bootstrap sample from pooled data - bootstrap_sample <- sample(pooled_data, size = n_group, replace = TRUE) - sample_var <- var(bootstrap_sample, na.rm = TRUE) - # Debug: Check for zero variance or NA - if(is.na(sample_var) || sample_var == 0) { - cat("Warning: Zero or NA variance detected in bootstrap sample\n") - sample_var <- 1e-10 - } - sample_var - }) - hartley_ratio <- calculate_hartley_ratio(sample_vars) - # Debug: Check for invalid ratios - if(!is.finite(hartley_ratio)) { - cat("Warning: Invalid Hartley ratio:", hartley_ratio, "Sample vars:", sample_vars, "\n") - } - hartley_ratio - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - # Debug: Check what's happening - cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n") - cat("Valid bootstrap ratios:", length(valid_ratios), "\n") - cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n") - cat("Sample of valid ratios:", head(valid_ratios, 5), "\n") - - if(length(valid_ratios) == 0) { - cat("All bootstrap ratios were invalid. Sample ratios:", head(bootstrap_ratios, 10), "\n") - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Test 1: Hartley's F-max across TIME within each DOMAIN -print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===") -set.seed(123) # For reproducibility - -hartley_time_results <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TIME, MEAN_DIFFERENCE), "TIME", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_time_results) - -# Test 2: Hartley's F-max across DOMAIN within each TIME -print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===") - -hartley_domain_results <- long_data_clean %>% - group_by(TIME) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(DOMAIN, MEAN_DIFFERENCE), "DOMAIN", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, observed_ratio, critical_95, significant) - -print(hartley_domain_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912160648.r b/.history/eohi1/mixed anova - domain means_20250912160648.r deleted file mode 100644 index d7bf658..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912160648.r +++ /dev/null @@ -1,438 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain 
Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - 
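-# Note: this script ends at the assumption checks. For orientation, here is
-# a minimal sketch of the omnibus mixed ANOVA that a 2 (TIME, within) x
-# 4 (DOMAIN, within) x TEMPORAL_DO (between) design implies, via ez::ezANOVA.
-# This is a hypothetical call, not part of the original file, and is left
-# commented out because long_data_clean is only created in the
-# assumption-testing section further down:
-# anova_sketch <- ezANOVA(
-#   data = long_data_clean,
-#   dv = .(MEAN_DIFFERENCE),
-#   wid = .(pID),
-#   within = .(TIME, DOMAIN),
-#   between = .(TEMPORAL_DO),
-#   type = 3,
-#   detailed = TRUE
-# )
-# print(anova_sketch)
-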
-print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS -# ============================================================================= - -# Test 1: Observed F-max across TIME within each DOMAIN -print("=== OBSERVED F-MAX RATIOS: TIME within each DOMAIN ===") - -observed_time_ratios <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - # Calculate variances for each TIME level within this domain - past_var = var(MEAN_DIFFERENCE[TIME == "Past"], na.rm = TRUE), - future_var = var(MEAN_DIFFERENCE[TIME == "Future"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, future_var) / min(past_var, future_var), - .groups = 'drop' - ) %>% - select(DOMAIN, past_var, future_var, 
f_max_ratio) - -print(observed_time_ratios) - -# Test 2: Observed F-max across DOMAIN within each TIME -print("\n=== OBSERVED F-MAX RATIOS: DOMAIN within each TIME ===") - -observed_domain_ratios <- long_data_clean %>% - group_by(TIME) %>% - summarise( - # Calculate variances for each DOMAIN level within this time - pref_var = var(MEAN_DIFFERENCE[DOMAIN == "Preferences"], na.rm = TRUE), - pers_var = var(MEAN_DIFFERENCE[DOMAIN == "Personality"], na.rm = TRUE), - val_var = var(MEAN_DIFFERENCE[DOMAIN == "Values"], na.rm = TRUE), - life_var = var(MEAN_DIFFERENCE[DOMAIN == "Life"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(pref_var, pers_var, val_var, life_var) / min(pref_var, pers_var, val_var, life_var), - .groups = 'drop' - ) %>% - select(TIME, pref_var, pers_var, val_var, life_var, f_max_ratio) - -print(observed_domain_ratios) - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Resample from actual data to create equal variance scenario - pooled_data <- data[[response_var]] - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Resample from pooled data for each group - sample_vars <- map_dbl(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - n_group <- length(group_data) - # Bootstrap sample from pooled data - bootstrap_sample <- sample(pooled_data, size = n_group, replace = TRUE) - sample_var <- var(bootstrap_sample, na.rm = TRUE) - # Debug: Check for zero variance or NA - if(is.na(sample_var) || sample_var == 0) { - cat("Warning: Zero or NA variance detected in bootstrap sample\n") - sample_var <- 1e-10 - } - sample_var - }) - hartley_ratio <- calculate_hartley_ratio(sample_vars) - # Debug: Check for invalid ratios - if(!is.finite(hartley_ratio)) { - cat("Warning: Invalid Hartley ratio:", hartley_ratio, "Sample vars:", sample_vars, "\n") - } - hartley_ratio - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - # Debug: Check what's happening - cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n") - cat("Valid bootstrap ratios:", length(valid_ratios), "\n") - cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n") - cat("Sample of valid ratios:", head(valid_ratios, 5), "\n") - - if(length(valid_ratios) == 0) { - cat("All bootstrap ratios were invalid. 
Sample ratios:", head(bootstrap_ratios, 10), "\n") - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Test 1: Hartley's F-max across TIME within each DOMAIN -print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===") -set.seed(123) # For reproducibility - -hartley_time_results <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TIME, MEAN_DIFFERENCE), "TIME", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_time_results) - -# Test 2: Hartley's F-max across DOMAIN within each TIME -print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===") - -hartley_domain_results <- long_data_clean %>% - group_by(TIME) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(DOMAIN, MEAN_DIFFERENCE), "DOMAIN", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, observed_ratio, critical_95, significant) - -print(hartley_domain_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912160652.r b/.history/eohi1/mixed anova - domain means_20250912160652.r deleted file mode 100644 index d7bf658..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912160652.r +++ /dev/null @@ -1,438 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - 
domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# 
============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS -# ============================================================================= - -# Test 1: Observed F-max across TIME within each DOMAIN -print("=== OBSERVED F-MAX RATIOS: TIME within each DOMAIN ===") - -observed_time_ratios <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - # Calculate variances for each TIME level within this domain - past_var = var(MEAN_DIFFERENCE[TIME == "Past"], na.rm = TRUE), - future_var = var(MEAN_DIFFERENCE[TIME == "Future"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, future_var) / min(past_var, future_var), - .groups = 'drop' - ) %>% - select(DOMAIN, past_var, future_var, f_max_ratio) - -print(observed_time_ratios) - -# Test 2: Observed F-max across DOMAIN within each TIME -print("\n=== OBSERVED F-MAX RATIOS: DOMAIN within each TIME ===") - -observed_domain_ratios <- long_data_clean %>% - group_by(TIME) %>% - summarise( - # Calculate variances for each DOMAIN level within this time - pref_var = var(MEAN_DIFFERENCE[DOMAIN == "Preferences"], na.rm = TRUE), - pers_var = var(MEAN_DIFFERENCE[DOMAIN == "Personality"], na.rm = TRUE), - val_var = var(MEAN_DIFFERENCE[DOMAIN == "Values"], na.rm = TRUE), - life_var = var(MEAN_DIFFERENCE[DOMAIN == "Life"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(pref_var, pers_var, val_var, life_var) / min(pref_var, pers_var, val_var, life_var), - .groups = 'drop' - ) %>% - select(TIME, pref_var, pers_var, val_var, life_var, f_max_ratio) - -print(observed_domain_ratios) - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - 
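# Illustrative aside (not part of the original snapshot): for k group variances s2_1..s2_k, Hartley's statistic is F_max = max(s2_i) / min(s2_i). -# For example, calculate_hartley_ratio(c(0.8, 1.1, 2.0)) returns 2.0 / 0.8 = 2.5; ratios near 1 suggest homogeneous variances. -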
observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Resample from actual data to create equal variance scenario - pooled_data <- data[[response_var]] - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Resample from pooled data for each group - sample_vars <- map_dbl(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - n_group <- length(group_data) - # Bootstrap sample from pooled data - bootstrap_sample <- sample(pooled_data, size = n_group, replace = TRUE) - sample_var <- var(bootstrap_sample, na.rm = TRUE) - # Debug: Check for zero variance or NA - if(is.na(sample_var) || sample_var == 0) { - cat("Warning: Zero or NA variance detected in bootstrap sample\n") - sample_var <- 1e-10 - } - sample_var - }) - hartley_ratio <- calculate_hartley_ratio(sample_vars) - # Debug: Check for invalid ratios - if(!is.finite(hartley_ratio)) { - cat("Warning: Invalid Hartley ratio:", hartley_ratio, "Sample vars:", sample_vars, "\n") - } - hartley_ratio - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - # Debug: Check what's happening - cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n") - cat("Valid bootstrap ratios:", length(valid_ratios), "\n") - cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n") - cat("Sample of valid ratios:", head(valid_ratios, 5), "\n") - - if(length(valid_ratios) == 0) { - cat("All bootstrap ratios were invalid. Sample ratios:", head(bootstrap_ratios, 10), "\n") - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Test 1: Hartley's F-max across TIME within each DOMAIN -print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===") -set.seed(123) # For reproducibility - -hartley_time_results <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TIME, MEAN_DIFFERENCE), "TIME", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_time_results) - -# Test 2: Hartley's F-max across DOMAIN within each TIME -print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===") - -hartley_domain_results <- long_data_clean %>% - group_by(TIME) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(DOMAIN, MEAN_DIFFERENCE), "DOMAIN", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, observed_ratio, critical_95, significant) - -print(hartley_domain_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912160655.r b/.history/eohi1/mixed anova - domain means_20250912160655.r deleted file mode 100644 index d7bf658..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912160655.r +++ /dev/null @@ -1,438 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment 
Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample 
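-# Aside (not in the original script): after pivoting, each participant should contribute 8 rows, one per TIME x DOMAIN cell, -# so nrow(long_data) is expected to equal 8 * length(unique(long_data$pID)) when no variables were skipped.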
-str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS -# ============================================================================= - -# Test 1: Observed F-max across TIME within each DOMAIN -print("=== OBSERVED F-MAX RATIOS: TIME within each DOMAIN ===") - -observed_time_ratios <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - # Calculate variances for each TIME level within this domain - past_var = var(MEAN_DIFFERENCE[TIME == "Past"], na.rm = TRUE), - future_var = var(MEAN_DIFFERENCE[TIME == "Future"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, future_var) / min(past_var, future_var), - .groups = 'drop' - ) %>% - select(DOMAIN, past_var, future_var, 
f_max_ratio) - -print(observed_time_ratios) - -# Test 2: Observed F-max across DOMAIN within each TIME -print("\n=== OBSERVED F-MAX RATIOS: DOMAIN within each TIME ===") - -observed_domain_ratios <- long_data_clean %>% - group_by(TIME) %>% - summarise( - # Calculate variances for each DOMAIN level within this time - pref_var = var(MEAN_DIFFERENCE[DOMAIN == "Preferences"], na.rm = TRUE), - pers_var = var(MEAN_DIFFERENCE[DOMAIN == "Personality"], na.rm = TRUE), - val_var = var(MEAN_DIFFERENCE[DOMAIN == "Values"], na.rm = TRUE), - life_var = var(MEAN_DIFFERENCE[DOMAIN == "Life"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(pref_var, pers_var, val_var, life_var) / min(pref_var, pers_var, val_var, life_var), - .groups = 'drop' - ) %>% - select(TIME, pref_var, pers_var, val_var, life_var, f_max_ratio) - -print(observed_domain_ratios) - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Resample from actual data to create equal variance scenario - pooled_data <- data[[response_var]] - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Resample from pooled data for each group - sample_vars <- map_dbl(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - n_group <- length(group_data) - # Bootstrap sample from pooled data - bootstrap_sample <- sample(pooled_data, size = n_group, replace = TRUE) - sample_var <- var(bootstrap_sample, na.rm = TRUE) - # Debug: Check for zero variance or NA - if(is.na(sample_var) || sample_var == 0) { - cat("Warning: Zero or NA variance detected in bootstrap sample\n") - sample_var <- 1e-10 - } - sample_var - }) - hartley_ratio <- calculate_hartley_ratio(sample_vars) - # Debug: Check for invalid ratios - if(!is.finite(hartley_ratio)) { - cat("Warning: Invalid Hartley ratio:", hartley_ratio, "Sample vars:", sample_vars, "\n") - } - hartley_ratio - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - # Debug: Check what's happening - cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n") - cat("Valid bootstrap ratios:", length(valid_ratios), "\n") - cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n") - cat("Sample of valid ratios:", head(valid_ratios, 5), "\n") - - if(length(valid_ratios) == 0) { - cat("All bootstrap ratios were invalid. 
Sample ratios:", head(bootstrap_ratios, 10), "\n") - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Test 1: Hartley's F-max across TIME within each DOMAIN -print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===") -set.seed(123) # For reproducibility - -hartley_time_results <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TIME, MEAN_DIFFERENCE), "TIME", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_time_results) - -# Test 2: Hartley's F-max across DOMAIN within each TIME -print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===") - -hartley_domain_results <- long_data_clean %>% - group_by(TIME) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(DOMAIN, MEAN_DIFFERENCE), "DOMAIN", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, observed_ratio, critical_95, significant) - -print(hartley_domain_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912161036.r b/.history/eohi1/mixed anova - domain means_20250912161036.r deleted file mode 100644 index ecaa427..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912161036.r +++ /dev/null @@ -1,427 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - 
domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# 
============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS -# ============================================================================= - -# Test 1: Observed F-max across TIME within each DOMAIN -print("=== OBSERVED F-MAX RATIOS: TIME within each DOMAIN ===") - -observed_time_ratios <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - # Calculate variances for each TIME level within this domain - past_var = var(MEAN_DIFFERENCE[TIME == "Past"], na.rm = TRUE), - future_var = var(MEAN_DIFFERENCE[TIME == "Future"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, future_var) / min(past_var, future_var), - .groups = 'drop' - ) %>% - select(DOMAIN, past_var, future_var, f_max_ratio) - -print(observed_time_ratios) - -# Test 2: Observed F-max across DOMAIN within each TIME -print("\n=== OBSERVED F-MAX RATIOS: DOMAIN within each TIME ===") - -observed_domain_ratios <- long_data_clean %>% - group_by(TIME) %>% - summarise( - # Calculate variances for each DOMAIN level within this time - pref_var = var(MEAN_DIFFERENCE[DOMAIN == "Preferences"], na.rm = TRUE), - pers_var = var(MEAN_DIFFERENCE[DOMAIN == "Personality"], na.rm = TRUE), - val_var = var(MEAN_DIFFERENCE[DOMAIN == "Values"], na.rm = TRUE), - life_var = var(MEAN_DIFFERENCE[DOMAIN == "Life"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(pref_var, pers_var, val_var, life_var) / min(pref_var, pers_var, val_var, life_var), - .groups = 'drop' - ) %>% - select(TIME, pref_var, pers_var, val_var, life_var, f_max_ratio) - -print(observed_domain_ratios) - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - 
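# Cautionary aside (not part of the original snapshot): the resampling scheme below draws each bootstrap sample from its own group rather than from data pooled across groups, -# so it preserves any real between-group variance differences instead of imposing the equal-variance null; the resulting 95th percentile is arguably a sampling-variability band around the observed ratio rather than a strict null critical value. -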
observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Bootstrap from each group independently to create natural variation - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] # Remove any NA values - }) - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - calculate_hartley_ratio(sample_vars) - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - # Debug: Check what's happening - cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n") - cat("Valid bootstrap ratios:", length(valid_ratios), "\n") - cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n") - cat("Sample of valid ratios:", head(valid_ratios, 5), "\n") - - if(length(valid_ratios) == 0) { - cat("All bootstrap ratios were invalid. Sample ratios:", head(bootstrap_ratios, 10), "\n") - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Test 1: Hartley's F-max across TIME within each DOMAIN -print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===") -set.seed(123) # For reproducibility - -hartley_time_results <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TIME, MEAN_DIFFERENCE), "TIME", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_time_results) - -# Test 2: Hartley's F-max across DOMAIN within each TIME -print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===") - -hartley_domain_results <- long_data_clean %>% - group_by(TIME) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(DOMAIN, MEAN_DIFFERENCE), "DOMAIN", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, observed_ratio, critical_95, significant) - -print(hartley_domain_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912161041.r b/.history/eohi1/mixed anova - domain means_20250912161041.r deleted file mode 100644 index ecaa427..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912161041.r +++ /dev/null @@ -1,427 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to 
remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# 
============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS -# ============================================================================= - -# Test 1: Observed F-max across TIME within each DOMAIN -print("=== OBSERVED F-MAX RATIOS: TIME within each DOMAIN ===") - -observed_time_ratios <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - # Calculate variances for each TIME level within this domain - past_var = var(MEAN_DIFFERENCE[TIME == "Past"], na.rm = TRUE), - future_var = var(MEAN_DIFFERENCE[TIME == "Future"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, future_var) / min(past_var, future_var), - .groups = 'drop' - ) %>% - select(DOMAIN, past_var, future_var, 
f_max_ratio) - -print(observed_time_ratios) - -# Test 2: Observed F-max across DOMAIN within each TIME -print("\n=== OBSERVED F-MAX RATIOS: DOMAIN within each TIME ===") - -observed_domain_ratios <- long_data_clean %>% - group_by(TIME) %>% - summarise( - # Calculate variances for each DOMAIN level within this time - pref_var = var(MEAN_DIFFERENCE[DOMAIN == "Preferences"], na.rm = TRUE), - pers_var = var(MEAN_DIFFERENCE[DOMAIN == "Personality"], na.rm = TRUE), - val_var = var(MEAN_DIFFERENCE[DOMAIN == "Values"], na.rm = TRUE), - life_var = var(MEAN_DIFFERENCE[DOMAIN == "Life"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(pref_var, pers_var, val_var, life_var) / min(pref_var, pers_var, val_var, life_var), - .groups = 'drop' - ) %>% - select(TIME, pref_var, pers_var, val_var, life_var, f_max_ratio) - -print(observed_domain_ratios) - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Bootstrap from each group independently to create natural variation - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] # Remove any NA values - }) - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - calculate_hartley_ratio(sample_vars) - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - # Debug: Check what's happening - cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n") - cat("Valid bootstrap ratios:", length(valid_ratios), "\n") - cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n") - cat("Sample of valid ratios:", head(valid_ratios, 5), "\n") - - if(length(valid_ratios) == 0) { - cat("All bootstrap ratios were invalid. 
Sample ratios:", head(bootstrap_ratios, 10), "\n") - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Test 1: Hartley's F-max across TIME within each DOMAIN -print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===") -set.seed(123) # For reproducibility - -hartley_time_results <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TIME, MEAN_DIFFERENCE), "TIME", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_time_results) - -# Test 2: Hartley's F-max across DOMAIN within each TIME -print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===") - -hartley_domain_results <- long_data_clean %>% - group_by(TIME) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(DOMAIN, MEAN_DIFFERENCE), "DOMAIN", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, observed_ratio, critical_95, significant) - -print(hartley_domain_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912161046.r b/.history/eohi1/mixed anova - domain means_20250912161046.r deleted file mode 100644 index ecaa427..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912161046.r +++ /dev/null @@ -1,427 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - 
domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# 
============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS -# ============================================================================= - -# Test 1: Observed F-max across TIME within each DOMAIN -print("=== OBSERVED F-MAX RATIOS: TIME within each DOMAIN ===") - -observed_time_ratios <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - # Calculate variances for each TIME level within this domain - past_var = var(MEAN_DIFFERENCE[TIME == "Past"], na.rm = TRUE), - future_var = var(MEAN_DIFFERENCE[TIME == "Future"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, future_var) / min(past_var, future_var), - .groups = 'drop' - ) %>% - select(DOMAIN, past_var, future_var, f_max_ratio) - -print(observed_time_ratios) - -# Test 2: Observed F-max across DOMAIN within each TIME -print("\n=== OBSERVED F-MAX RATIOS: DOMAIN within each TIME ===") - -observed_domain_ratios <- long_data_clean %>% - group_by(TIME) %>% - summarise( - # Calculate variances for each DOMAIN level within this time - pref_var = var(MEAN_DIFFERENCE[DOMAIN == "Preferences"], na.rm = TRUE), - pers_var = var(MEAN_DIFFERENCE[DOMAIN == "Personality"], na.rm = TRUE), - val_var = var(MEAN_DIFFERENCE[DOMAIN == "Values"], na.rm = TRUE), - life_var = var(MEAN_DIFFERENCE[DOMAIN == "Life"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(pref_var, pers_var, val_var, life_var) / min(pref_var, pers_var, val_var, life_var), - .groups = 'drop' - ) %>% - select(TIME, pref_var, pers_var, val_var, life_var, f_max_ratio) - -print(observed_domain_ratios) - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - 
observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Bootstrap from each group independently to create natural variation - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] # Remove any NA values - }) - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - calculate_hartley_ratio(sample_vars) - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - # Debug: Check what's happening - cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n") - cat("Valid bootstrap ratios:", length(valid_ratios), "\n") - cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n") - cat("Sample of valid ratios:", head(valid_ratios, 5), "\n") - - if(length(valid_ratios) == 0) { - cat("All bootstrap ratios were invalid. Sample ratios:", head(bootstrap_ratios, 10), "\n") - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Test 1: Hartley's F-max across TIME within each DOMAIN -print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===") -set.seed(123) # For reproducibility - -hartley_time_results <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TIME, MEAN_DIFFERENCE), "TIME", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_time_results) - -# Test 2: Hartley's F-max across DOMAIN within each TIME -print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===") - -hartley_domain_results <- long_data_clean %>% - group_by(TIME) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(DOMAIN, MEAN_DIFFERENCE), "DOMAIN", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, observed_ratio, critical_95, significant) - -print(hartley_domain_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912161558.r b/.history/eohi1/mixed anova - domain means_20250912161558.r deleted file mode 100644 index ebe4e46..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912161558.r +++ /dev/null @@ -1,410 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to 
remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# 
============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - temporal_1_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == 1], na.rm = TRUE), - temporal_2_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == 
2], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(temporal_1_var, temporal_2_var) / min(temporal_1_var, temporal_2_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, temporal_1_var, temporal_2_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Bootstrap from each group independently to create natural variation - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] # Remove any NA values - }) - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - calculate_hartley_ratio(sample_vars) - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - # Debug: Check what's happening - cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n") - cat("Valid bootstrap ratios:", length(valid_ratios), "\n") - cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n") - cat("Sample of valid ratios:", head(valid_ratios, 5), "\n") - - if(length(valid_ratios) == 0) { - cat("All bootstrap ratios were invalid. 
Sample ratios:", head(bootstrap_ratios, 10), "\n") - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Test 1: Hartley's F-max across TIME within each DOMAIN -print("\n=== HARTLEY'S F-MAX TEST: TIME within each DOMAIN ===") -set.seed(123) # For reproducibility - -hartley_time_results <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TIME, MEAN_DIFFERENCE), "TIME", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_time_results) - -# Test 2: Hartley's F-max across DOMAIN within each TIME -print("\n=== HARTLEY'S F-MAX TEST: DOMAIN within each TIME ===") - -hartley_domain_results <- long_data_clean %>% - group_by(TIME) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(DOMAIN, MEAN_DIFFERENCE), "DOMAIN", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, observed_ratio, critical_95, significant) - -print(hartley_domain_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912161610.r b/.history/eohi1/mixed anova - domain means_20250912161610.r deleted file mode 100644 index 9b93e4f..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912161610.r +++ /dev/null @@ -1,392 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - 
domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# 
============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print("=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - temporal_1_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == 1], na.rm = TRUE), - temporal_2_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == 2], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(temporal_1_var, temporal_2_var) / min(temporal_1_var, temporal_2_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, temporal_1_var, temporal_2_var, f_max_ratio) - -print(observed_temporal_ratios)
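# =============================================================================
# Editor's aside, not part of the deleted script: the per-cell F-max ratios
# printed above can be cross-checked in a few lines of base R. The names
# cell_vars and fmax_check are illustrative, not from the scripts.
# =============================================================================
cell_vars <- with(long_data_clean,
                  tapply(MEAN_DIFFERENCE, list(TIME, DOMAIN, TEMPORAL_DO),
                         var, na.rm = TRUE))
fmax_check <- apply(cell_vars, c(1, 2), function(v) max(v) / min(v))
print(fmax_check)  # should match f_max_ratio above, up to cell ordering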
# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Resample each group independently from its own observed values - # Note: this preserves the groups' observed variances rather than imposing an - # equal-variance null; a null-calibrated variant is sketched after this file's diff - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] # Remove any NA values - }) - - # Bootstrap F-max ratios from the observed groups - bootstrap_ratios <- replicate(n_iter, { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - calculate_hartley_ratio(sample_vars) - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - # Debug: Check what's happening - cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n") - cat("Valid bootstrap ratios:", length(valid_ratios), "\n") - cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n") - cat("Sample of valid ratios:", head(valid_ratios, 5), "\n") - - if(length(valid_ratios) == 0) { - cat("All bootstrap ratios were invalid. Sample ratios:", head(bootstrap_ratios, 10), "\n") - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -cat("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN combination ===\n") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results)
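One caveat on the bootstrap used throughout these versions: resampling each group from its own observed values keeps the groups' unequal variances, so the resulting 95th percentile sits around the observed F-max ratio rather than around the ratio expected when the variances are truly equal. A minimal null-calibrated sketch, assuming roughly equal group sizes and nonzero group variances, and reusing the group_data_list structure from bootstrap_hartley_critical (the helper name bootstrap_fmax_null is illustrative, not from the deleted scripts):

# Sketch: rescale each group's deviations to a common (pooled) SD so that
# resampling happens under the equal-variance null, then bootstrap F-max.
bootstrap_fmax_null <- function(group_data_list, n_iter = 1000) {
  pooled_sd <- sqrt(mean(sapply(group_data_list, var)))  # assumes similar group sizes
  null_groups <- lapply(group_data_list, function(x) {
    mean(x) + (x - mean(x)) / sd(x) * pooled_sd  # common variance, shape preserved
  })
  replicate(n_iter, {
    boot_vars <- sapply(null_groups, function(x) var(sample(x, replace = TRUE)))
    max(boot_vars) / min(boot_vars)
  })
}
# A null critical value is then quantile(bootstrap_fmax_null(group_data_list), 0.95).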
diff --git a/.history/eohi1/mixed anova - domain means_20250912161617.r b/.history/eohi1/mixed anova - domain means_20250912161617.r deleted file mode 100644 index 9b93e4f..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912161617.r +++ /dev/null diff --git a/.history/eohi1/mixed anova - domain means_20250912161619.r b/.history/eohi1/mixed anova - domain means_20250912161619.r deleted file mode 100644 index 9b93e4f..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912161619.r +++ /dev/null
replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - calculate_hartley_ratio(sample_vars) - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - # Debug: Check what's happening - cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n") - cat("Valid bootstrap ratios:", length(valid_ratios), "\n") - cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n") - cat("Sample of valid ratios:", head(valid_ratios, 5), "\n") - - if(length(valid_ratios) == 0) { - cat("All bootstrap ratios were invalid. Sample ratios:", head(bootstrap_ratios, 10), "\n") - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN combination ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912161932.r b/.history/eohi1/mixed anova - domain means_20250912161932.r deleted file mode 100644 index f1106bd..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912161932.r +++ /dev/null @@ -1,403 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - 
"NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by 
TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -# Get the actual TEMPORAL_DO levels -temporal_levels <- sort(unique(long_data_clean$TEMPORAL_DO)) -print(paste("TEMPORAL_DO levels:", paste(temporal_levels, collapse = ", "))) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - temporal_var_1 = var(MEAN_DIFFERENCE[TEMPORAL_DO == temporal_levels[1]], na.rm = TRUE), - temporal_var_2 = var(MEAN_DIFFERENCE[TEMPORAL_DO == temporal_levels[2]], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(temporal_var_1, temporal_var_2) / min(temporal_var_1, temporal_var_2), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, temporal_var_1, temporal_var_2, f_max_ratio) - -print(observed_temporal_ratios) - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal 
variances) - # Bootstrap from each group independently to create natural variation - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] # Remove any NA values - }) - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - calculate_hartley_ratio(sample_vars) - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - # Debug: Check what's happening - cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n") - cat("Valid bootstrap ratios:", length(valid_ratios), "\n") - cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n") - cat("Sample of valid ratios:", head(valid_ratios, 5), "\n") - - if(length(valid_ratios) == 0) { - cat("All bootstrap ratios were invalid. Sample ratios:", head(bootstrap_ratios, 10), "\n") - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN combination ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) -
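# A hedged editorial aside, not part of any deleted snapshot: the
# bootstrap_hartley_critical() helper in the scripts above indexes its input
# with data[rows, response_var], which returns a one-column tibble rather than
# a numeric vector when the helper is fed by dplyr::pick(). A compact base-R
# sketch of the same resampling idea, assuming a data frame `df` with a
# grouping column `g` and a numeric column `y` (both illustrative names):
fmax_crit_95 <- function(df, n_iter = 1000) {
  ok <- !is.na(df$y)
  groups <- split(df$y[ok], df$g[ok])  # plain numeric vectors, one per group
  ratios <- replicate(n_iter, {
    # Resample each group with replacement and recompute the variance ratio
    v <- vapply(groups, function(x) var(sample(x, replace = TRUE)), numeric(1))
    max(v) / min(v)
  })
  # 95th percentile of the resampled ratios serves as the critical value
  quantile(ratios, 0.95, na.rm = TRUE)
}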
diff --git a/.history/eohi1/mixed anova - domain means_20250912162001.r b/.history/eohi1/mixed anova - domain means_20250912162001.r deleted file mode 100644 index 2ab7f7c..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912162001.r +++ /dev/null @@ -1,403 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", -
"NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) 
- -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -# Get the actual TEMPORAL_DO levels -temporal_levels <- sort(unique(long_data_clean$TEMPORAL_DO)) -print(paste("TEMPORAL_DO levels:", paste(temporal_levels, collapse = ", "))) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Bootstrap from each group independently to create natural 
variation - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] # Remove any NA values - }) - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - calculate_hartley_ratio(sample_vars) - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - # Debug: Check what's happening - cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n") - cat("Valid bootstrap ratios:", length(valid_ratios), "\n") - cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n") - cat("Sample of valid ratios:", head(valid_ratios, 5), "\n") - - if(length(valid_ratios) == 0) { - cat("All bootstrap ratios were invalid. Sample ratios:", head(bootstrap_ratios, 10), "\n") - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN combination ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) -
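# A hedged editorial aside, not from the deleted snapshots: each snapshot
# builds its descriptives by repeating round(<stat>(MEAN_DIFFERENCE,
# na.rm = TRUE), 5) once per statistic. With dplyr >= 1.0 the same table can
# be produced from a named list of functions via across();
# `desc_stats_compact` is an illustrative name, and `long_data` is the
# long-format table the scripts create.
desc_stats_compact <- long_data %>%
  group_by(TIME, DOMAIN) %>%
  summarise(
    n = n(),
    across(
      MEAN_DIFFERENCE,
      list(mean = ~ mean(.x, na.rm = TRUE),
           sd = ~ sd(.x, na.rm = TRUE),
           median = ~ median(.x, na.rm = TRUE),
           q1 = ~ quantile(.x, 0.25, na.rm = TRUE),
           q3 = ~ quantile(.x, 0.75, na.rm = TRUE)),
      .names = "{.fn}"
    ),
    .groups = "drop"
  ) %>%
  mutate(across(c(mean, sd, median, q1, q3), ~ round(.x, 5)))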
variation - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] # Remove any NA values - }) - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - calculate_hartley_ratio(sample_vars) - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - # Debug: Check what's happening - cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n") - cat("Valid bootstrap ratios:", length(valid_ratios), "\n") - cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n") - cat("Sample of valid ratios:", head(valid_ratios, 5), "\n") - - if(length(valid_ratios) == 0) { - cat("All bootstrap ratios were invalid. Sample ratios:", head(bootstrap_ratios, 10), "\n") - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN combination ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - diff --git a/.history/eohi1/mixed anova - domain means_20250912162037.r b/.history/eohi1/mixed anova - domain means_20250912162037.r deleted file mode 100644 index 2ab7f7c..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912162037.r +++ /dev/null @@ -1,403 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - 
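# Aside: a minimal, self-contained sketch of the bootstrapped F-max test used
# in these scripts, run on simulated toy data (illustrative numbers only, not
# study data). As in the scripts, each group is resampled from its own data
# and the 95th percentile of the F-max distribution is taken as the critical value.
library(purrr)

set.seed(1)
toy_groups <- list(g1 = rnorm(50, sd = 1.0), g2 = rnorm(50, sd = 1.4))

# Observed F-max: largest group variance over smallest
obs_fmax <- max(map_dbl(toy_groups, var)) / min(map_dbl(toy_groups, var))

# Bootstrap distribution of F-max under resampling within each group
boot_fmax <- replicate(1000, {
  vars <- map_dbl(toy_groups, ~ var(sample(.x, length(.x), replace = TRUE)))
  max(vars) / min(vars)
})

obs_fmax > quantile(boot_fmax, 0.95)  # TRUE would flag variance heterogeneity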
-missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") 
-print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -# Get the actual TEMPORAL_DO levels -temporal_levels <- sort(unique(long_data_clean$TEMPORAL_DO)) -print(paste("TEMPORAL_DO levels:", paste(temporal_levels, collapse = ", "))) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Bootstrap from each group independently to create natural 
variation
-  group_data_list <- map(groups, ~ {
-    group_data <- data[data[[group_var]] == .x, response_var]
-    group_data[!is.na(group_data)] # Remove any NA values
-  })
-
-  # Bootstrap F-max ratios under null hypothesis
-  bootstrap_ratios <- replicate(n_iter, {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    calculate_hartley_ratio(sample_vars)
-  })
-
-  # Calculate critical value (95th percentile)
-  # Remove any NaN or infinite values
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  # Debug: Check what's happening
-  cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n")
-  cat("Valid bootstrap ratios:", length(valid_ratios), "\n")
-  cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n")
-  cat("Sample of valid ratios:", head(valid_ratios, 5), "\n")
-
-  if(length(valid_ratios) == 0) {
-    cat("All bootstrap ratios were invalid. Sample ratios:", head(bootstrap_ratios, 10), "\n")
-    stop("No valid bootstrap ratios generated")
-  }
-
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    bootstrap_ratios = bootstrap_ratios
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
diff --git a/.history/eohi1/mixed anova - domain means_20250912162116.r b/.history/eohi1/mixed anova - domain means_20250912162116.r
deleted file mode 100644
index 2ab7f7c..0000000
diff --git a/.history/eohi1/mixed anova - domain means_20250912162139.r b/.history/eohi1/mixed anova - domain means_20250912162139.r
deleted file mode 100644
index 92adae3..0000000
--- a/.history/eohi1/mixed anova - domain means_20250912162139.r
+++ /dev/null
@@ -1,479 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(nortest) # For normality tests
-library(ggplot2) # For plotting
-library(emmeans) # For post-hoc comparisons
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(dim(data))
-print(length(unique(data$pID)))
-
-# Check experimental conditions
-print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO))
-
-# Check what domain mean columns are available
-domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))]
-print(domain_mean_cols)
-
-# Verify the specific variables we need
-required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
-
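# Aside: a fail-fast alternative to the soft missing-variable check that
# follows; a sketch assuming the same `data` and `required_vars` objects.
stopifnot(
  "exp1.csv is missing required domain mean columns" =
    all(required_vars %in% colnames(data))
)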
-missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") 
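# Aside: the repeated round(<fun>(MEAN_DIFFERENCE, na.rm = TRUE), 5) pattern in
# desc_stats above can be collapsed with dplyr::across(); a sketch assuming the
# same long_data layout.
desc_stats_compact <- long_data %>%
  group_by(TIME, DOMAIN) %>%
  summarise(
    n = n(),
    across(MEAN_DIFFERENCE,
           list(mean = ~ mean(.x, na.rm = TRUE),
                variance = ~ var(.x, na.rm = TRUE),
                sd = ~ sd(.x, na.rm = TRUE),
                median = ~ median(.x, na.rm = TRUE)),
           .names = "{.fn}"),
    .groups = "drop"
  ) %>%
  mutate(across(where(is.double), ~ round(.x, 5)))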
-print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -# Get the actual TEMPORAL_DO levels -temporal_levels <- sort(unique(long_data_clean$TEMPORAL_DO)) -print(paste("TEMPORAL_DO levels:", paste(temporal_levels, collapse = ", "))) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Bootstrap from each group independently to create natural 
variation - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] # Remove any NA values - }) - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - calculate_hartley_ratio(sample_vars) - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - # Debug: Check what's happening - cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n") - cat("Valid bootstrap ratios:", length(valid_ratios), "\n") - cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n") - cat("Sample of valid ratios:", head(valid_ratios, 5), "\n") - - if(length(valid_ratios) == 0) { - cat("All bootstrap ratios were invalid. Sample ratios:", head(bootstrap_ratios, 10), "\n") - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN combination ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -print("\n=== MIXED ANOVA: TEMPORAL_DO (between) × TIME × DOMAIN (within) ===") - -# Mixed ANOVA using ezANOVA -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_results <- ezANOVA( - data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - return_aov = TRUE -) - -print("Mixed ANOVA Results:") -print(mixed_anova_results$ANOVA) - -# Extract effect sizes (partial eta squared) -effect_sizes <- mixed_anova_results$ANOVA %>% - mutate( - partial_eta_squared = round(SSn / (SSn + SSd), 5), - .after = p - ) - -print("\nEffect Sizes (Partial Eta Squared):") -print(effect_sizes %>% select(Effect, F, p, partial_eta_squared)) - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_results$aov, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_results$aov, ~ DOMAIN) -domain_contrasts <- 
pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_results$aov, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_results$aov, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_results$aov, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_results$aov, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_results$aov, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250912162145.r b/.history/eohi1/mixed anova - domain means_20250912162145.r deleted file mode 100644 index 92adae3..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912162145.r +++ /dev/null @@ -1,479 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format 
-pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption 
testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -# Get the actual TEMPORAL_DO levels -temporal_levels <- sort(unique(long_data_clean$TEMPORAL_DO)) -print(paste("TEMPORAL_DO levels:", paste(temporal_levels, collapse = ", "))) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Bootstrap from each group independently to create natural 
-# Function to bootstrap critical values for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get the unique groups
-  groups <- unique(data[[group_var]])
-  n_groups <- length(groups)
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    group_by(!!sym(group_var)) %>%
-    summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    pull(var)
-
-  # Debug: check for zero or invalid variances
-  if (any(observed_vars <= 0 | is.na(observed_vars))) {
-    cat("Warning: Invalid observed variances detected:", observed_vars, "\n")
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate the observed F-max ratio
-  observed_ratio <- calculate_hartley_ratio(observed_vars)
-
-  # Bootstrap under the null hypothesis (equal variances)
-  # Resample each group independently to create natural variation
-  group_data_list <- map(groups, ~ {
-    group_data <- data[[response_var]][data[[group_var]] == .x]  # extract as a vector (tibble subsetting with [ would keep a data frame)
-    group_data[!is.na(group_data)]  # remove any NA values
-  })
-
-  # Bootstrap F-max ratios under the null hypothesis
-  bootstrap_ratios <- replicate(n_iter, {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    calculate_hartley_ratio(sample_vars)
-  })
-
-  # Critical value (95th percentile); drop any NaN or infinite values first
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  # Debug: check what's happening
-  cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n")
-  cat("Valid bootstrap ratios:", length(valid_ratios), "\n")
-  cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n")
-  cat("Sample of valid ratios:", head(valid_ratios, 5), "\n")
-
-  if (length(valid_ratios) == 0) {
-    cat("All bootstrap ratios were invalid. Sample ratios:", head(bootstrap_ratios, 10), "\n")
-    stop("No valid bootstrap ratios generated")
-  }
-
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    bootstrap_ratios = bootstrap_ratios
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-set.seed(123)  # for reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
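As a sanity check, the bootstrap helper can be run on simulated groups with equal variances, where the observed ratio should usually fall below the bootstrap critical value. toy is made-up data for illustration only:

set.seed(42)
toy <- data.frame(grp = rep(c("a", "b"), each = 100), y = rnorm(200))
toy_check <- bootstrap_hartley_critical(toy, "grp", "y", n_iter = 500)
toy_check$observed_ratio < toy_check$critical_95  # TRUE in most runs under equal variances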
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-print("\n=== MIXED ANOVA: TEMPORAL_DO (between) × TIME × DOMAIN (within) ===")
-
-# Mixed ANOVA using ezANOVA
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-
-mixed_anova_results <- ezANOVA(
-  data = long_data_clean,
-  dv = MEAN_DIFFERENCE,
-  wid = pID,
-  between = TEMPORAL_DO,
-  within = .(TIME, DOMAIN),
-  type = 3,
-  return_aov = TRUE
-)
-
-print("Mixed ANOVA Results:")
-print(mixed_anova_results$ANOVA)
-
-# Extract effect sizes (partial eta squared)
-effect_sizes <- mixed_anova_results$ANOVA %>%
-  mutate(
-    partial_eta_squared = round(SSn / (SSn + SSd), 5),
-    .after = p
-  )
-
-print("\nEffect Sizes (Partial Eta Squared):")
-print(effect_sizes %>% select(Effect, F, p, partial_eta_squared))
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(mixed_anova_results$aov, ~ TIME)
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(mixed_anova_results$aov, ~ DOMAIN)
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(mixed_anova_results$aov, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# Two-way interactions
-print("\nTIME × DOMAIN Interaction:")
-time_domain_emmeans <- emmeans(mixed_anova_results$aov, ~ TIME * DOMAIN)
-time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_contrasts)
-
-print("\nTEMPORAL_DO × TIME Interaction:")
-temporal_time_emmeans <- emmeans(mixed_anova_results$aov, ~ TEMPORAL_DO * TIME)
-temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_contrasts)
-
-print("\nTEMPORAL_DO × DOMAIN Interaction:")
-temporal_domain_emmeans <- emmeans(mixed_anova_results$aov, ~ TEMPORAL_DO * DOMAIN)
-temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_contrasts)
-
-# Three-way interaction
-print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:")
-three_way_emmeans <- emmeans(mixed_anova_results$aov, ~ TEMPORAL_DO * TIME * DOMAIN)
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni")
-print(three_way_contrasts)
-
diff --git a/.history/eohi1/mixed anova - domain means_20250912162151.r b/.history/eohi1/mixed anova - domain means_20250912162151.r
deleted file mode 100644
index 92adae3..0000000
--- a/.history/eohi1/mixed anova - domain means_20250912162151.r
+++ /dev/null
@@ -1,479 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(nortest)  # for normality tests
-library(ggplot2)  # for plotting
-library(emmeans)  # for post-hoc comparisons
-
-# Global option to remove scientific notation
-options(scipen = 999)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(dim(data))
-print(length(unique(data$pID)))
-
-# Check experimental conditions
-print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO))
-
-# Check what domain mean columns are available
-domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))]
-print(domain_mean_cols)
-
-# Verify the specific variables we need
-required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required domain mean variables found!")
-}
-
-# Define domain mapping
-domain_mapping <- data.frame(
-  variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-               "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
-  time = c(rep("Past", 4), rep("Future", 4)),
-  domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
-  stringsAsFactors = FALSE
-)
-
-print(domain_mapping)
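The hand-typed mapping can also be generated, which avoids transcription slips if domains are added later. A sketch using expand.grid, ordered to match the table above; domain_mapping_alt is an illustrative name:

domains <- c(pref = "Preferences", pers = "Personality", val = "Values", life = "Life")
times <- c(NPast = "Past", NFut = "Future")
grid <- expand.grid(suffix = names(domains), prefix = names(times), stringsAsFactors = FALSE)
domain_mapping_alt <- data.frame(
  variable = paste0(grid$prefix, "_mean_", grid$suffix),
  time = unname(times[grid$prefix]),
  domain = unname(domains[grid$suffix]),
  stringsAsFactors = FALSE
)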
-# Function to pivot data to long format
-pivot_domain_means <- function(data, domain_mapping) {
-  long_data <- data.frame()
-
-  for (i in 1:nrow(domain_mapping)) {
-    var_name <- domain_mapping$variable[i]
-    time_level <- domain_mapping$time[i]
-    domain_level <- domain_mapping$domain[i]
-
-    # Check whether the variable exists
-    if (!var_name %in% colnames(data)) {
-      print(paste("Warning: Variable", var_name, "not found in data"))
-      next
-    }
-
-    # Create a subset for this variable
-    subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)]
-    subset_data$TIME <- time_level
-    subset_data$DOMAIN <- domain_level
-    subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]]
-    subset_data[[var_name]] <- NULL  # remove the original column
-
-    # Append to the long data
-    long_data <- rbind(long_data, subset_data)
-  }
-
-  # Convert to factors with proper levels
-  long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future"))
-  long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life"))
-  long_data$pID <- as.factor(long_data$pID)
-  long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO)
-
-  return(long_data)
-}
-
-# Pivot data to long format
-tryCatch({
-  long_data <- pivot_domain_means(data, domain_mapping)
-}, error = function(e) {
-  print(paste("Error in data pivoting:", e$message))
-  stop("Cannot proceed without proper data structure")
-})
-
-print(dim(long_data))
-print(length(unique(long_data$pID)))
-print(levels(long_data$TIME))
-print(levels(long_data$DOMAIN))
-
-# Check data types
-print(is.factor(long_data$TIME))
-print(is.factor(long_data$DOMAIN))
-print(is.factor(long_data$pID))
-print(is.numeric(long_data$MEAN_DIFFERENCE))
-
-# Show the first 20 rows
-print(utils::head(long_data, 20))
-
-# Display structure and sample
-str(long_data)
-
-print(utils::head(long_data, 10))
-
-# Show example data for one participant
-participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")]
-print(participant_1_data)
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME and DOMAIN
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5),
-    min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME and DOMAIN:")
-print(desc_stats)
-
-# Descriptive statistics by the between-subjects factor
-desc_stats_by_temporal <- long_data %>%
-  group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:")
-print(desc_stats_by_temporal)
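The loop-based pivot above works; tidyr can express the same reshape declaratively. A sketch assuming the eight required_vars columns exist; long_alt is illustrative and should match long_data up to row order:

long_alt <- data %>%
  select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>%
  pivot_longer(
    cols = all_of(required_vars),
    names_to = c("TIME", "DOMAIN"),
    names_pattern = "N(Past|Fut)_mean_(pref|pers|val|life)",
    values_to = "MEAN_DIFFERENCE"
  ) %>%
  mutate(
    TIME = factor(ifelse(TIME == "Past", "Past", "Future"), levels = c("Past", "Future")),
    DOMAIN = factor(dplyr::recode(DOMAIN, pref = "Preferences", pers = "Personality", val = "Values", life = "Life"),
                    levels = c("Preferences", "Personality", "Values", "Life")),
    pID = as.factor(pID),
    TEMPORAL_DO = as.factor(TEMPORAL_DO)
  )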
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing
-long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ]
-print(paste("Data after removing missing values:", dim(long_data_clean)))
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(MEAN_DIFFERENCE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME and DOMAIN:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Normality tests
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    shapiro_W = ifelse(n >= 3 & n <= 5000, shapiro.test(MEAN_DIFFERENCE)$statistic, NA),
-    shapiro_p = ifelse(n >= 3 & n <= 5000, shapiro.test(MEAN_DIFFERENCE)$p.value, NA),
-    anderson_A = ifelse(n >= 7, ad.test(MEAN_DIFFERENCE)$statistic, NA),
-    anderson_p = ifelse(n >= 7, ad.test(MEAN_DIFFERENCE)$p.value, NA),
-    .groups = 'drop'
-  )
-
-print("Normality test results:")
-print(normality_results)
-
-# Note: Anderson-Darling p-values may appear identical due to machine precision limits
-# All p-values are extremely small (< 2.2e-16), indicating strong non-normality
-# The test statistics (A values) are actually different across conditions
-
-# Debug: Check if the Anderson-Darling test is working properly
-print("\n=== DEBUG: Anderson-Darling Test Details ===")
-
-# Get unique combinations of TIME and DOMAIN
-unique_combos <- long_data_clean %>%
-  select(TIME, DOMAIN) %>%
-  distinct()
-
-# Run the Anderson-Darling test for each combination
-for (i in 1:nrow(unique_combos)) {
-  time_val <- unique_combos$TIME[i]
-  domain_val <- unique_combos$DOMAIN[i]
-
-  # Subset data for this combination
-  subset_data <- long_data_clean %>%
-    filter(TIME == time_val, DOMAIN == domain_val)
-
-  cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n")
-
-  # Run the Anderson-Darling test
-  if (nrow(subset_data) >= 7) {
-    ad_result <- ad.test(subset_data$MEAN_DIFFERENCE)
-    print(ad_result)
-  } else {
-    cat("Sample size too small for Anderson-Darling test\n")
-  }
-  cat("\n")
-}
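The outlier summary above only counts outliers per cell; flagging the offending rows makes follow-up inspection easier. A sketch; is_outlier is an illustrative column name:

long_flagged <- long_data_clean %>%
  group_by(TIME, DOMAIN) %>%
  mutate(
    is_outlier = MEAN_DIFFERENCE < quantile(MEAN_DIFFERENCE, 0.25) - 1.5 * IQR(MEAN_DIFFERENCE) |
      MEAN_DIFFERENCE > quantile(MEAN_DIFFERENCE, 0.75) + 1.5 * IQR(MEAN_DIFFERENCE)
  ) %>%
  ungroup()
filter(long_flagged, is_outlier)  # the rows behind the n_outliers counts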
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: test homogeneity across the BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-# First, check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-
-# Get the actual TEMPORAL_DO levels
-temporal_levels <- sort(unique(long_data_clean$TEMPORAL_DO))
-print(paste("TEMPORAL_DO levels:", paste(temporal_levels, collapse = ", ")))
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
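Worth noting: car::leveneTest() centers on the median by default (the Brown-Forsythe variant), the more robust choice given the non-normality flagged above; passing center = mean gives the classic Levene test. A sketch on one domain for comparison; pref_only is an illustrative name:

pref_only <- subset(long_data_clean, DOMAIN == "Preferences")
leveneTest(MEAN_DIFFERENCE ~ TIME, data = pref_only)                 # Brown-Forsythe (default)
leveneTest(MEAN_DIFFERENCE ~ TIME, data = pref_only, center = mean)  # classic Levene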
-# Function to bootstrap critical values for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get the unique groups
-  groups <- unique(data[[group_var]])
-  n_groups <- length(groups)
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    group_by(!!sym(group_var)) %>%
-    summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    pull(var)
-
-  # Debug: check for zero or invalid variances
-  if (any(observed_vars <= 0 | is.na(observed_vars))) {
-    cat("Warning: Invalid observed variances detected:", observed_vars, "\n")
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate the observed F-max ratio
-  observed_ratio <- calculate_hartley_ratio(observed_vars)
-
-  # Bootstrap under the null hypothesis (equal variances)
-  # Resample each group independently to create natural variation
-  group_data_list <- map(groups, ~ {
-    group_data <- data[[response_var]][data[[group_var]] == .x]  # extract as a vector (tibble subsetting with [ would keep a data frame)
-    group_data[!is.na(group_data)]  # remove any NA values
-  })
-
-  # Bootstrap F-max ratios under the null hypothesis
-  bootstrap_ratios <- replicate(n_iter, {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    calculate_hartley_ratio(sample_vars)
-  })
-
-  # Critical value (95th percentile); drop any NaN or infinite values first
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  # Debug: check what's happening
-  cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n")
-  cat("Valid bootstrap ratios:", length(valid_ratios), "\n")
-  cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n")
-  cat("Sample of valid ratios:", head(valid_ratios, 5), "\n")
-
-  if (length(valid_ratios) == 0) {
-    cat("All bootstrap ratios were invalid. Sample ratios:", head(bootstrap_ratios, 10), "\n")
-    stop("No valid bootstrap ratios generated")
-  }
-
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    bootstrap_ratios = bootstrap_ratios
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-set.seed(123)  # for reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-print("\n=== MIXED ANOVA: TEMPORAL_DO (between) × TIME × DOMAIN (within) ===")
-
-# Mixed ANOVA using ezANOVA
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-
-mixed_anova_results <- ezANOVA(
-  data = long_data_clean,
-  dv = MEAN_DIFFERENCE,
-  wid = pID,
-  between = TEMPORAL_DO,
-  within = .(TIME, DOMAIN),
-  type = 3,
-  return_aov = TRUE
-)
-
-print("Mixed ANOVA Results:")
-print(mixed_anova_results$ANOVA)
-
-# Extract effect sizes (partial eta squared)
-effect_sizes <- mixed_anova_results$ANOVA %>%
-  mutate(
-    partial_eta_squared = round(SSn / (SSn + SSd), 5),
-    .after = p
-  )
-
-print("\nEffect Sizes (Partial Eta Squared):")
-print(effect_sizes %>% select(Effect, F, p, partial_eta_squared))
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(mixed_anova_results$aov, ~ TIME)
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(mixed_anova_results$aov, ~ DOMAIN)
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(mixed_anova_results$aov, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# Two-way interactions
-print("\nTIME × DOMAIN Interaction:")
-time_domain_emmeans <- emmeans(mixed_anova_results$aov, ~ TIME * DOMAIN)
-time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_contrasts)
-
-print("\nTEMPORAL_DO × TIME Interaction:")
-temporal_time_emmeans <- emmeans(mixed_anova_results$aov, ~ TEMPORAL_DO * TIME)
-temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_contrasts)
-
-print("\nTEMPORAL_DO × DOMAIN Interaction:")
-temporal_domain_emmeans <- emmeans(mixed_anova_results$aov, ~ TEMPORAL_DO * DOMAIN)
-temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_contrasts)
-
-# Three-way interaction
-print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:")
-three_way_emmeans <- emmeans(mixed_anova_results$aov, ~ TEMPORAL_DO * TIME * DOMAIN)
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni")
-print(three_way_contrasts)
-
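ggplot2 is loaded at the top of this script but never used; a minimal interaction plot of the TIME × DOMAIN cell means is a natural companion to the contrasts above. A sketch; cell_means is an illustrative name:

cell_means <- long_data_clean %>%
  group_by(TIME, DOMAIN) %>%
  summarise(mean_diff = mean(MEAN_DIFFERENCE), .groups = 'drop')

ggplot(cell_means, aes(x = TIME, y = mean_diff, group = DOMAIN, colour = DOMAIN)) +
  geom_point() +
  geom_line() +
  labs(y = "Mean difference", title = "TIME × DOMAIN cell means")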
diff --git a/.history/eohi1/mixed anova - domain means_20250912162247.r b/.history/eohi1/mixed anova - domain means_20250912162247.r
deleted file mode 100644
index 986e63b..0000000
--- a/.history/eohi1/mixed anova - domain means_20250912162247.r
+++ /dev/null
@@ -1,484 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(nortest)  # for normality tests
-library(ggplot2)  # for plotting
-library(emmeans)  # for post-hoc comparisons
-
-# Global option to remove scientific notation
-options(scipen = 999)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(dim(data))
-print(length(unique(data$pID)))
-
-# Check experimental conditions
-print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO))
-
-# Check what domain mean columns are available
-domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))]
-print(domain_mean_cols)
-
-# Verify the specific variables we need
-required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required domain mean variables found!")
-}
-# Define domain mapping
-domain_mapping <- data.frame(
-  variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-               "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
-  time = c(rep("Past", 4), rep("Future", 4)),
-  domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
-  stringsAsFactors = FALSE
-)
-
-print(domain_mapping)
-
-# Function to pivot data to long format
-pivot_domain_means <- function(data, domain_mapping) {
-  long_data <- data.frame()
-
-  for (i in 1:nrow(domain_mapping)) {
-    var_name <- domain_mapping$variable[i]
-    time_level <- domain_mapping$time[i]
-    domain_level <- domain_mapping$domain[i]
-
-    # Check whether the variable exists
-    if (!var_name %in% colnames(data)) {
-      print(paste("Warning: Variable", var_name, "not found in data"))
-      next
-    }
-
-    # Create a subset for this variable
-    subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)]
-    subset_data$TIME <- time_level
-    subset_data$DOMAIN <- domain_level
-    subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]]
-    subset_data[[var_name]] <- NULL  # remove the original column
-
-    # Append to the long data
-    long_data <- rbind(long_data, subset_data)
-  }
-
-  # Convert to factors with proper levels
-  long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future"))
-  long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life"))
-  long_data$pID <- as.factor(long_data$pID)
-  long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO)
-
-  return(long_data)
-}
-
-# Pivot data to long format
-tryCatch({
-  long_data <- pivot_domain_means(data, domain_mapping)
-}, error = function(e) {
-  print(paste("Error in data pivoting:", e$message))
-  stop("Cannot proceed without proper data structure")
-})
-
-print(dim(long_data))
-print(length(unique(long_data$pID)))
-print(levels(long_data$TIME))
-print(levels(long_data$DOMAIN))
-
-# Check data types
-print(is.factor(long_data$TIME))
-print(is.factor(long_data$DOMAIN))
-print(is.factor(long_data$pID))
-print(is.numeric(long_data$MEAN_DIFFERENCE))
-
-# Show the first 20 rows
-print(utils::head(long_data, 20))
-
-# Display structure and sample
-str(long_data)
-
-print(utils::head(long_data, 10))
-
-# Show example data for one participant
-participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")]
-print(participant_1_data)
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME and DOMAIN
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5),
-    min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME and DOMAIN:")
-print(desc_stats)
-
-# Descriptive statistics by the between-subjects factor
-desc_stats_by_temporal <- long_data %>%
-  group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:")
-print(desc_stats_by_temporal)
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing
-long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ]
-print(paste("Data after removing missing values:", dim(long_data_clean)))
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(MEAN_DIFFERENCE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME and DOMAIN:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Normality tests
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    shapiro_W = ifelse(n >= 3 & n <= 5000, shapiro.test(MEAN_DIFFERENCE)$statistic, NA),
-    shapiro_p = ifelse(n >= 3 & n <= 5000, shapiro.test(MEAN_DIFFERENCE)$p.value, NA),
-    anderson_A = ifelse(n >= 7, ad.test(MEAN_DIFFERENCE)$statistic, NA),
-    anderson_p = ifelse(n >= 7, ad.test(MEAN_DIFFERENCE)$p.value, NA),
-    .groups = 'drop'
-  )
-
-print("Normality test results:")
-print(normality_results)
-
-# Note: Anderson-Darling p-values may appear identical due to machine precision limits
-# All p-values are extremely small (< 2.2e-16), indicating strong non-normality
-# The test statistics (A values) are actually different across conditions
-
-# Debug: Check if the Anderson-Darling test is working properly
-print("\n=== DEBUG: Anderson-Darling Test Details ===")
-
-# Get unique combinations of TIME and DOMAIN
-unique_combos <- long_data_clean %>%
-  select(TIME, DOMAIN) %>%
-  distinct()
-
-# Run the Anderson-Darling test for each combination
-for (i in 1:nrow(unique_combos)) {
-  time_val <- unique_combos$TIME[i]
-  domain_val <- unique_combos$DOMAIN[i]
-
-  # Subset data for this combination
-  subset_data <- long_data_clean %>%
-    filter(TIME == time_val, DOMAIN == domain_val)
-
-  cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n")
-
-  # Run the Anderson-Darling test
-  if (nrow(subset_data) >= 7) {
-    ad_result <- ad.test(subset_data$MEAN_DIFFERENCE)
-    print(ad_result)
-  } else {
-    cat("Sample size too small for Anderson-Darling test\n")
-  }
-  cat("\n")
-}
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: test homogeneity across the BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-# First, check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-
-# Get the actual TEMPORAL_DO levels
-temporal_levels <- sort(unique(long_data_clean$TEMPORAL_DO))
-print(paste("TEMPORAL_DO levels:", paste(temporal_levels, collapse = ", ")))
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-# Function to bootstrap critical values for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get the unique groups
-  groups <- unique(data[[group_var]])
-  n_groups <- length(groups)
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    group_by(!!sym(group_var)) %>%
-    summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    pull(var)
-
-  # Debug: check for zero or invalid variances
-  if (any(observed_vars <= 0 | is.na(observed_vars))) {
-    cat("Warning: Invalid observed variances detected:", observed_vars, "\n")
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate the observed F-max ratio
-  observed_ratio <- calculate_hartley_ratio(observed_vars)
-
-  # Bootstrap under the null hypothesis (equal variances)
-  # Resample each group independently to create natural variation
-  group_data_list <- map(groups, ~ {
-    group_data <- data[[response_var]][data[[group_var]] == .x]  # extract as a vector (tibble subsetting with [ would keep a data frame)
-    group_data[!is.na(group_data)]  # remove any NA values
-  })
-
-  # Bootstrap F-max ratios under the null hypothesis
-  bootstrap_ratios <- replicate(n_iter, {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    calculate_hartley_ratio(sample_vars)
-  })
-
-  # Critical value (95th percentile); drop any NaN or infinite values first
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  # Debug: check what's happening
-  cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n")
-  cat("Valid bootstrap ratios:", length(valid_ratios), "\n")
-  cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n")
-  cat("Sample of valid ratios:", head(valid_ratios, 5), "\n")
-
-  if (length(valid_ratios) == 0) {
-    cat("All bootstrap ratios were invalid. Sample ratios:", head(bootstrap_ratios, 10), "\n")
-    stop("No valid bootstrap ratios generated")
-  }
-
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    bootstrap_ratios = bootstrap_ratios
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-set.seed(123)  # for reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Mixed ANOVA using ezANOVA
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-
-mixed_anova_results <- ezANOVA(
-  data = long_data_clean,
-  dv = MEAN_DIFFERENCE,
-  wid = pID,
-  between = TEMPORAL_DO,
-  within = .(TIME, DOMAIN),
-  type = 3,
-  return_aov = TRUE
-)
-
-print("Mixed ANOVA Results:")
-print(mixed_anova_results$ANOVA)
-
-# Extract effect sizes
-# Note: SSn / (SSn + SSd) is partial eta squared, not generalized eta squared.
-# Generalized eta squared (SS_effect / (SS_effect + SS_error + SS_subjects))
-# is already reported by ezANOVA in the `ges` column of the ANOVA table.
-effect_sizes <- mixed_anova_results$ANOVA %>%
-  mutate(
-    partial_eta_squared = round(SSn / (SSn + SSd), 5),
-    .after = p
-  )
-
-print("\nEffect Sizes (Partial Eta Squared):")
-print(effect_sizes %>% select(Effect, F, p, partial_eta_squared))
-
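If generalized eta squared is the quantity of interest, the afex package reports it directly (assuming afex is installed; it is not loaded by this script). A sketch of the same design:

a_mixed <- afex::aov_ez(
  id = "pID", dv = "MEAN_DIFFERENCE", data = long_data_clean,
  between = "TEMPORAL_DO", within = c("TIME", "DOMAIN")
)
a_mixed$anova_table  # includes a ges (generalized eta squared) column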
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(mixed_anova_results$aov, ~ TIME)
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(mixed_anova_results$aov, ~ DOMAIN)
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(mixed_anova_results$aov, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# Two-way interactions
-print("\nTIME × DOMAIN Interaction:")
-time_domain_emmeans <- emmeans(mixed_anova_results$aov, ~ TIME * DOMAIN)
-time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_contrasts)
-
-print("\nTEMPORAL_DO × TIME Interaction:")
-temporal_time_emmeans <- emmeans(mixed_anova_results$aov, ~ TEMPORAL_DO * TIME)
-temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_contrasts)
-
-print("\nTEMPORAL_DO × DOMAIN Interaction:")
-temporal_domain_emmeans <- emmeans(mixed_anova_results$aov, ~ TEMPORAL_DO * DOMAIN)
-temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_contrasts)
-
-# Three-way interaction
-print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:")
-three_way_emmeans <- emmeans(mixed_anova_results$aov, ~ TEMPORAL_DO * TIME * DOMAIN)
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni")
-print(three_way_contrasts)
-
diff --git a/.history/eohi1/mixed anova - domain means_20250912162255.r b/.history/eohi1/mixed anova - domain means_20250912162255.r
deleted file mode 100644
index 986e63b..0000000
--- a/.history/eohi1/mixed anova - domain means_20250912162255.r
+++ /dev/null
@@ -1,484 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(nortest)  # for normality tests
-library(ggplot2)  # for plotting
-library(emmeans)  # for post-hoc comparisons
-
-# Global option to remove scientific notation
-options(scipen = 999)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(dim(data))
-print(length(unique(data$pID)))
-
-# Check experimental conditions
-print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO))
-
-# Check what domain mean columns are available
-domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))]
-print(domain_mean_cols)
-
-# Verify the specific variables we need
-required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required domain mean variables found!")
-}
- time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# 
============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -# Get the actual TEMPORAL_DO levels -temporal_levels <- sort(unique(long_data_clean$TEMPORAL_DO)) -print(paste("TEMPORAL_DO levels:", paste(temporal_levels, collapse = ", "))) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Bootstrap from each group independently to create natural 
variation - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] # Remove any NA values - }) - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - calculate_hartley_ratio(sample_vars) - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - # Debug: Check what's happening - cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n") - cat("Valid bootstrap ratios:", length(valid_ratios), "\n") - cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n") - cat("Sample of valid ratios:", head(valid_ratios, 5), "\n") - - if(length(valid_ratios) == 0) { - cat("All bootstrap ratios were invalid. Sample ratios:", head(bootstrap_ratios, 10), "\n") - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN combination ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Mixed ANOVA using ezANOVA -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_results <- ezANOVA( - data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE, # include SSn and SSd in the output; the effect-size step below needs them - return_aov = TRUE -) - -print("Mixed ANOVA Results:") -print(mixed_anova_results$ANOVA) - -# Extract effect sizes -# Note: SSn / (SSn + SSd) is PARTIAL eta squared (SS_effect / (SS_effect + SS_error)); -# generalized eta squared is already reported by ezANOVA in the ges column - -effect_sizes <- mixed_anova_results$ANOVA %>% - mutate( - partial_eta_squared = round(SSn / (SSn + SSd), 5), - .after = p - ) - -print("\nEffect Sizes (Partial Eta Squared):") -print(effect_sizes %>% select(Effect, F, p, partial_eta_squared)) - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_results$aov, ~ TIME)
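
(Aside; the Bonferroni-adjusted TIME contrasts continue immediately below. Since this snapshot fits the model through ezANOVA, the generalized eta squared that the effect-size step is after is already present in the returned table; a minimal sketch of pulling it out directly, assuming a current version of the ez package that names the column ges:)

# generalized eta squared as computed by ezANOVA itself (ges column)
ges_table <- mixed_anova_results$ANOVA %>%
  select(Effect, F, p, ges)
print(ges_table)
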
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_results$aov, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_results$aov, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_results$aov, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_results$aov, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_results$aov, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_results$aov, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250912162832.r b/.history/eohi1/mixed anova - domain means_20250912162832.r deleted file mode 100644 index a287515..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912162832.r +++ /dev/null @@ -1,477 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), 
- time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# 
============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -# Get the actual TEMPORAL_DO levels -temporal_levels <- sort(unique(long_data_clean$TEMPORAL_DO)) -print(paste("TEMPORAL_DO levels:", paste(temporal_levels, collapse = ", "))) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Bootstrap from each group independently to create natural 
variation - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] # Remove any NA values - }) - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - calculate_hartley_ratio(sample_vars) - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - # Debug: Check what's happening - cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n") - cat("Valid bootstrap ratios:", length(valid_ratios), "\n") - cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n") - cat("Sample of valid ratios:", head(valid_ratios, 5), "\n") - - if(length(valid_ratios) == 0) { - cat("All bootstrap ratios were invalid. Sample ratios:", head(bootstrap_ratios, 10), "\n") - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN combination ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, the sums of squares live in the model summary -anova_summary <- summary(mixed_anova_model) - -print("\nEffect Sizes (Generalized Eta Squared):") -print("Note: Effect sizes will be calculated from the ANOVA summary") - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts)
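
(The remaining main effects and interactions follow below. For actual generalized eta squared numbers from this aov() fit, one option is sketched here; it assumes the effectsize package, which is not loaded above, and that the installed version accepts multi-stratum aov objects:)

library(effectsize)
# generalized eta squared for each effect across the error strata
ges_table <- eta_squared(mixed_anova_model, generalized = TRUE)
print(ges_table)
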
"bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_results$aov, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_results$aov, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_results$aov, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_results$aov, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_results$aov, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_results$aov, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250912162837.r b/.history/eohi1/mixed anova - domain means_20250912162837.r deleted file mode 100644 index d7dea47..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912162837.r +++ /dev/null @@ -1,467 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - 
domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# 
============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -# Get the actual TEMPORAL_DO levels -temporal_levels <- sort(unique(long_data_clean$TEMPORAL_DO)) -print(paste("TEMPORAL_DO levels:", paste(temporal_levels, collapse = ", "))) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Function to bootstrap critical values for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Debug: Check for zero or invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - cat("Warning: Invalid observed variances detected:", observed_vars, "\n") - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- calculate_hartley_ratio(observed_vars) - - # Bootstrap under null hypothesis (equal variances) - # Bootstrap from each group independently to create natural 
variation - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] # Remove any NA values - }) - - # Bootstrap F-max ratios under null hypothesis - bootstrap_ratios <- replicate(n_iter, { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - calculate_hartley_ratio(sample_vars) - }) - - # Calculate critical value (95th percentile) - # Remove any NaN or infinite values - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - # Debug: Check what's happening - cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n") - cat("Valid bootstrap ratios:", length(valid_ratios), "\n") - cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n") - cat("Sample of valid ratios:", head(valid_ratios, 5), "\n") - - if(length(valid_ratios) == 0) { - cat("All bootstrap ratios were invalid. Sample ratios:", head(bootstrap_ratios, 10), "\n") - stop("No valid bootstrap ratios generated") - } - - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - bootstrap_ratios = bootstrap_ratios - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN combination ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -print("\nEffect Sizes (Generalized Eta Squared):") -print("Note: Effect sizes will be calculated from the ANOVA summary") - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts)
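
(The two-way and three-way interaction contrasts continue below. As a compact alternative for this whole design, a sketch that assumes the afex package is installed: aov_ez() wraps the same aov()-with-Error() fit, applies type-III sums of squares, reports generalized eta squared per effect, and its result plugs straight into emmeans:)

library(afex)
# 2 (TEMPORAL_DO, between) x 2 (TIME, within) x 4 (DOMAIN, within) mixed design
afex_model <- aov_ez(
  id = "pID",
  dv = "MEAN_DIFFERENCE",
  data = long_data_clean,
  between = "TEMPORAL_DO",
  within = c("TIME", "DOMAIN"),
  anova_table = list(es = "ges") # generalized eta squared
)
print(afex_model)
# e.g. emmeans(afex_model, ~ TIME) reproduces the post-hoc workflow below
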
- -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250912162851.r b/.history/eohi1/mixed anova - domain means_20250912162851.r deleted file mode 100644 index 4b3b99e..0000000 --- a/.history/eohi1/mixed anova - domain means_20250912162851.r +++ /dev/null @@ -1,467 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level 
<- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. 
Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    # pass the current group explicitly via pick() so leveneTest()
-    # does not depend on where the formula happens to be evaluated
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME, data = pick(MEAN_DIFFERENCE, TIME))$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME, data = pick(MEAN_DIFFERENCE, TIME))$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN, data = pick(MEAN_DIFFERENCE, DOMAIN))$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN, data = pick(MEAN_DIFFERENCE, DOMAIN))$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: test homogeneity across the BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-# First, check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
-cat("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===\n")
-
-# Get the actual TEMPORAL_DO levels
-temporal_levels <- sort(unique(long_data_clean$TEMPORAL_DO))
-print(paste("TEMPORAL_DO levels:", paste(temporal_levels, collapse = ", ")))
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Variance for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
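Hartley's F-max is simply the largest within-cell variance divided by the smallest. The classical critical values for this ratio assume normal data and equal group sizes, which the normality results above already rule out; hence the bootstrap criticals constructed next. A one-line worked example with the helper just defined:

# Worked example: variances of 0.8 and 1.2 give F-max = 1.2 / 0.8 = 1.5
calculate_hartley_ratio(c(0.8, 1.2))
#> [1] 1.5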
-# Function to bootstrap critical values for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups
-  groups <- unique(data[[group_var]])
-  n_groups <- length(groups)
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    group_by(!!sym(group_var)) %>%
-    summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    pull(var)
-
-  # Debug: Check for zero or invalid variances
-  if (any(observed_vars <= 0 | is.na(observed_vars))) {
-    cat("Warning: Invalid observed variances detected:", observed_vars, "\n")
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- calculate_hartley_ratio(observed_vars)
-
-  # Bootstrap under the null hypothesis (equal variances):
-  # resample each group independently to create natural variation
-  group_data_list <- map(groups, ~ {
-    # [[ extraction returns a plain vector even when data is a tibble,
-    # unlike data[rows, col], which would return a one-column tibble
-    group_data <- data[[response_var]][data[[group_var]] == .x]
-    group_data[!is.na(group_data)]  # Remove any NA values
-  })
-
-  # Bootstrap F-max ratios under the null hypothesis
-  bootstrap_ratios <- replicate(n_iter, {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    calculate_hartley_ratio(sample_vars)
-  })
-
-  # Calculate critical value (95th percentile),
-  # dropping NaN, infinite, and NA values first (is.finite() excludes all three)
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios)]
-
-  # Debug: Check what's happening
-  cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n")
-  cat("Valid bootstrap ratios:", length(valid_ratios), "\n")
-  cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n")
-  cat("Sample of valid ratios:", head(valid_ratios, 5), "\n")
-
-  if (length(valid_ratios) == 0) {
-    cat("All bootstrap ratios were invalid. Sample ratios:", head(bootstrap_ratios, 10), "\n")
-    stop("No valid bootstrap ratios generated")
-  }
-
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    bootstrap_ratios = bootstrap_ratios
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-cat("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN combination ===\n")
-set.seed(123)  # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Mixed ANOVA using aov()
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-
-mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                         data = long_data_clean)
-
-print("Mixed ANOVA Results:")
-print(summary(mixed_anova_model))
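summary(aov()) reports no effect sizes, and none are computed in this snapshot despite the note below. Since ez is loaded but unused here, one cross-check that would yield generalized eta squared alongside Mauchly's sphericity test and Greenhouse-Geisser/Huynh-Feldt corrections for the 4-level DOMAIN factor is ezANOVA. A sketch, assuming every pID has a complete set of the eight TIME × DOMAIN cells:

# Cross-check with ez::ezANOVA, which reports generalized eta squared (ges),
# Mauchly's test, and sphericity corrections for the within-subjects terms
ez_results <- ezANOVA(
  data     = long_data_clean,
  dv       = MEAN_DIFFERENCE,
  wid      = pID,
  within   = .(TIME, DOMAIN),
  between  = TEMPORAL_DO,
  type     = 3,
  detailed = TRUE
)
print(ez_results)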
-# Extract effect sizes (generalized eta squared)
-# For aov() objects, these must be computed from the sums of squares in the summary
-anova_summary <- summary(mixed_anova_model)
-
-cat("\nEffect Sizes (Generalized Eta Squared):\n")
-print("Note: effect sizes must be computed from the sums of squares in the ANOVA summary")
-
-# Post-hoc comparisons using emmeans
-cat("\n=== POST-HOC COMPARISONS ===\n")
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(mixed_anova_model, ~ TIME)
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-cat("\nMain Effect of DOMAIN:\n")
-domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN)
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-cat("\nMain Effect of TEMPORAL_DO:\n")
-temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# Two-way interactions
-cat("\nTIME × DOMAIN Interaction:\n")
-time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN)
-time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_contrasts)
-
-cat("\nTEMPORAL_DO × TIME Interaction:\n")
-temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME)
-temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_contrasts)
-
-cat("\nTEMPORAL_DO × DOMAIN Interaction:\n")
-temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN)
-temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_contrasts)
-
-# Three-way interaction
-cat("\nTEMPORAL_DO × TIME × DOMAIN Interaction:\n")
-three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni")
-print(three_way_contrasts)
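The pairwise contrasts above are easier to read alongside a plot of the cell means, and ggplot2 is already loaded for exactly this purpose. A sketch:

# Interaction plot of cell means: TIME × DOMAIN, panelled by the
# between-subjects TEMPORAL_DO condition
interaction_plot <- long_data_clean %>%
  group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
  summarise(mean_diff = mean(MEAN_DIFFERENCE), .groups = "drop") %>%
  ggplot(aes(x = DOMAIN, y = mean_diff, colour = TIME, group = TIME)) +
  geom_point() +
  geom_line() +
  facet_wrap(~ TEMPORAL_DO) +
  labs(y = "Mean difference", title = "Cell means by TIME, DOMAIN, and TEMPORAL_DO")
print(interaction_plot)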
diff --git a/.history/eohi1/mixed anova - domain means_20250912162858.r b/.history/eohi1/mixed anova - domain means_20250912162858.r
deleted file mode 100644
index 4b3b99e..0000000
diff --git a/.history/eohi1/mixed anova - domain means_20250912163014.r b/.history/eohi1/mixed anova - domain means_20250912163014.r
deleted file mode 100644
index 4b3b99e..0000000
diff --git a/.history/eohi1/mixed anova - domain means_20250912164147.r b/.history/eohi1/mixed anova - domain means_20250912164147.r
deleted file mode 100644
index 4faa42e..0000000
domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", 
dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Normality tests -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_W = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$statistic, - NA), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_A = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$statistic, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) - -print("Normality test results:") -print(normality_results) - -# Note: Anderson-Darling p-values may appear identical due to machine precision limits -# All p-values are extremely small (< 2.2e-16) indicating strong non-normality -# The test statistics (A values) are actually different across conditions - -# Debug: Check if Anderson-Darling test is working properly -print("\n=== DEBUG: Anderson-Darling Test Details ===") - -# Get unique combinations of TIME and DOMAIN -unique_combos <- long_data_clean %>% - select(TIME, DOMAIN) %>% - distinct() - -# Run Anderson-Darling test for each combination -for(i in 1:nrow(unique_combos)) { - time_val <- unique_combos$TIME[i] - domain_val <- unique_combos$DOMAIN[i] - - # Subset data for this combination - subset_data <- long_data_clean %>% - filter(TIME == time_val, DOMAIN == domain_val) - - cat("TIME:", time_val, "DOMAIN:", domain_val, "n =", nrow(subset_data), "\n") - - # Run Anderson-Darling test - if(nrow(subset_data) >= 7) { - ad_result <- ad.test(subset_data$MEAN_DIFFERENCE) - print(ad_result) - } else { - cat("Sample size too small for Anderson-Darling test\n") - } - cat("\n") -} - -# 4. 
Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-# First, let's check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-
-# Get the actual TEMPORAL_DO levels
-temporal_levels <- sort(unique(long_data_clean$TEMPORAL_DO))
-print(paste("TEMPORAL_DO levels:", paste(temporal_levels, collapse = ", ")))
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # Calculate F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# Function to bootstrap critical values for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-  n_groups <- length(groups)
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    group_by(!!sym(group_var)) %>%
-    summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    pull(var)
-
-  # Debug: Check for zero or invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    cat("Warning: Invalid observed variances detected:", observed_vars, "\n")
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- calculate_hartley_ratio(observed_vars)
-
-  # Bootstrap under null hypothesis (equal variances)
-  # Bootstrap from each group independently to create natural variation
-  group_data_list <- map(groups, ~ {
-    group_data <- data[[response_var]][data[[group_var]] == .x]  # extract the column as a vector
-    group_data[!is.na(group_data)]  # Remove any NA values
-  })
-
-  # Bootstrap F-max ratios under null hypothesis
-  bootstrap_ratios <- replicate(n_iter, {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    calculate_hartley_ratio(sample_vars)
-  })
-
-  # Calculate critical value (95th percentile)
-  # Remove any NaN or infinite values
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  # Debug: Check what's happening
-  cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n")
-  cat("Valid bootstrap ratios:", length(valid_ratios), "\n")
-  cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n")
-  cat("Sample of valid ratios:", head(valid_ratios, 5), "\n")
-
-  if(length(valid_ratios) == 0) {
-    cat("All bootstrap ratios were invalid. Sample ratios:", head(bootstrap_ratios, 10), "\n")
-    stop("No valid bootstrap ratios generated")
-  }
-
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    bootstrap_ratios = bootstrap_ratios
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Mixed ANOVA using aov()
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-
-mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                         data = long_data_clean)
-
-print("Mixed ANOVA Results:")
-print(summary(mixed_anova_model))
-
-# Extract effect sizes (generalized eta squared)
-# For aov() objects, we need to extract from the summary
-anova_summary <- summary(mixed_anova_model)
-
-print("\nEffect Sizes (Generalized Eta Squared):")
-print("Note: Effect sizes will be calculated from the ANOVA summary")
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(mixed_anova_model, ~ TIME)
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN)
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# Two-way interactions
-print("\nTIME × DOMAIN Interaction:")
-time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN)
-time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_contrasts)
-
-print("\nTEMPORAL_DO × TIME Interaction:")
-temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME)
-temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_contrasts)
-
-print("\nTEMPORAL_DO × DOMAIN Interaction:")
-temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN)
-temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_contrasts)
-
-# Three-way interaction
-print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:")
-three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni")
-print(three_way_contrasts)
-
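The snapshot above stops at a placeholder note instead of actually computing generalized eta squared. A minimal sketch of one way to get it, assuming the effectsize package is installed and reusing the mixed_anova_model name from the script (a sketch, not part of the tracked file):

    # Generalized eta squared for the aovlist fit; TEMPORAL_DO is the
    # only observed (non-manipulated) factor in this design.
    library(effectsize)
    ges <- eta_squared(mixed_anova_model, generalized = "TEMPORAL_DO")
    print(ges)
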
diff --git a/.history/eohi1/mixed anova - domain means_20250915110342.r b/.history/eohi1/mixed anova - domain means_20250915110342.r
deleted file mode 100644
index 635147d..0000000
--- a/.history/eohi1/mixed anova - domain means_20250915110342.r
+++ /dev/null
@@ -1,460 +0,0 @@
-# mixed anova not working
-# 12/09/2025
-# add sum contrasts
-
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(nortest)  # For normality tests
-library(ggplot2)  # For plotting
-library(emmeans)  # For post-hoc comparisons
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(dim(data))
-print(length(unique(data$pID)))
-
-# Check experimental conditions
-print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO))
-
-# Check what domain mean columns are available
-domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))]
-print(domain_mean_cols)
-
-# Verify the specific variables we need
-required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required domain mean variables found!")
-}
-
-# Define domain mapping
-domain_mapping <- data.frame(
-  variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-               "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
-  time = c(rep("Past", 4), rep("Future", 4)),
-  domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
-  stringsAsFactors = FALSE
-)
-
-print(domain_mapping)
-
-# More efficient data pivoting using tidyr
-pivot_domain_means <- function(data, domain_mapping) {
-  # Use pivot_longer for efficient reshaping
-  long_data <- data %>%
-    select(pID, ResponseId, TEMPORAL_DO, all_of(domain_mapping$variable)) %>%
-    pivot_longer(
-      cols = all_of(domain_mapping$variable),
-      names_to = "variable",
-      values_to = "MEAN_DIFFERENCE"
-    ) %>%
-    left_join(domain_mapping, by = "variable") %>%
-    select(-variable) %>%
-    # Convert to factors with proper levels
-    mutate(
-      TIME = factor(time, levels = c("Past", "Future")),
-      DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")),
-      pID = as.factor(pID),
-      TEMPORAL_DO = as.factor(TEMPORAL_DO)
-    ) %>%
-    select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE)
-
-  return(long_data)
-}
-
-# Pivot data to long format
-tryCatch({
-  long_data <- pivot_domain_means(data, domain_mapping)
-}, error = function(e) {
-  print(paste("Error in data pivoting:", e$message))
-  stop("Cannot proceed without proper data structure")
-})
-
-print(dim(long_data))
-print(length(unique(long_data$pID)))
-print(levels(long_data$TIME))
-print(levels(long_data$DOMAIN))
-
-# Check data types
-print(is.factor(long_data$TIME))
-print(is.factor(long_data$DOMAIN))
-print(is.factor(long_data$pID))
-print(is.numeric(long_data$MEAN_DIFFERENCE))
-
-# Show first 20 rows
-print(utils::head(long_data, 20))
-
-# Display structure and sample
-str(long_data)
-
-print(utils::head(long_data, 10))
-
-# Show example data for one participant
-participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")]
-print(participant_1_data)
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME and DOMAIN
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5),
-    min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME and DOMAIN:")
-print(desc_stats)
-
-# Descriptive statistics by between-subjects factors
-desc_stats_by_temporal <- long_data %>%
-  group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:")
-print(desc_stats_by_temporal)
-
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing
-long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ]
-print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x ")))
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(MEAN_DIFFERENCE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME and DOMAIN:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Normality tests
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    shapiro_W = ifelse(n >= 3 & n <= 5000,
-                       shapiro.test(MEAN_DIFFERENCE)$statistic,
-                       NA),
-    shapiro_p = ifelse(n >= 3 & n <= 5000,
-                       shapiro.test(MEAN_DIFFERENCE)$p.value,
-                       NA),
-    anderson_A = ifelse(n >= 7,
-                        ad.test(MEAN_DIFFERENCE)$statistic,
-                        NA),
-    anderson_p = ifelse(n >= 7,
-                        ad.test(MEAN_DIFFERENCE)$p.value,
-                        NA),
-    .groups = 'drop'
-  )
-
-print("Normality test results:")
-print(normality_results)
-
-# Note: Anderson-Darling p-values may appear identical due to machine precision limits
-# All p-values are extremely small (< 2.2e-16) indicating strong non-normality
-# The test statistics (A values) are actually different across conditions
-
-# Debug: Check if Anderson-Darling test is working properly
-print("\n=== DEBUG: Anderson-Darling Test Details ===")
-
-# Get unique combinations of TIME and DOMAIN
-unique_combos <- long_data_clean %>%
-  select(TIME, DOMAIN) %>%
-  distinct()
-
-# Run Anderson-Darling test for each combination
-for(i in 1:nrow(unique_combos)) {
-  time_val <- unique_combos$TIME[i]
-  domain_val <- unique_combos$DOMAIN[i]
-
-  # Subset data for this combination
-  subset_data <- long_data_clean %>%
-    filter(TIME == time_val, DOMAIN == domain_val)
-
-  # as.character() so cat() prints the factor labels, not integer codes
-  cat("TIME:", as.character(time_val), "DOMAIN:", as.character(domain_val), "n =", nrow(subset_data), "\n")
-
-  # Run Anderson-Darling test
-  if(nrow(subset_data) >= 7) {
-    ad_result <- ad.test(subset_data$MEAN_DIFFERENCE)
-    print(ad_result)
-  } else {
-    cat("Sample size too small for Anderson-Darling test\n")
-  }
-  cat("\n")
-}
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-# First, let's check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-
-# Get the actual TEMPORAL_DO levels
-temporal_levels <- sort(unique(long_data_clean$TEMPORAL_DO))
-print(paste("TEMPORAL_DO levels:", paste(temporal_levels, collapse = ", ")))
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # Calculate F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# Function to bootstrap critical values for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-  n_groups <- length(groups)
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    group_by(!!sym(group_var)) %>%
-    summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    pull(var)
-
-  # Debug: Check for zero or invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    cat("Warning: Invalid observed variances detected:", observed_vars, "\n")
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- calculate_hartley_ratio(observed_vars)
-
-  # Bootstrap under null hypothesis (equal variances)
-  # Bootstrap from each group independently to create natural variation
-  group_data_list <- map(groups, ~ {
-    group_data <- data[[response_var]][data[[group_var]] == .x]  # extract the column as a vector
-    group_data[!is.na(group_data)]  # Remove any NA values
-  })
-
-  # Bootstrap F-max ratios under null hypothesis
-  bootstrap_ratios <- replicate(n_iter, {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    calculate_hartley_ratio(sample_vars)
-  })
-
-  # Calculate critical value (95th percentile)
-  # Remove any NaN or infinite values
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  # Debug: Check what's happening
-  cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n")
-  cat("Valid bootstrap ratios:", length(valid_ratios), "\n")
-  cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n")
-  cat("Sample of valid ratios:", head(valid_ratios, 5), "\n")
-
-  if(length(valid_ratios) == 0) {
-    cat("All bootstrap ratios were invalid. Sample ratios:", head(bootstrap_ratios, 10), "\n")
-    stop("No valid bootstrap ratios generated")
-  }
-
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    bootstrap_ratios = bootstrap_ratios
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Mixed ANOVA using aov()
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-
-mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                         data = long_data_clean)
-
-print("Mixed ANOVA Results:")
-print(summary(mixed_anova_model))
-
-# Extract effect sizes (generalized eta squared)
-# For aov() objects, we need to extract from the summary
-anova_summary <- summary(mixed_anova_model)
-
-print("\nEffect Sizes (Generalized Eta Squared):")
-print("Note: Effect sizes will be calculated from the ANOVA summary")
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(mixed_anova_model, ~ TIME)
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN)
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# Two-way interactions
-print("\nTIME × DOMAIN Interaction:")
-time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN)
-time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_contrasts)
-
-print("\nTEMPORAL_DO × TIME Interaction:")
-temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME)
-temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_contrasts)
-
-print("\nTEMPORAL_DO × DOMAIN Interaction:")
-temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN)
-temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_contrasts)
-
-# Three-way interaction
-print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:")
-three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni")
-print(three_way_contrasts)
-
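Every snapshot in this series carries the header note "add sum contrasts" but never acts on it. Sum-to-zero coding matters if Type III sums of squares are wanted; a minimal sketch under that assumption, using the long_data_clean name from the scripts:

    # Globally, before calling aov():
    options(contrasts = c("contr.sum", "contr.poly"))

    # Or per factor, leaving the global option untouched:
    contrasts(long_data_clean$TEMPORAL_DO) <- contr.sum(nlevels(long_data_clean$TEMPORAL_DO))
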
diff --git a/.history/eohi1/mixed anova - domain means_20250915110402.r b/.history/eohi1/mixed anova - domain means_20250915110402.r
deleted file mode 100644
index 24f5e51..0000000
--- a/.history/eohi1/mixed anova - domain means_20250915110402.r
+++ /dev/null
@@ -1,417 +0,0 @@
-# mixed anova not working
-# 12/09/2025
-# add sum contrasts
-
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(nortest)  # For normality tests
-library(ggplot2)  # For plotting
-library(emmeans)  # For post-hoc comparisons
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(dim(data))
-print(length(unique(data$pID)))
-
-# Check experimental conditions
-print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO))
-
-# Check what domain mean columns are available
-domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))]
-print(domain_mean_cols)
-
-# Verify the specific variables we need
-required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required domain mean variables found!")
-}
-
-# Define domain mapping
-domain_mapping <- data.frame(
-  variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-               "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
-  time = c(rep("Past", 4), rep("Future", 4)),
-  domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
-  stringsAsFactors = FALSE
-)
-
-print(domain_mapping)
-
-# More efficient data pivoting using tidyr
-pivot_domain_means <- function(data, domain_mapping) {
-  # Use pivot_longer for efficient reshaping
-  long_data <- data %>%
-    select(pID, ResponseId, TEMPORAL_DO, all_of(domain_mapping$variable)) %>%
-    pivot_longer(
-      cols = all_of(domain_mapping$variable),
-      names_to = "variable",
-      values_to = "MEAN_DIFFERENCE"
-    ) %>%
-    left_join(domain_mapping, by = "variable") %>%
-    select(-variable) %>%
-    # Convert to factors with proper levels
-    mutate(
-      TIME = factor(time, levels = c("Past", "Future")),
-      DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")),
-      pID = as.factor(pID),
-      TEMPORAL_DO = as.factor(TEMPORAL_DO)
-    ) %>%
-    select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE)
-
-  return(long_data)
-}
-
-# Pivot data to long format
-tryCatch({
-  long_data <- pivot_domain_means(data, domain_mapping)
-}, error = function(e) {
-  print(paste("Error in data pivoting:", e$message))
-  stop("Cannot proceed without proper data structure")
-})
-
-print(dim(long_data))
-print(length(unique(long_data$pID)))
-print(levels(long_data$TIME))
-print(levels(long_data$DOMAIN))
-
-# Check data types
-print(is.factor(long_data$TIME))
-print(is.factor(long_data$DOMAIN))
-print(is.factor(long_data$pID))
-print(is.numeric(long_data$MEAN_DIFFERENCE))
-
-# Show first 20 rows
-print(utils::head(long_data, 20))
-
-# Display structure and sample
-str(long_data)
-
-print(utils::head(long_data, 10))
-
-# Show example data for one participant
-participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")]
-print(participant_1_data)
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME and DOMAIN
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5),
-    min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME and DOMAIN:")
-print(desc_stats)
-
-# Descriptive statistics by between-subjects factors
-desc_stats_by_temporal <- long_data %>%
-  group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:")
-print(desc_stats_by_temporal)
-
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing
-long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ]
-print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x ")))
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(MEAN_DIFFERENCE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME and DOMAIN:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test (streamlined)
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(MEAN_DIFFERENCE)$statistic,
-    ad_p_value = ad.test(MEAN_DIFFERENCE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# round only the numeric columns; round() on the whole tibble fails on the factors
-print(normality_results %>% mutate(across(where(is.numeric), ~ round(.x, 5))))
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-# First, let's check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-
-# Get the actual TEMPORAL_DO levels
-temporal_levels <- sort(unique(long_data_clean$TEMPORAL_DO))
-print(paste("TEMPORAL_DO levels:", paste(temporal_levels, collapse = ", ")))
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # Calculate F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# Function to bootstrap critical values for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-  n_groups <- length(groups)
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    group_by(!!sym(group_var)) %>%
-    summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    pull(var)
-
-  # Debug: Check for zero or invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    cat("Warning: Invalid observed variances detected:", observed_vars, "\n")
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- calculate_hartley_ratio(observed_vars)
-
-  # Bootstrap under null hypothesis (equal variances)
-  # Bootstrap from each group independently to create natural variation
-  group_data_list <- map(groups, ~ {
-    group_data <- data[[response_var]][data[[group_var]] == .x]  # extract the column as a vector
-    group_data[!is.na(group_data)]  # Remove any NA values
-  })
-
-  # Bootstrap F-max ratios under null hypothesis
-  bootstrap_ratios <- replicate(n_iter, {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    calculate_hartley_ratio(sample_vars)
-  })
-
-  # Calculate critical value (95th percentile)
-  # Remove any NaN or infinite values
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  # Debug: Check what's happening
-  cat("Total bootstrap ratios:", length(bootstrap_ratios), "\n")
-  cat("Valid bootstrap ratios:", length(valid_ratios), "\n")
-  cat("Sample of bootstrap ratios:", head(bootstrap_ratios, 5), "\n")
-  cat("Sample of valid ratios:", head(valid_ratios, 5), "\n")
-
-  if(length(valid_ratios) == 0) {
-    cat("All bootstrap ratios were invalid. Sample ratios:", head(bootstrap_ratios, 10), "\n")
-    stop("No valid bootstrap ratios generated")
-  }
-
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    bootstrap_ratios = bootstrap_ratios
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Mixed ANOVA using aov()
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-
-mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                         data = long_data_clean)
-
-print("Mixed ANOVA Results:")
-print(summary(mixed_anova_model))
-
-# Extract effect sizes (generalized eta squared)
-# For aov() objects, we need to extract from the summary
-anova_summary <- summary(mixed_anova_model)
-
-print("\nEffect Sizes (Generalized Eta Squared):")
-print("Note: Effect sizes will be calculated from the ANOVA summary")
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(mixed_anova_model, ~ TIME)
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN)
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# Two-way interactions
-print("\nTIME × DOMAIN Interaction:")
-time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN)
-time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_contrasts)
-
-print("\nTEMPORAL_DO × TIME Interaction:")
-temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME)
-temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_contrasts)
-
-print("\nTEMPORAL_DO × DOMAIN Interaction:")
-temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN)
-temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_contrasts)
-
-# Three-way interaction
-print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:")
-three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni")
-print(three_way_contrasts)
-
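The "mixed anova not working" note at the top of these snapshots is one motivation for a higher-level interface. A minimal sketch, assuming the afex package is installed and reusing the script's variable names: aov_ez() fits the same 2 (TEMPORAL_DO) x 2 (TIME) x 4 (DOMAIN) mixed design, applies sum-to-zero contrasts itself, and reports generalized eta squared directly, covering both unresolved header notes.

    library(afex)
    mixed_afex <- aov_ez(
      id = "pID",
      dv = "MEAN_DIFFERENCE",
      data = long_data_clean,
      between = "TEMPORAL_DO",
      within = c("TIME", "DOMAIN"),
      observed = "TEMPORAL_DO",          # treated as observed for effect sizes
      anova_table = list(es = "ges")     # generalized eta squared
    )
    print(mixed_afex)
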
diff --git a/.history/eohi1/mixed anova - domain means_20250915110435.r b/.history/eohi1/mixed anova - domain means_20250915110435.r
deleted file mode 100644
index f3d3f29..0000000
--- a/.history/eohi1/mixed anova - domain means_20250915110435.r
+++ /dev/null
@@ -1,412 +0,0 @@
-# mixed anova not working
-# 12/09/2025
-# add sum contrasts
-
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(nortest)  # For normality tests
-library(ggplot2)  # For plotting
-library(emmeans)  # For post-hoc comparisons
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(dim(data))
-print(length(unique(data$pID)))
-
-# Check experimental conditions
-print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO))
-
-# Check what domain mean columns are available
-domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))]
-print(domain_mean_cols)
-
-# Verify the specific variables we need
-required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required domain mean variables found!")
-}
-
-# Define domain mapping
-domain_mapping <- data.frame(
-  variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-               "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
-  time = c(rep("Past", 4), rep("Future", 4)),
-  domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
-  stringsAsFactors = FALSE
-)
-
-print(domain_mapping)
-
-# More efficient data pivoting using tidyr
-pivot_domain_means <- function(data, domain_mapping) {
-  # Use pivot_longer for efficient reshaping
-  long_data <- data %>%
-    select(pID, ResponseId, TEMPORAL_DO, all_of(domain_mapping$variable)) %>%
-    pivot_longer(
-      cols = all_of(domain_mapping$variable),
-      names_to = "variable",
-      values_to = "MEAN_DIFFERENCE"
-    ) %>%
-    left_join(domain_mapping, by = "variable") %>%
-    select(-variable) %>%
-    # Convert to factors with proper levels
-    mutate(
-      TIME = factor(time, levels = c("Past", "Future")),
-      DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")),
-      pID = as.factor(pID),
-      TEMPORAL_DO = as.factor(TEMPORAL_DO)
-    ) %>%
-    select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE)
-
-  return(long_data)
-}
-
-# Pivot data to long format
-tryCatch({
-  long_data <- pivot_domain_means(data, domain_mapping)
-}, error = function(e) {
-  print(paste("Error in data pivoting:", e$message))
-  stop("Cannot proceed without proper data structure")
-})
-
-print(dim(long_data))
-print(length(unique(long_data$pID)))
-print(levels(long_data$TIME))
-print(levels(long_data$DOMAIN))
-
-# Check data types
-print(is.factor(long_data$TIME))
-print(is.factor(long_data$DOMAIN))
-print(is.factor(long_data$pID))
-print(is.numeric(long_data$MEAN_DIFFERENCE))
-
-# Show first 20 rows
-print(utils::head(long_data, 20))
-
-# Display structure and sample
-str(long_data)
-
-print(utils::head(long_data, 10))
-
-# Show example data for one participant
-participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")]
-print(participant_1_data)
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME and DOMAIN
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5),
-    min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME and DOMAIN:")
-print(desc_stats)
-
-# Descriptive statistics by between-subjects factors
-desc_stats_by_temporal <- long_data %>%
-  group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:")
-print(desc_stats_by_temporal)
-
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing
-long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ]
-print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x ")))
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(MEAN_DIFFERENCE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME and DOMAIN:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test (streamlined)
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(MEAN_DIFFERENCE)$statistic,
-    ad_p_value = ad.test(MEAN_DIFFERENCE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# round only the numeric columns; round() on the whole tibble fails on the factors
-print(normality_results %>% mutate(across(where(is.numeric), ~ round(.x, 5))))
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-# First, let's check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-
-# Get the actual TEMPORAL_DO levels
-temporal_levels <- sort(unique(long_data_clean$TEMPORAL_DO))
-print(paste("TEMPORAL_DO levels:", paste(temporal_levels, collapse = ", ")))
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # Calculate F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# More efficient bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-  n_groups <- length(groups)
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    group_by(!!sym(group_var)) %>%
-    summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once
-  group_data_list <- map(groups, ~ {
-    group_data <- data[[response_var]][data[[group_var]] == .x]  # extract the column as a vector
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Mixed ANOVA using aov()
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-
-mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                         data = long_data_clean)
-
-print("Mixed ANOVA Results:")
-print(summary(mixed_anova_model))
-
-# Extract effect sizes (generalized eta squared)
-# For aov() objects, we need to extract from the summary
-anova_summary <- summary(mixed_anova_model)
-
-print("\nEffect Sizes (Generalized Eta Squared):")
-print("Note: Effect sizes will be calculated from the ANOVA summary")
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(mixed_anova_model, ~ TIME)
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN)
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# Two-way interactions
-print("\nTIME × DOMAIN Interaction:")
-time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN)
-time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_contrasts)
-
-print("\nTEMPORAL_DO × TIME Interaction:")
-temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME)
-temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_contrasts)
-
-print("\nTEMPORAL_DO × DOMAIN Interaction:")
-temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN)
-temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_contrasts)
-
-# Three-way interaction
-print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:")
-three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni")
-print(three_way_contrasts)
-
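A self-contained toy run of the bootstrap F-max idea on simulated data (all names here are illustrative, not from the tracked scripts). One caveat worth noting: resampling each group from its own observations preserves any real variance difference, so this approximates a null distribution only when the group variances are in fact similar.

    set.seed(42)
    g1 <- rnorm(100, sd = 1)
    g2 <- rnorm(100, sd = 1)  # equal variances, so the null is true
    obs_ratio <- max(var(g1), var(g2)) / min(var(g1), var(g2))
    boot_ratios <- replicate(2000, {
      v1 <- var(sample(g1, replace = TRUE))  # sample() defaults to n = length(g1)
      v2 <- var(sample(g2, replace = TRUE))
      max(v1, v2) / min(v1, v2)
    })
    crit_95 <- quantile(boot_ratios, 0.95)
    print(c(observed = obs_ratio, critical_95 = unname(crit_95)))
    print(obs_ratio > crit_95)  # expect FALSE under equal variances
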
pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -print(round(normality_results, 5)) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -# Get the actual TEMPORAL_DO levels -temporal_levels <- sort(unique(long_data_clean$TEMPORAL_DO)) -print(paste("TEMPORAL_DO levels:", paste(temporal_levels, collapse = ", "))) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - 
observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN combination ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -print("\nEffect Sizes (Generalized Eta Squared):") -print("Note: Effect sizes will be calculated from the ANOVA summary") - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") 
-
-# Two-way interactions
-print("\nTIME × DOMAIN Interaction:")
-time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN)
-time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_contrasts)
-
-print("\nTEMPORAL_DO × TIME Interaction:")
-temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME)
-temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_contrasts)
-
-print("\nTEMPORAL_DO × DOMAIN Interaction:")
-temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN)
-temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_contrasts)
-
-# Three-way interaction
-print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:")
-three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni")
-print(three_way_contrasts)
-
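The header comment flags this model as "not working", and ez is loaded at the top of the script but never used. A hedged cross-check sketch with ez::ezANOVA, which adds Mauchly's sphericity test and Greenhouse-Geisser corrections that the plain aov() fit above does not report; it assumes every participant has complete TIME × DOMAIN cells:

ez_check <- ezANOVA(
  data = long_data_clean,
  dv = MEAN_DIFFERENCE,
  wid = pID,
  within = .(TIME, DOMAIN),
  between = .(TEMPORAL_DO),
  type = 3,          # Type-III sums of squares
  detailed = TRUE
)

print(ez_check$ANOVA)
print(ez_check$`Mauchly's Test for Sphericity`)
print(ez_check$`Sphericity Corrections`)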
diff --git a/.history/eohi1/mixed anova - domain means_20250915110518.r b/.history/eohi1/mixed anova - domain means_20250915110518.r
deleted file mode 100644
index ec24da1..0000000
--- a/.history/eohi1/mixed anova - domain means_20250915110518.r
+++ /dev/null
@@ -1,381 +0,0 @@
-# mixed anova not working
-# 12/09/2025
-# add sum contrasts
-
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(nortest)  # For normality tests
-library(ggplot2)  # For plotting
-library(emmeans)  # For post-hoc comparisons
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x")))
-print(paste("Number of participants:", length(unique(data$pID))))
-
-# Verify the specific variables we need
-required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required domain mean variables found!")
-}
-
-# Define domain mapping
-domain_mapping <- data.frame(
-  variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-               "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
-  time = c(rep("Past", 4), rep("Future", 4)),
-  domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
-  stringsAsFactors = FALSE
-)
-
-# Domain mapping created
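A quick hedged sanity check on the mapping table just defined: there should be one row per source column, eight in all, pairing each *_pref/_pers/_val/_life variable with its TIME × DOMAIN cell:

# 4 domains × 2 time points = 8 source columns
stopifnot(nrow(domain_mapping) == 8)
print(head(domain_mapping, 4))
# Expected from the definition above: NPast_mean_pref -> Past/Preferences,
# NPast_mean_pers -> Past/Personality, NPast_mean_val -> Past/Values,
# NPast_mean_life -> Past/Life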
-
-# More efficient data pivoting using tidyr
-pivot_domain_means <- function(data, domain_mapping) {
-  # Use pivot_longer for efficient reshaping
-  long_data <- data
%>% - select(pID, ResponseId, TEMPORAL_DO, all_of(domain_mapping$variable)) %>% - pivot_longer( - cols = all_of(domain_mapping$variable), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - select(-variable) %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -print(round(normality_results, 5)) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups 
<- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -print("\nEffect Sizes (Generalized Eta Squared):") -print("Note: Effect sizes will be calculated from the ANOVA summary") - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = 
"bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915110521.r b/.history/eohi1/mixed anova - domain means_20250915110521.r deleted file mode 100644 index b6661b8..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915110521.r +++ /dev/null @@ -1,380 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting using tidyr -pivot_domain_means <- function(data, domain_mapping) { - # Use pivot_longer for efficient reshaping - long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, 
all_of(domain_mapping$variable)) %>% - pivot_longer( - cols = all_of(domain_mapping$variable), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - select(-variable) %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -print(round(normality_results, 5)) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups 
<- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# Effect sizes will be calculated separately - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO 
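# (Note: TEMPORAL_DO has only two levels, so the pairwise contrast below is a
#  single comparison -- the Bonferroni adjustment is a no-op with one test, and
#  the contrast is essentially equivalent to the between-subjects omnibus F.)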
-print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915110535.r b/.history/eohi1/mixed anova - domain means_20250915110535.r deleted file mode 100644 index 41072de..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915110535.r +++ /dev/null @@ -1,381 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting using tidyr -pivot_domain_means <- function(data, domain_mapping) { - # Use pivot_longer for efficient reshaping - long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(domain_mapping$variable)) %>% - 
pivot_longer( - cols = all_of(domain_mapping$variable), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - select(-variable) %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -print(round(normality_results, 5)) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - n_groups <- length(groups) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for 
bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# Effect sizes will be calculated separately - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") 
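# (Note: emmeans() accepts multi-stratum aovlist models such as
#  mixed_anova_model, but it warns that the estimated marginal means can be
#  biased unless the design is perfectly balanced; with unequal group sizes,
#  refitting via afex::aov_ez and calling emmeans() on that object is one
#  hedged alternative.)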
-temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915110539.r b/.history/eohi1/mixed anova - domain means_20250915110539.r deleted file mode 100644 index b70cdbd..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915110539.r +++ /dev/null @@ -1,380 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting using tidyr -pivot_domain_means <- function(data, domain_mapping) { - # Use pivot_longer for efficient reshaping - long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(domain_mapping$variable)) %>% - pivot_longer( - cols = all_of(domain_mapping$variable), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - select(-variable) %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - select(pID, ResponseId, TEMPORAL_DO, TIME, 
DOMAIN, MEAN_DIFFERENCE) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -print(round(normality_results, 5)) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample 
<- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# Effect sizes will be calculated separately - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = 
"TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915110547.r b/.history/eohi1/mixed anova - domain means_20250915110547.r deleted file mode 100644 index b70cdbd..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915110547.r +++ /dev/null @@ -1,380 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting using tidyr -pivot_domain_means <- function(data, domain_mapping) { - # Use pivot_longer for efficient reshaping - long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(domain_mapping$variable)) %>% - pivot_longer( - cols = all_of(domain_mapping$variable), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - select(-variable) %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of 
participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -print(round(normality_results, 5)) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample 
<- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# Effect sizes will be calculated separately - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = 
"TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915111101.r b/.history/eohi1/mixed anova - domain means_20250915111101.r deleted file mode 100644 index 3104b92..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915111101.r +++ /dev/null @@ -1,383 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting using tidyr -pivot_domain_means <- function(data, domain_mapping) { - # Use pivot_longer for efficient reshaping - long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(domain_mapping$variable)) %>% - pivot_longer( - cols = all_of(domain_mapping$variable), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - select(-variable) %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data 
structure") -}) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -print(round(normality_results, 5)) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample 
<- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# Effect sizes will be calculated separately - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = 
"TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915111110.r b/.history/eohi1/mixed anova - domain means_20250915111110.r deleted file mode 100644 index 3104b92..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915111110.r +++ /dev/null @@ -1,383 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting using tidyr -pivot_domain_means <- function(data, domain_mapping) { - # Use pivot_longer for efficient reshaping - long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(domain_mapping$variable)) %>% - pivot_longer( - cols = all_of(domain_mapping$variable), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - select(-variable) %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data 
structure") -}) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -print(round(normality_results, 5)) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample 
<- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# Effect sizes will be calculated separately - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = 
"TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915111114.r b/.history/eohi1/mixed anova - domain means_20250915111114.r deleted file mode 100644 index 3104b92..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915111114.r +++ /dev/null @@ -1,383 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting using tidyr -pivot_domain_means <- function(data, domain_mapping) { - # Use pivot_longer for efficient reshaping - long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(domain_mapping$variable)) %>% - pivot_longer( - cols = all_of(domain_mapping$variable), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - select(-variable) %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data 
structure") -}) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -print(round(normality_results, 5)) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample 
<- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# Effect sizes will be calculated separately - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = 
"TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915111252.r b/.history/eohi1/mixed anova - domain means_20250915111252.r deleted file mode 100644 index d39dd78..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915111252.r +++ /dev/null @@ -1,395 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting using tidyr -pivot_domain_means <- function(data, domain_mapping) { - # Create long data frame more efficiently - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, 
levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -print(round(normality_results, 5)) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample 
<- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# Effect sizes will be calculated separately - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = 
"TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915111308.r b/.history/eohi1/mixed anova - domain means_20250915111308.r deleted file mode 100644 index 8e19166..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915111308.r +++ /dev/null @@ -1,409 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting (avoiding pivot_longer issues) -pivot_domain_means <- function(data, domain_mapping) { - # Pre-allocate the result data frame for better performance - n_rows <- nrow(data) * nrow(domain_mapping) - long_data <- data.frame( - pID = character(n_rows), - ResponseId = character(n_rows), - TEMPORAL_DO = character(n_rows), - TIME = character(n_rows), - DOMAIN = character(n_rows), - MEAN_DIFFERENCE = numeric(n_rows), - stringsAsFactors = FALSE - ) - - row_idx <- 1 - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Get the number of rows for this variable - n_data_rows <- nrow(data) - - # Fill in the data for this variable - long_data$pID[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$pID) - 
long_data$ResponseId[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$ResponseId) - long_data$TEMPORAL_DO[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$TEMPORAL_DO) - long_data$TIME[row_idx:(row_idx + n_data_rows - 1)] <- time_level - long_data$DOMAIN[row_idx:(row_idx + n_data_rows - 1)] <- domain_level - long_data$MEAN_DIFFERENCE[row_idx:(row_idx + n_data_rows - 1)] <- data[[var_name]] - - row_idx <- row_idx + n_data_rows - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -print(round(normality_results, 5)) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # 
Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# Effect sizes will be calculated separately - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of 
TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915111314.r b/.history/eohi1/mixed anova - domain means_20250915111314.r deleted file mode 100644 index 8e19166..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915111314.r +++ /dev/null @@ -1,409 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting (avoiding pivot_longer issues) -pivot_domain_means <- function(data, domain_mapping) { - # Pre-allocate the result 
diff --git a/.history/eohi1/mixed anova - domain means_20250915111314.r b/.history/eohi1/mixed anova - domain means_20250915111314.r deleted file mode 100644 index 8e19166..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915111314.r +++ /dev/null
diff --git a/.history/eohi1/mixed anova - domain means_20250915112435.r b/.history/eohi1/mixed anova - domain means_20250915112435.r deleted file mode 100644 index 8e19166..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915112435.r +++ /dev/null
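The snapshots fit the model but then defer effect sizes ("Effect sizes will be calculated separately"). One way to fill that gap is the effectsize package; this is only a sketch, under the assumption that the installed effectsize version accepts the aovlist object that aov() returns for designs with an Error() term:

library(effectsize)

# Generalized eta squared for each term of the mixed design; partial
# eta squared would be eta_squared(mixed_anova_model, partial = TRUE).
eta_squared(mixed_anova_model, generalized = TRUE)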
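A fragile spot shared by every copy of bootstrap_hartley_critical(): it is called with pick(), so data arrives as a tibble, and data[rows, response_var] on a tibble returns a one-column tibble rather than an atomic vector before being handed to sample() and var(). A small self-contained illustration of the pitfall and of a safer extraction via [[, using made-up toy values:

library(tibble)

# Toy stand-in for what pick() passes into the function.
toy <- tibble(TEMPORAL_DO = c("01PAST", "01PAST", "02FUT", "02FUT"),
              MEAN_DIFFERENCE = c(0.1, NA, -0.2, 0.4))

# Single-bracket subsetting of a tibble keeps a one-column tibble:
class(toy[toy$TEMPORAL_DO == "01PAST", "MEAN_DIFFERENCE"])

# Safer: pull the column out with [[ first, which always yields an
# atomic vector, then drop NAs before bootstrapping.
group_values <- toy[["MEAN_DIFFERENCE"]][toy$TEMPORAL_DO == "01PAST"]
group_values <- group_values[!is.na(group_values)]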
"))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting (avoiding pivot_longer issues) -pivot_domain_means <- function(data, domain_mapping) { - # Pre-allocate the result data frame for better performance - n_rows <- nrow(data) * nrow(domain_mapping) - long_data <- data.frame( - pID = character(n_rows), - ResponseId = character(n_rows), - TEMPORAL_DO = character(n_rows), - TIME = character(n_rows), - DOMAIN = character(n_rows), - MEAN_DIFFERENCE = numeric(n_rows), - stringsAsFactors = FALSE - ) - - row_idx <- 1 - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Get the number of rows for this variable - n_data_rows <- nrow(data) - - # Fill in the data for this variable - long_data$pID[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$pID) - long_data$ResponseId[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$ResponseId) - long_data$TEMPORAL_DO[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$TEMPORAL_DO) - long_data$TIME[row_idx:(row_idx + n_data_rows - 1)] <- time_level - long_data$DOMAIN[row_idx:(row_idx + n_data_rows - 1)] <- domain_level - long_data$MEAN_DIFFERENCE[row_idx:(row_idx + n_data_rows - 1)] <- data[[var_name]] - - row_idx <- row_idx + n_data_rows - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - 
-# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -print(round(normality_results, 5)) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample 
<- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# Effect sizes will be calculated separately - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = 
"TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915112521.r b/.history/eohi1/mixed anova - domain means_20250915112521.r deleted file mode 100644 index ca257a2..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915112521.r +++ /dev/null @@ -1,409 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting (avoiding pivot_longer issues) -pivot_domain_means <- function(data, domain_mapping) { - # Pre-allocate the result data frame for better performance - n_rows <- nrow(data) * nrow(domain_mapping) - long_data <- data.frame( - pID = character(n_rows), - ResponseId = character(n_rows), - TEMPORAL_DO = character(n_rows), - TIME = character(n_rows), - DOMAIN = character(n_rows), - MEAN_DIFFERENCE = numeric(n_rows), - stringsAsFactors = FALSE - ) - - row_idx <- 1 - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Get the number of rows for this variable - n_data_rows <- nrow(data) - - # Fill in the data for this variable - long_data$pID[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$pID) - 
long_data$ResponseId[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$ResponseId) - long_data$TEMPORAL_DO[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$TEMPORAL_DO) - long_data$TIME[row_idx:(row_idx + n_data_rows - 1)] <- time_level - long_data$DOMAIN[row_idx:(row_idx + n_data_rows - 1)] <- domain_level - long_data$MEAN_DIFFERENCE[row_idx:(row_idx + n_data_rows - 1)] <- data[[var_name]] - - row_idx <- row_idx + n_data_rows - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -print(round(normality_results, 5)) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) 
-# More efficient bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    group_by(!!sym(group_var)) %>%
-    summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once (extract the response column as a plain vector:
-  # tibble[rows, col] subsetting returns a one-column tibble, and sample()
-  # on a tibble resamples columns rather than observations)
-  group_data_list <- map(groups, ~ {
-    group_data <- data[[response_var]][data[[group_var]] == .x]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
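# A minimal usage sketch of bootstrap_hartley_critical() on synthetic data,
# before it is applied to the real cells below; group sizes, SDs and n_iter
# here are illustrative assumptions, not values from the experiment:
set.seed(99)
toy <- data.frame(
  TEMPORAL_DO     = rep(c("01PAST", "02FUT"), each = 30),
  MEAN_DIFFERENCE = c(rnorm(30, sd = 1.0), rnorm(30, sd = 1.5))
)
toy_res <- bootstrap_hartley_critical(toy, "TEMPORAL_DO", "MEAN_DIFFERENCE", n_iter = 200)
toy_res$observed_ratio   # around (1.5 / 1.0)^2 = 2.25 in expectation
toy_res$critical_95      # bootstrap 95th-percentile cutoff for the ratio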
TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915112528.r b/.history/eohi1/mixed anova - domain means_20250915112528.r deleted file mode 100644 index ca257a2..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915112528.r +++ /dev/null @@ -1,409 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting (avoiding pivot_longer issues) -pivot_domain_means <- function(data, domain_mapping) { - # Pre-allocate the result 
diff --git a/.history/eohi1/mixed anova - domain means_20250915112612.r b/.history/eohi1/mixed anova - domain means_20250915112612.r
deleted file mode 100644
index 99233f6..0000000
--- a/.history/eohi1/mixed anova - domain means_20250915112612.r
+++ /dev/null
@@ -1,412 +0,0 @@
-# mixed anova not working
-# 12/09/2025
-# add sum contrasts
-
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(nortest)   # For normality tests
-library(ggplot2)   # For plotting
-library(emmeans)   # For post-hoc comparisons
-library(purrr)     # For map functions
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x")))
-print(paste("Number of participants:", length(unique(data$pID))))
-
-# Verify the specific variables we need
-required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required domain mean variables found!")
-}
-
-# Define domain mapping
-domain_mapping <- data.frame(
-  variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-               "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
-  time = c(rep("Past", 4), rep("Future", 4)),
-  domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
-  stringsAsFactors = FALSE
-)
-
-# Domain mapping created
-
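# The options(contrasts = c("contr.sum", "contr.poly")) call above switches
# factor coding from treatment to sum-to-zero contrasts; a quick illustration
# of the coding matrices (level counts match TEMPORAL_DO and DOMAIN, shown
# here for reference only):
contr.sum(2)   # 2-level between-subjects factor: +1 / -1 coding
contr.sum(4)   # 4-level within-subjects factor: each column sums to zero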
-# More efficient data pivoting (avoiding pivot_longer issues)
-pivot_domain_means <- function(data, domain_mapping) {
-  # Pre-allocate the result data frame for better performance
-  n_rows <- nrow(data) * nrow(domain_mapping)
-  long_data <- data.frame(
-    pID = character(n_rows),
-    ResponseId = character(n_rows),
-    TEMPORAL_DO = character(n_rows),
-    TIME = character(n_rows),
-    DOMAIN = character(n_rows),
-    MEAN_DIFFERENCE = numeric(n_rows),
-    stringsAsFactors = FALSE
-  )
-
-  row_idx <- 1
-
-  for (i in 1:nrow(domain_mapping)) {
-    var_name <- domain_mapping$variable[i]
-    time_level <- domain_mapping$time[i]
-    domain_level <- domain_mapping$domain[i]
-
-    # Check if variable exists
-    if (!var_name %in% colnames(data)) {
-      print(paste("Warning: Variable", var_name, "not found in data"))
-      next
-    }
-
-    # Get the number of rows for this variable
-    n_data_rows <- nrow(data)
-
-    # Fill in the data for this variable
-    long_data$pID[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$pID)
-    long_data$ResponseId[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$ResponseId)
-    long_data$TEMPORAL_DO[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$TEMPORAL_DO)
-    long_data$TIME[row_idx:(row_idx + n_data_rows - 1)] <- time_level
-    long_data$DOMAIN[row_idx:(row_idx + n_data_rows - 1)] <- domain_level
-    long_data$MEAN_DIFFERENCE[row_idx:(row_idx + n_data_rows - 1)] <- data[[var_name]]
-
-    row_idx <- row_idx + n_data_rows
-  }
-
-  # Convert to factors with proper levels
-  long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future"))
-  long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life"))
-  long_data$pID <- as.factor(long_data$pID)
-  long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO)
-
-  return(long_data)
-}
-
-# Pivot data to long format
-tryCatch({
-  long_data <- pivot_domain_means(data, domain_mapping)
-}, error = function(e) {
-  print(paste("Error in data pivoting:", e$message))
-  stop("Cannot proceed without proper data structure")
-})
-
-print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x")))
-print(paste("Number of participants:", length(unique(long_data$pID))))
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME and DOMAIN
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5),
-    min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME and DOMAIN:")
-print(desc_stats)
-
-# Descriptive statistics by between-subjects factors
-desc_stats_by_temporal <- long_data %>%
-  group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:")
-print(desc_stats_by_temporal)
-
-
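# For reference, a tidyr equivalent of the hand-rolled pivot_domain_means()
# above; a sketch under the assumption that the same columns exist in `data`,
# not a replacement the author adopted:
long_data_tidyr <- data %>%
  select(pID, ResponseId, TEMPORAL_DO, all_of(domain_mapping$variable)) %>%
  pivot_longer(all_of(domain_mapping$variable),
               names_to = "variable", values_to = "MEAN_DIFFERENCE") %>%
  left_join(domain_mapping, by = "variable") %>%
  mutate(TIME   = factor(time, levels = c("Past", "Future")),
         DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")),
         pID = as.factor(pID),
         TEMPORAL_DO = as.factor(TEMPORAL_DO)) %>%
  select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE)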
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing
-long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ]
-print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x")))
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(MEAN_DIFFERENCE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME and DOMAIN:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test (streamlined)
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic,
-    ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# Round only the numeric columns
-normality_results_rounded <- normality_results %>%
-  mutate(across(where(is.numeric), ~ round(.x, 5)))
-print(normality_results_rounded)
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-# First, let's check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
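# Cross-check for the cell-wise homogeneity results above: one omnibus Levene
# test across all 2 × 2 × 4 TEMPORAL_DO × TIME × DOMAIN cells (car is already
# loaded); a sketch complementary to the grouped summarise() calls, not part
# of the original script:
leveneTest(MEAN_DIFFERENCE ~ interaction(TEMPORAL_DO, TIME, DOMAIN),
           data = long_data_clean)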
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + 
Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# Effect sizes will be calculated separately - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915112617.r b/.history/eohi1/mixed anova - domain means_20250915112617.r deleted file mode 100644 index 99233f6..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915112617.r +++ /dev/null @@ -1,412 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - 
"NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting (avoiding pivot_longer issues) -pivot_domain_means <- function(data, domain_mapping) { - # Pre-allocate the result data frame for better performance - n_rows <- nrow(data) * nrow(domain_mapping) - long_data <- data.frame( - pID = character(n_rows), - ResponseId = character(n_rows), - TEMPORAL_DO = character(n_rows), - TIME = character(n_rows), - DOMAIN = character(n_rows), - MEAN_DIFFERENCE = numeric(n_rows), - stringsAsFactors = FALSE - ) - - row_idx <- 1 - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Get the number of rows for this variable - n_data_rows <- nrow(data) - - # Fill in the data for this variable - long_data$pID[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$pID) - long_data$ResponseId[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$ResponseId) - long_data$TEMPORAL_DO[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$TEMPORAL_DO) - long_data$TIME[row_idx:(row_idx + n_data_rows - 1)] <- time_level - long_data$DOMAIN[row_idx:(row_idx + n_data_rows - 1)] <- domain_level - long_data$MEAN_DIFFERENCE[row_idx:(row_idx + n_data_rows - 1)] <- data[[var_name]] - - row_idx <- row_idx + n_data_rows - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = 
round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample 
<- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# Effect sizes will be calculated separately - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = 
"TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915113543.r b/.history/eohi1/mixed anova - domain means_20250915113543.r deleted file mode 100644 index ee2fa41..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915113543.r +++ /dev/null @@ -1,447 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting (avoiding pivot_longer issues) -pivot_domain_means <- function(data, domain_mapping) { - # Pre-allocate the result data frame for better performance - n_rows <- nrow(data) * nrow(domain_mapping) - long_data <- data.frame( - pID = character(n_rows), - ResponseId = character(n_rows), - TEMPORAL_DO = character(n_rows), - TIME = character(n_rows), - DOMAIN = character(n_rows), - MEAN_DIFFERENCE = numeric(n_rows), - stringsAsFactors = FALSE - ) - - row_idx <- 1 - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Get the number of rows for this variable - n_data_rows <- nrow(data) - - # Fill in the data for this variable - long_data$pID[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$pID) - 
long_data$ResponseId[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$ResponseId) - long_data$TEMPORAL_DO[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$TEMPORAL_DO) - long_data$TIME[row_idx:(row_idx + n_data_rows - 1)] <- time_level - long_data$DOMAIN[row_idx:(row_idx + n_data_rows - 1)] <- domain_level - long_data$MEAN_DIFFERENCE[row_idx:(row_idx + n_data_rows - 1)] <- data[[var_name]] - - row_idx <- row_idx + n_data_rows - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- 
function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - 
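# A quick sanity check (a sketch added here, not in the original script):
# in a fully balanced design every pID × TIME × DOMAIN cell of
# design_balance holds exactly one observation, so the summary printed
# below should report Min. = Max. = 1; the all() test makes that explicit.
print(paste("Design fully balanced (one row per cell):", all(design_balance == 1)))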
-print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# Effect sizes will be calculated separately - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915113742.r b/.history/eohi1/mixed anova - domain means_20250915113742.r deleted file mode 100644 index fa3699f..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915113742.r +++ /dev/null @@ -1,477 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting 
-library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting (avoiding pivot_longer issues) -pivot_domain_means <- function(data, domain_mapping) { - # Pre-allocate the result data frame for better performance - n_rows <- nrow(data) * nrow(domain_mapping) - long_data <- data.frame( - pID = character(n_rows), - ResponseId = character(n_rows), - TEMPORAL_DO = character(n_rows), - TIME = character(n_rows), - DOMAIN = character(n_rows), - MEAN_DIFFERENCE = numeric(n_rows), - stringsAsFactors = FALSE - ) - - row_idx <- 1 - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Get the number of rows for this variable - n_data_rows <- nrow(data) - - # Fill in the data for this variable - long_data$pID[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$pID) - long_data$ResponseId[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$ResponseId) - long_data$TEMPORAL_DO[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$TEMPORAL_DO) - long_data$TIME[row_idx:(row_idx + n_data_rows - 1)] <- time_level - long_data$DOMAIN[row_idx:(row_idx + n_data_rows - 1)] <- domain_level - long_data$MEAN_DIFFERENCE[row_idx:(row_idx + n_data_rows - 1)] <- data[[var_name]] - - row_idx <- row_idx + n_data_rows - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " 
x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample 
<- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# Effect sizes will be calculated separately - -# 
============================================================================= -# ALTERNATIVE: MIXED MODEL USING LMER (FASTER) -# ============================================================================= - -# Load required libraries for mixed models -library(lme4) -library(lmerTest) - -print("Running alternative mixed model using lmer()...") -start_time_lmer <- Sys.time() - -# Mixed model approach (much faster than aov with complex error structures) -mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + - (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN), - data = long_data_clean) - -end_time_lmer <- Sys.time() -print(paste("lmer model completed in:", round(end_time_lmer - start_time_lmer, 2), "seconds")) - -print("Mixed Model Results (lmer):") -print(summary(mixed_lmer_model)) - -# ANOVA table for lmer model -print("ANOVA Table for Mixed Model:") -print(anova(mixed_lmer_model)) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915113749.r b/.history/eohi1/mixed anova - domain means_20250915113749.r deleted file mode 100644 index fa3699f..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915113749.r +++ /dev/null @@ -1,477 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For 
normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting (avoiding pivot_longer issues) -pivot_domain_means <- function(data, domain_mapping) { - # Pre-allocate the result data frame for better performance - n_rows <- nrow(data) * nrow(domain_mapping) - long_data <- data.frame( - pID = character(n_rows), - ResponseId = character(n_rows), - TEMPORAL_DO = character(n_rows), - TIME = character(n_rows), - DOMAIN = character(n_rows), - MEAN_DIFFERENCE = numeric(n_rows), - stringsAsFactors = FALSE - ) - - row_idx <- 1 - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Get the number of rows for this variable - n_data_rows <- nrow(data) - - # Fill in the data for this variable - long_data$pID[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$pID) - long_data$ResponseId[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$ResponseId) - long_data$TEMPORAL_DO[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$TEMPORAL_DO) - long_data$TIME[row_idx:(row_idx + n_data_rows - 1)] <- time_level - long_data$DOMAIN[row_idx:(row_idx + n_data_rows - 1)] <- domain_level - long_data$MEAN_DIFFERENCE[row_idx:(row_idx + n_data_rows - 1)] <- data[[var_name]] - - row_idx <- row_idx + n_data_rows - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(paste("Long data 
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME and DOMAIN
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5),
-    min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME and DOMAIN:")
-print(desc_stats)
-
-# Descriptive statistics by between-subjects factors
-desc_stats_by_temporal <- long_data %>%
-  group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:")
-print(desc_stats_by_temporal)
-
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing
-long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ]
-print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x")))
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(MEAN_DIFFERENCE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME and DOMAIN:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
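The outlier fences above are the usual Tukey rule: 1.5 interquartile ranges beyond each quartile. With hypothetical quartiles q1 = -0.5 and q3 = 0.5 (made-up numbers, purely to show the arithmetic), the IQR is 1 and anything outside [-2, 2] is flagged:

q1 <- -0.5; q3 <- 0.5; iqr <- q3 - q1
c(lower = q1 - 1.5 * iqr, upper = q3 + 1.5 * iqr)  # -2  2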
-
-# 3. Anderson-Darling normality test (streamlined)
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic,
-    ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# Round only the numeric columns
-normality_results_rounded <- normality_results %>%
-  mutate(across(where(is.numeric), ~ round(.x, 5)))
-print(normality_results_rounded)
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-# First, let's check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # Calculate F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
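Hartley's statistic is simply the largest group variance divided by the smallest, which is why the script bootstraps a critical value below instead of using the classical tables (those assume equal group sizes and normality). A toy call to the helper defined above, with made-up variances:

calculate_hartley_ratio(c(2.5, 1.8, 0.9))  # 2.5 / 0.9 = 2.777778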
-
-# More efficient bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    group_by(!!sym(group_var)) %>%
-    summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once
-  group_data_list <- map(groups, ~ {
-    group_data <- data[data[[group_var]] == .x, response_var]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-print("\n=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check for missing data patterns
-table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany")
-
-# Check data balance
-xtabs(~ pID + TIME + DOMAIN, data = long_data_clean)
-
-# Check data dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$pID))))
-print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME))))
-print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN))))
-print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO))))
-
-# Check for complete cases
-complete_cases <- long_data_clean[complete.cases(long_data_clean), ]
-print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean)))
-
-# Check if design is balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-
-print(summary(as.vector(design_balance)))
-
-# Check for any participants with missing combinations
-missing_combos <- long_data_clean %>%
-  group_by(pID) %>%
-  summarise(
-    n_combinations = n(),
-    expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8
-    missing_combinations = 8 - n_combinations,
-    .groups = 'drop'
-  )
-
-print("Missing combinations per participant:")
-print(missing_combos[missing_combos$missing_combinations > 0, ])
-
-# Mixed ANOVA using aov()
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-
-mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                         data = long_data_clean)
-
-print("Mixed ANOVA Results:")
-print(summary(mixed_anova_model))
-
-# Extract effect sizes (generalized eta squared)
-# For aov() objects, we need to extract from the summary
-anova_summary <- summary(mixed_anova_model)
-
-# Effect sizes will be calculated separately
-
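The snapshot stops at "Effect sizes will be calculated separately". One conventional route, sketched here on the assumption that the effectsize package is available (nothing in the original script loads it), is generalized eta squared computed directly from the aov fit:

library(effectsize)  # assumed installed; not loaded by the snapshot
print(eta_squared(mixed_anova_model, generalized = TRUE))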
-# =============================================================================
-# ALTERNATIVE: MIXED MODEL USING LMER (FASTER)
-# =============================================================================
-
-# Load required libraries for mixed models
-library(lme4)
-library(lmerTest)
-
-print("Running alternative mixed model using lmer()...")
-start_time_lmer <- Sys.time()
-
-# Mixed model approach (much faster than aov with complex error structures)
-mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN +
-                           (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN),
-                         data = long_data_clean)
-
-end_time_lmer <- Sys.time()
-print(paste("lmer model completed in:", round(end_time_lmer - start_time_lmer, 2), "seconds"))
-
-print("Mixed Model Results (lmer):")
-print(summary(mixed_lmer_model))
-
-# ANOVA table for lmer model
-print("ANOVA Table for Mixed Model:")
-print(anova(mixed_lmer_model))
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(mixed_anova_model, ~ TIME)
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN)
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# Two-way interactions
-print("\nTIME × DOMAIN Interaction:")
-time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN)
-time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_contrasts)
-
-print("\nTEMPORAL_DO × TIME Interaction:")
-temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME)
-temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_contrasts)
-
-print("\nTEMPORAL_DO × DOMAIN Interaction:")
-temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN)
-temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_contrasts)
-
-# Three-way interaction
-print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:")
-three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni")
-print(three_way_contrasts)
-
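One quirk worth flagging in the snapshot above: every emmeans() call in the post-hoc block is fed mixed_anova_model, the aov() fit, even though the lmer() fit is introduced as the preferred alternative. The same contrasts can be pointed at the mixed model instead; a sketch for the TIME main effect (Kenward-Roger degrees of freedom additionally assume the pbkrtest package is installed):

time_emmeans_lmer <- emmeans(mixed_lmer_model, ~ TIME, lmer.df = "kenward-roger")
print(pairs(time_emmeans_lmer, adjust = "bonferroni"))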
diff --git a/.history/eohi1/mixed anova - domain means_20250915114518.r b/.history/eohi1/mixed anova - domain means_20250915114518.r
deleted file mode 100644
index fa3699f..0000000
--- a/.history/eohi1/mixed anova - domain means_20250915114518.r
+++ /dev/null
@@ -1,477 +0,0 @@
[477 deleted lines omitted: blob fa3699f, byte-identical to the 20250915113749 snapshot above]
diff --git a/.history/eohi1/mixed anova - domain means_20250915114619.r b/.history/eohi1/mixed anova - domain means_20250915114619.r
deleted file mode 100644
index b54e0f7..0000000
--- a/.history/eohi1/mixed anova - domain means_20250915114619.r
+++ /dev/null
@@ -1,477 +0,0 @@
[477 deleted lines omitted: identical to the snapshot above except the aov() call is mid-edit -- the mixed_anova_model line and its print statements are commented out while the "data = long_data_clean)" continuation line is not, a syntax error captured by the auto-save]
normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting (avoiding pivot_longer issues) -pivot_domain_means <- function(data, domain_mapping) { - # Pre-allocate the result data frame for better performance - n_rows <- nrow(data) * nrow(domain_mapping) - long_data <- data.frame( - pID = character(n_rows), - ResponseId = character(n_rows), - TEMPORAL_DO = character(n_rows), - TIME = character(n_rows), - DOMAIN = character(n_rows), - MEAN_DIFFERENCE = numeric(n_rows), - stringsAsFactors = FALSE - ) - - row_idx <- 1 - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Get the number of rows for this variable - n_data_rows <- nrow(data) - - # Fill in the data for this variable - long_data$pID[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$pID) - long_data$ResponseId[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$ResponseId) - long_data$TEMPORAL_DO[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$TEMPORAL_DO) - long_data$TIME[row_idx:(row_idx + n_data_rows - 1)] <- time_level - long_data$DOMAIN[row_idx:(row_idx + n_data_rows - 1)] <- domain_level - long_data$MEAN_DIFFERENCE[row_idx:(row_idx + n_data_rows - 1)] <- data[[var_name]] - - row_idx <- row_idx + n_data_rows - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(paste("Long data 
dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample 
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-# First, let's check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # Calculate F-max ratio (via the helper defined above, which also tolerates NA variances)
-    f_max_ratio = calculate_hartley_ratio(c(past_var, fut_var)),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# More efficient bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    group_by(!!sym(group_var)) %>%
-    summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once ([[ ]] extraction keeps this a plain vector even when
-  # `data` is a tibble, as it is when called via pick() below; single-bracket
-  # subsetting of a tibble would return a one-column tibble and break sample())
-  group_data_list <- map(groups, ~ {
-    group_data <- data[[response_var]][data[[group_var]] == .x]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-print("\n=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check for missing data patterns
-table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany")
-
-# Check data balance
-xtabs(~ pID + TIME + DOMAIN, data = long_data_clean)
-
-# Check data dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$pID))))
-print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME))))
-print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN))))
-print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO))))
-
-# Check for complete cases
-complete_cases <- long_data_clean[complete.cases(long_data_clean), ]
-print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean)))
-
-# Check if design is balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-
-print(summary(as.vector(design_balance)))
-
-# Check for any participants with missing combinations
-missing_combos <- long_data_clean %>%
-  group_by(pID) %>%
-  summarise(
-    n_combinations = n(),
-    expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8
-    missing_combinations = 8 - n_combinations,
-    .groups = 'drop'
-  )
-
-print("Missing combinations per participant:")
-print(missing_combos[missing_combos$missing_combinations > 0, ])
-
-# Mixed ANOVA using aov()
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-
-# mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-#                          data = long_data_clean)
-
-# print("Mixed ANOVA Results:")
-# print(summary(mixed_anova_model))
-
-# Extract effect sizes (generalized eta squared)
-# For aov() objects, we need to extract from the summary
-# (kept commented out: mixed_anova_model itself is commented out above, so
-# calling summary() on it would fail)
-# anova_summary <- summary(mixed_anova_model)
-
-# Effect sizes will be calculated separately
-
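# --- Editorial aside (illustration, not part of the deleted script) ----------
# A minimal sketch of one way to obtain the generalized eta squared mentioned
# above. This assumes the afex package is installed (it is not loaded by this
# script): afex::aov_ez() fits the same 2 × (2 × 4) mixed design and reports
# "ges" in its ANOVA table. Participants with incomplete TIME × DOMAIN cells
# may need to be excluded first, depending on afex's handling of missing data.
library(afex)
mixed_afex <- aov_ez(
  id = "pID", dv = "MEAN_DIFFERENCE", data = long_data_clean,
  between = "TEMPORAL_DO", within = c("TIME", "DOMAIN"),
  anova_table = list(es = "ges")
)
print(mixed_afex$anova_table)
# ------------------------------------------------------------------------------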
-# =============================================================================
-# ALTERNATIVE: MIXED MODEL USING LMER (FASTER)
-# =============================================================================
-
-# Load required libraries for mixed models
-library(lme4)
-library(lmerTest)
-
-print("Running alternative mixed model using lmer()...")
-start_time_lmer <- Sys.time()
-
-# Mixed model approach (much faster than aov with complex error structures)
-mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN +
-                           (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN),
-                         data = long_data_clean)
-
-end_time_lmer <- Sys.time()
-print(paste("lmer model completed in:", round(end_time_lmer - start_time_lmer, 2), "seconds"))
-
-print("Mixed Model Results (lmer):")
-print(summary(mixed_lmer_model))
-
-# ANOVA table for lmer model
-print("ANOVA Table for Mixed Model:")
-print(anova(mixed_lmer_model))
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-# (run on mixed_lmer_model: the aov model these calls originally referenced
-# is commented out above, so emmeans() on it would fail)
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(mixed_lmer_model, ~ TIME)
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(mixed_lmer_model, ~ DOMAIN)
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(mixed_lmer_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# Two-way interactions
-print("\nTIME × DOMAIN Interaction:")
-time_domain_emmeans <- emmeans(mixed_lmer_model, ~ TIME * DOMAIN)
-time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_contrasts)
-
-print("\nTEMPORAL_DO × TIME Interaction:")
-temporal_time_emmeans <- emmeans(mixed_lmer_model, ~ TEMPORAL_DO * TIME)
-temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_contrasts)
-
-print("\nTEMPORAL_DO × DOMAIN Interaction:")
-temporal_domain_emmeans <- emmeans(mixed_lmer_model, ~ TEMPORAL_DO * DOMAIN)
-temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_contrasts)
-
-# Three-way interaction
-print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:")
-three_way_emmeans <- emmeans(mixed_lmer_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni")
-print(three_way_contrasts)
-
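# --- Editorial aside (illustration, not part of the deleted script) ----------
# emmeans::emmip() plots the model-implied cell means, which can make the
# interactions tested above easier to read than the contrast tables;
# interaction_plot is a hypothetical name.
interaction_plot <- emmip(mixed_lmer_model, TEMPORAL_DO ~ TIME | DOMAIN, CIs = TRUE)
print(interaction_plot)
# ------------------------------------------------------------------------------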
diff --git a/.history/eohi1/mixed anova - domain means_20250915114729.r b/.history/eohi1/mixed anova - domain means_20250915114729.r
deleted file mode 100644
index 737ad9c..0000000
--- a/.history/eohi1/mixed anova - domain means_20250915114729.r
+++ /dev/null
diff --git a/.history/eohi1/mixed anova - domain means_20250915114731.r b/.history/eohi1/mixed anova - domain means_20250915114731.r
deleted file mode 100644
index 737ad9c..0000000
--- a/.history/eohi1/mixed anova - domain means_20250915114731.r
+++ /dev/null
diff --git a/.history/eohi1/mixed anova - domain means_20250915114817.r b/.history/eohi1/mixed anova - domain means_20250915114817.r
deleted file mode 100644
index 4f6be66..0000000
--- a/.history/eohi1/mixed anova - domain means_20250915114817.r
+++ /dev/null
diff --git a/.history/eohi1/mixed anova - domain means_20250915115032.r b/.history/eohi1/mixed anova - domain means_20250915115032.r
deleted file mode 100644
index fa3699f..0000000
--- a/.history/eohi1/mixed anova - domain means_20250915115032.r
+++ /dev/null
@@ -1,477 +0,0 @@
-# mixed anova not working
-# 12/09/2025
-# add sum contrasts
-
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(nortest) # For normality tests
normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting (avoiding pivot_longer issues) -pivot_domain_means <- function(data, domain_mapping) { - # Pre-allocate the result data frame for better performance - n_rows <- nrow(data) * nrow(domain_mapping) - long_data <- data.frame( - pID = character(n_rows), - ResponseId = character(n_rows), - TEMPORAL_DO = character(n_rows), - TIME = character(n_rows), - DOMAIN = character(n_rows), - MEAN_DIFFERENCE = numeric(n_rows), - stringsAsFactors = FALSE - ) - - row_idx <- 1 - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Get the number of rows for this variable - n_data_rows <- nrow(data) - - # Fill in the data for this variable - long_data$pID[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$pID) - long_data$ResponseId[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$ResponseId) - long_data$TEMPORAL_DO[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$TEMPORAL_DO) - long_data$TIME[row_idx:(row_idx + n_data_rows - 1)] <- time_level - long_data$DOMAIN[row_idx:(row_idx + n_data_rows - 1)] <- domain_level - long_data$MEAN_DIFFERENCE[row_idx:(row_idx + n_data_rows - 1)] <- data[[var_name]] - - row_idx <- row_idx + n_data_rows - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(paste("Long data 
dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample 
<- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# Effect sizes will be calculated separately - -# 
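# --- Editor's note (a sketch, not part of the original snapshot): when
# bootstrap_hartley_critical() above receives the tibble produced by pick(),
# the extraction `data[data[[group_var]] == .x, response_var]` yields a
# one-column tibble rather than a numeric vector, so the downstream sample()
# and var() calls see a list instead of numbers. A vector-safe variant of
# that step, under the assumption the helper may get either a base
# data.frame or a tibble:
#
#   group_data_list <- map(groups, function(g) {
#     vals <- data[[response_var]][data[[group_var]] == g]  # column first, then rows
#     vals[!is.na(vals)]
#   })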
============================================================================= -# ALTERNATIVE: MIXED MODEL USING LMER (FASTER) -# ============================================================================= - -# Load required libraries for mixed models -library(lme4) -library(lmerTest) - -print("Running alternative mixed model using lmer()...") -start_time_lmer <- Sys.time() - -# Mixed model approach (much faster than aov with complex error structures) -mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + - (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN), - data = long_data_clean) - -end_time_lmer <- Sys.time() -print(paste("lmer model completed in:", round(end_time_lmer - start_time_lmer, 2), "seconds")) - -print("Mixed Model Results (lmer):") -print(summary(mixed_lmer_model)) - -# ANOVA table for lmer model -print("ANOVA Table for Mixed Model:") -print(anova(mixed_lmer_model)) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915120001.r b/.history/eohi1/mixed anova - domain means_20250915120001.r deleted file mode 100644 index a6a73cc..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915120001.r +++ /dev/null @@ -1,544 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For 
normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting (avoiding pivot_longer issues) -pivot_domain_means <- function(data, domain_mapping) { - # Pre-allocate the result data frame for better performance - n_rows <- nrow(data) * nrow(domain_mapping) - long_data <- data.frame( - pID = character(n_rows), - ResponseId = character(n_rows), - TEMPORAL_DO = character(n_rows), - TIME = character(n_rows), - DOMAIN = character(n_rows), - MEAN_DIFFERENCE = numeric(n_rows), - stringsAsFactors = FALSE - ) - - row_idx <- 1 - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Get the number of rows for this variable - n_data_rows <- nrow(data) - - # Fill in the data for this variable - long_data$pID[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$pID) - long_data$ResponseId[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$ResponseId) - long_data$TEMPORAL_DO[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$TEMPORAL_DO) - long_data$TIME[row_idx:(row_idx + n_data_rows - 1)] <- time_level - long_data$DOMAIN[row_idx:(row_idx + n_data_rows - 1)] <- domain_level - long_data$MEAN_DIFFERENCE[row_idx:(row_idx + n_data_rows - 1)] <- data[[var_name]] - - row_idx <- row_idx + n_data_rows - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(paste("Long data 
dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample 
<- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# Effect sizes will be calculated separately - -# 
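# --- Editor's sketch (not part of the original snapshot; `ez_results` is an
# illustrative name): the `ez` package is loaded at the top of this script but
# never used. ezANOVA() runs the same 2 (TEMPORAL_DO) x 2 (TIME) x 4 (DOMAIN)
# mixed design in one call and reports generalized eta squared plus Mauchly's
# sphericity test for the DOMAIN factor, assuming every participant has all
# 8 TIME x DOMAIN cells (the completeness check above flags violators).
# Type-III sums of squares match the sum contrasts set at the top.
ez_results <- ezANOVA(
  data = long_data_clean,
  dv = MEAN_DIFFERENCE,
  wid = pID,
  between = TEMPORAL_DO,
  within = .(TIME, DOMAIN),
  type = 3,
  detailed = TRUE
)
print(ez_results)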
============================================================================= -# ALTERNATIVE: MIXED MODEL USING LMER (FASTER) -# ============================================================================= - -# Load required libraries for mixed models -library(lme4) -library(lmerTest) - -print("Running alternative mixed model using lmer()...") -start_time_lmer <- Sys.time() - -# Mixed model approach (much faster than aov with complex error structures) -mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + - (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN), - data = long_data_clean) - -end_time_lmer <- Sys.time() -print(paste("lmer model completed in:", round(end_time_lmer - start_time_lmer, 2), "seconds")) - -print("Mixed Model Results (lmer):") -print(summary(mixed_lmer_model)) - -# ANOVA table for lmer model -print("ANOVA Table for Mixed Model:") -print(anova(mixed_lmer_model)) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- 
pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915120010.r b/.history/eohi1/mixed anova - domain means_20250915120010.r deleted file mode 100644 index a6a73cc..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915120010.r +++ /dev/null @@ -1,544 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset 
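# --- Editor's sketch (not part of the original snapshot): two cheap sanity
# checks before reshaping, assuming exp1.csv is one row per participant and
# uses the two condition codes hard-coded later in this script.
stopifnot(
  !any(duplicated(data$pID)),                      # wide format: one row per pID
  all(data$TEMPORAL_DO %in% c("01PAST", "02FUT"))  # codes assumed by the F-max code below
)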
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting (avoiding pivot_longer issues) -pivot_domain_means <- function(data, domain_mapping) { - # Pre-allocate the result data frame for better performance - n_rows <- nrow(data) * nrow(domain_mapping) - long_data <- data.frame( - pID = character(n_rows), - ResponseId = character(n_rows), - TEMPORAL_DO = character(n_rows), - TIME = character(n_rows), - DOMAIN = character(n_rows), - MEAN_DIFFERENCE = numeric(n_rows), - stringsAsFactors = FALSE - ) - - row_idx <- 1 - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Get the number of rows for this variable - n_data_rows <- nrow(data) - - # Fill in the data for this variable - long_data$pID[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$pID) - long_data$ResponseId[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$ResponseId) - long_data$TEMPORAL_DO[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$TEMPORAL_DO) - long_data$TIME[row_idx:(row_idx + n_data_rows - 1)] <- time_level - long_data$DOMAIN[row_idx:(row_idx + n_data_rows - 1)] <- domain_level - long_data$MEAN_DIFFERENCE[row_idx:(row_idx + n_data_rows - 1)] <- data[[var_name]] - - row_idx <- row_idx + n_data_rows - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = 
round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample 
<- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# Effect sizes will be calculated separately - -# 
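# --- Editor's sketch (not part of the original snapshot): one concrete way to
# get the generalized eta squared deferred above is effectsize::eta_squared(),
# which in recent versions accepts the aovlist returned by aov() with an
# Error() term. The effectsize package is an assumption (it is not loaded
# above); generalized = TRUE treats all three factors as manipulated, which
# fits this fully experimental design.
library(effectsize)
ges_table <- eta_squared(mixed_anova_model, generalized = TRUE)
print(ges_table)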
============================================================================= -# ALTERNATIVE: MIXED MODEL USING LMER (FASTER) -# ============================================================================= - -# Load required libraries for mixed models -library(lme4) -library(lmerTest) - -print("Running alternative mixed model using lmer()...") -start_time_lmer <- Sys.time() - -# Mixed model approach (much faster than aov with complex error structures) -mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + - (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN), - data = long_data_clean) - -end_time_lmer <- Sys.time() -print(paste("lmer model completed in:", round(end_time_lmer - start_time_lmer, 2), "seconds")) - -print("Mixed Model Results (lmer):") -print(summary(mixed_lmer_model)) - -# ANOVA table for lmer model -print("ANOVA Table for Mixed Model:") -print(anova(mixed_lmer_model)) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- 
pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915120029.r b/.history/eohi1/mixed anova - domain means_20250915120029.r deleted file mode 100644 index 928f4b9..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915120029.r +++ /dev/null @@ -1,612 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset 
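# --- Editor's sketch (not part of the original snapshot; names are
# illustrative): the script validates the eight domain-mean columns below but
# silently assumes the ID/design columns used by the pivoting helper also
# exist; checking them up front fails faster with a clearer message.
id_vars <- c("pID", "ResponseId", "TEMPORAL_DO")
missing_ids <- setdiff(id_vars, colnames(data))
if (length(missing_ids) > 0) {
  stop(paste("Missing ID/design columns:", paste(missing_ids, collapse = ", ")))
}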
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting (avoiding pivot_longer issues) -pivot_domain_means <- function(data, domain_mapping) { - # Pre-allocate the result data frame for better performance - n_rows <- nrow(data) * nrow(domain_mapping) - long_data <- data.frame( - pID = character(n_rows), - ResponseId = character(n_rows), - TEMPORAL_DO = character(n_rows), - TIME = character(n_rows), - DOMAIN = character(n_rows), - MEAN_DIFFERENCE = numeric(n_rows), - stringsAsFactors = FALSE - ) - - row_idx <- 1 - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Get the number of rows for this variable - n_data_rows <- nrow(data) - - # Fill in the data for this variable - long_data$pID[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$pID) - long_data$ResponseId[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$ResponseId) - long_data$TEMPORAL_DO[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$TEMPORAL_DO) - long_data$TIME[row_idx:(row_idx + n_data_rows - 1)] <- time_level - long_data$DOMAIN[row_idx:(row_idx + n_data_rows - 1)] <- domain_level - long_data$MEAN_DIFFERENCE[row_idx:(row_idx + n_data_rows - 1)] <- data[[var_name]] - - row_idx <- row_idx + n_data_rows - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = 
-# TIME × DOMAIN Interaction (Significant: p = 0.012)
-print("\n=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TIME:")
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-
-print("\nSimple Effects of TIME within each DOMAIN:")
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(time_domain_simple2)
-
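The pairwise tables can be complemented by simple-effects F tests, which test all terms not involving the conditioning factor within each of its levels in one call. A minimal sketch, here run on the lmer fit:

# Sketch only: F tests of DOMAIN (and its interactions) within each TIME level
joint_tests(mixed_lmer_model, by = "TIME")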
-# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058)
-print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===")
-temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN)
-print("Estimated Marginal Means:")
-print(temporal_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:")
-temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:")
-temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(temporal_domain_simple2)
-
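Each simple-effects family above is Bonferroni-corrected on its own; if a single correction across both families is preferred, the two contrast objects can be combined first. A minimal sketch using emmeans' rbind() method:

# Sketch only: one Bonferroni family across both sets of simple effects
summary(rbind(temporal_domain_simple, temporal_domain_simple2), adjust = "bonferroni")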
-# =============================================================================
-# COMPREHENSIVE INTERACTION ANALYSIS
-# =============================================================================
-
-# All pairwise comparisons for the three-way interaction
-print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===")
-three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-print("Estimated Marginal Means for all combinations:")
-print(three_way_emmeans)
-
-# Pairwise comparisons within each TEMPORAL_DO × TIME combination
-print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:")
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni")
-print(three_way_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# Two-way interactions
-print("\nTIME × DOMAIN Interaction:")
-time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN)
-time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_contrasts)
-
-print("\nTEMPORAL_DO × TIME Interaction:")
-temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME)
-temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_contrasts)
-
-print("\nTEMPORAL_DO × DOMAIN Interaction:")
-temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN)
-temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_contrasts)
-
-# Three-way interaction
-print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:")
-three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni")
-print(three_way_contrasts)
-
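The script defers effect sizes; for the aov() fit, generalized eta squared is available from the effectsize package. A minimal sketch, assuming effectsize is installed and, as a placeholder, treating all factors as observed (the observed-versus-manipulated designation should match the actual design):

# Sketch only: generalized eta squared for the mixed ANOVA
library(effectsize)
eta_squared(mixed_anova_model, generalized = TRUE)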
diff --git a/.history/eohi1/mixed anova - domain means_20250915120029.r b/.history/eohi1/mixed anova - domain means_20250915120029.r
deleted file mode 100644
index 928f4b9..0000000
--- a/.history/eohi1/mixed anova - domain means_20250915120029.r
+++ /dev/null
diff --git a/.history/eohi1/mixed anova - domain means_20250915120039.r b/.history/eohi1/mixed anova - domain means_20250915120039.r
deleted file mode 100644
index 928f4b9..0000000
--- a/.history/eohi1/mixed anova - domain means_20250915120039.r
+++ /dev/null
diff --git a/.history/eohi1/mixed anova - domain means_20250915120050.r b/.history/eohi1/mixed anova - domain means_20250915120050.r
deleted file mode 100644
index 928f4b9..0000000
--- a/.history/eohi1/mixed anova - domain means_20250915120050.r
+++ /dev/null
@@ -1,612 +0,0 @@
-# mixed anova not working
-# 12/09/2025
-# add sum contrasts
-
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(nortest)   # For normality tests
-library(ggplot2)   # For plotting
-library(emmeans)   # For post-hoc comparisons
-library(purrr)     # For map functions
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
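Since Type-III tests are requested later (ezANOVA with type = 3), the sum-to-zero contrasts set above are load-bearing; a quick sanity check that the option took effect:

# Sketch only: should print "contr.sum" "contr.poly"
getOption("contrasts")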
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x")))
-print(paste("Number of participants:", length(unique(data$pID))))
-
-# Verify the specific variables we need
-required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required domain mean variables found!")
-}
-
-# Define domain mapping
-domain_mapping <- data.frame(
-  variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-               "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
-  time = c(rep("Past", 4), rep("Future", 4)),
-  domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
-  stringsAsFactors = FALSE
-)
-
-# Domain mapping created
"NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting (avoiding pivot_longer issues) -pivot_domain_means <- function(data, domain_mapping) { - # Pre-allocate the result data frame for better performance - n_rows <- nrow(data) * nrow(domain_mapping) - long_data <- data.frame( - pID = character(n_rows), - ResponseId = character(n_rows), - TEMPORAL_DO = character(n_rows), - TIME = character(n_rows), - DOMAIN = character(n_rows), - MEAN_DIFFERENCE = numeric(n_rows), - stringsAsFactors = FALSE - ) - - row_idx <- 1 - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Get the number of rows for this variable - n_data_rows <- nrow(data) - - # Fill in the data for this variable - long_data$pID[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$pID) - long_data$ResponseId[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$ResponseId) - long_data$TEMPORAL_DO[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$TEMPORAL_DO) - long_data$TIME[row_idx:(row_idx + n_data_rows - 1)] <- time_level - long_data$DOMAIN[row_idx:(row_idx + n_data_rows - 1)] <- domain_level - long_data$MEAN_DIFFERENCE[row_idx:(row_idx + n_data_rows - 1)] <- data[[var_name]] - - row_idx <- row_idx + n_data_rows - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = 
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME and DOMAIN
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5),
-    min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME and DOMAIN:")
-print(desc_stats)
-
-# Descriptive statistics by between-subjects factors
-desc_stats_by_temporal <- long_data %>%
-  group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:")
-print(desc_stats_by_temporal)
-
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing
-long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ]
-print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x")))
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(MEAN_DIFFERENCE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME and DOMAIN:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test (streamlined)
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic,
-    ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# Round only the numeric columns
-normality_results_rounded <- normality_results %>%
-  mutate(across(where(is.numeric), ~ round(.x, 5)))
-print(normality_results_rounded)
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
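Calling leveneTest() with a bare formula inside summarise() relies on dplyr's data mask being visible when car builds the model frame; passing the group's rows explicitly is more robust. A minimal sketch, assuming dplyr >= 1.1 for pick():

# Sketch only: Levene's test with an explicit data argument
homogeneity_time_checked <- long_data_clean %>%
  group_by(DOMAIN) %>%
  summarise(
    lev = list(leveneTest(MEAN_DIFFERENCE ~ TIME, data = pick(MEAN_DIFFERENCE, TIME))),
    levene_F = lev[[1]]$`F value`[1],
    levene_p = lev[[1]]$`Pr(>F)`[1],
    .groups = "drop"
  )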
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-# First, let's check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # Calculate F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# More efficient bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    group_by(!!sym(group_var)) %>%
-    summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once
-  group_data_list <- map(groups, ~ {
-    group_data <- data[data[[group_var]] == .x, response_var]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-print("\n=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123)  # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
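The explicit loop inside bootstrap_hartley_critical() can be written more compactly with replicate(); given the same seed the draws come out in the same order. A minimal sketch of the equivalent body:

# Sketch only: drop-in replacement for the for-loop over n_iter
bootstrap_ratios <- replicate(n_iter, {
  sample_vars <- map_dbl(group_data_list,
                         ~ var(sample(.x, size = length(.x), replace = TRUE), na.rm = TRUE))
  max(sample_vars) / min(sample_vars)
})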
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check for missing data patterns
-table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany")
-
-# Check data balance
-xtabs(~ pID + TIME + DOMAIN, data = long_data_clean)
-
-# Check data dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$pID))))
-print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME))))
-print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN))))
-print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO))))
-
-# Check for complete cases
-complete_cases <- long_data_clean[complete.cases(long_data_clean), ]
-print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean)))
-
-# Check if design is balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-
-print(summary(as.vector(design_balance)))
-
-# Check for any participants with missing combinations
-missing_combos <- long_data_clean %>%
-  group_by(pID) %>%
-  summarise(
-    n_combinations = n(),
-    expected_combinations = 8,  # 2 TIME × 4 DOMAIN = 8
-    missing_combinations = 8 - n_combinations,
-    .groups = 'drop'
-  )
-
-print("Missing combinations per participant:")
-print(missing_combos[missing_combos$missing_combinations > 0, ])
-
-# Mixed ANOVA using aov()
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-
-mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                         data = long_data_clean)
-
-print("Mixed ANOVA Results:")
-print(summary(mixed_anova_model))
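The aov()/ezANOVA() pair used here can be wrapped in a single call with the afex package, which applies sum contrasts, runs Type-III tests, and reports Mauchly's test with Greenhouse-Geisser and Huynh-Feldt corrections automatically (only DOMAIN is at issue, since a two-level factor such as TIME satisfies sphericity trivially). A minimal sketch, assuming afex is installed:

# Sketch only: one-call mixed ANOVA with sphericity corrections
library(afex)
afex_model <- aov_ez(id = "pID", dv = "MEAN_DIFFERENCE", data = long_data_clean,
                     between = "TEMPORAL_DO", within = c("TIME", "DOMAIN"))
summary(afex_model)  # univariate tests plus Mauchly and GG/HF corrections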
library for sphericity tests -library(ez) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# Effect sizes will be calculated separately - -# ============================================================================= -# ALTERNATIVE: MIXED MODEL USING LMER (FASTER) -# ============================================================================= - -# Load required libraries for mixed models -library(lme4) -library(lmerTest) - -print("Running alternative mixed model using lmer()...") -start_time_lmer <- Sys.time() - -# Mixed model approach (much faster than aov with complex error structures) -mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + - (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN), - data = long_data_clean) - -end_time_lmer <- Sys.time() -print(paste("lmer model completed in:", round(end_time_lmer - start_time_lmer, 2), "seconds")) - -print("Mixed Model Results (lmer):") -print(summary(mixed_lmer_model)) - -# ANOVA table for lmer model -print("ANOVA Table for Mixed Model:") -print(anova(mixed_lmer_model)) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, 
adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", 
adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915120848.r b/.history/eohi1/mixed anova - domain means_20250915120848.r deleted file mode 100644 index 8dd7462..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915120848.r +++ /dev/null @@ -1,659 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting (avoiding pivot_longer issues) -pivot_domain_means <- function(data, domain_mapping) { - # Pre-allocate the result data frame for better performance - n_rows <- nrow(data) * nrow(domain_mapping) - long_data <- data.frame( - pID = character(n_rows), - ResponseId = character(n_rows), - TEMPORAL_DO = character(n_rows), - TIME = character(n_rows), - DOMAIN = character(n_rows), - MEAN_DIFFERENCE = numeric(n_rows), - stringsAsFactors = FALSE - ) - - row_idx <- 1 - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, 
"not found in data")) - next - } - - # Get the number of rows for this variable - n_data_rows <- nrow(data) - - # Fill in the data for this variable - long_data$pID[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$pID) - long_data$ResponseId[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$ResponseId) - long_data$TEMPORAL_DO[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$TEMPORAL_DO) - long_data$TIME[row_idx:(row_idx + n_data_rows - 1)] <- time_level - long_data$DOMAIN[row_idx:(row_idx + n_data_rows - 1)] <- domain_level - long_data$MEAN_DIFFERENCE[row_idx:(row_idx + n_data_rows - 1)] <- data[[var_name]] - - row_idx <- row_idx + n_data_rows - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- 
function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - 
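A quick sanity check of the bootstrap critical-value logic above: run the function on simulated two-group data (a sketch; the toy groups below are illustrative, deliberately given unequal spread, and not taken from exp1.csv):

set.seed(42)
# Two artificial groups mimicking the 01PAST/02FUT split, with true SDs 1.0 vs 1.5
toy <- data.frame(
  TEMPORAL_DO = rep(c("01PAST", "02FUT"), each = 50),
  MEAN_DIFFERENCE = c(rnorm(50, sd = 1), rnorm(50, sd = 1.5))
)
toy_result <- bootstrap_hartley_critical(toy, "TEMPORAL_DO", "MEAN_DIFFERENCE", n_iter = 2000)
# TRUE when the observed variance ratio exceeds the bootstrap 95th percentile
toy_result$observed_ratio > toy_result$critical_95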
-print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Load required library for sphericity tests -library(ez) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Load effectsize package for proper effect size calculations -library(effectsize) - -print("\n=== 
EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# ALTERNATIVE: MIXED MODEL USING LMER (FASTER) -# ============================================================================= - -# Load required libraries for mixed models -library(lme4) -library(lmerTest) - -print("Running alternative mixed model using lmer()...") -start_time_lmer <- Sys.time() - -# Mixed model approach (much faster than aov with complex error structures) -mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + - (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN), - data = long_data_clean) - -end_time_lmer <- Sys.time() -print(paste("lmer model completed in:", round(end_time_lmer - start_time_lmer, 2), "seconds")) - -print("Mixed Model Results (lmer):") -print(summary(mixed_lmer_model)) - -# ANOVA table for lmer model -print("ANOVA Table for Mixed Model:") -print(anova(mixed_lmer_model)) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within 
each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915120855.r b/.history/eohi1/mixed anova - domain 
means_20250915120855.r deleted file mode 100644 index 8dd7462..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915120855.r +++ /dev/null @@ -1,659 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting (avoiding pivot_longer issues) -pivot_domain_means <- function(data, domain_mapping) { - # Pre-allocate the result data frame for better performance - n_rows <- nrow(data) * nrow(domain_mapping) - long_data <- data.frame( - pID = character(n_rows), - ResponseId = character(n_rows), - TEMPORAL_DO = character(n_rows), - TIME = character(n_rows), - DOMAIN = character(n_rows), - MEAN_DIFFERENCE = numeric(n_rows), - stringsAsFactors = FALSE - ) - - row_idx <- 1 - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Get the number of rows for this variable - n_data_rows <- nrow(data) - - # Fill in the data for this variable - long_data$pID[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$pID) - long_data$ResponseId[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$ResponseId) - long_data$TEMPORAL_DO[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$TEMPORAL_DO) - long_data$TIME[row_idx:(row_idx + n_data_rows - 1)] <- time_level - long_data$DOMAIN[row_idx:(row_idx + n_data_rows - 1)] <- domain_level - long_data$MEAN_DIFFERENCE[row_idx:(row_idx + n_data_rows - 1)] <- data[[var_name]] - - row_idx <- row_idx + n_data_rows - } - - # Convert to factors with proper 
levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate 
observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 
levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Load required library for sphericity tests -library(ez) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Load effectsize package for proper effect size calculations -library(effectsize) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta 
squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# ALTERNATIVE: MIXED MODEL USING LMER (FASTER) -# ============================================================================= - -# Load required libraries for mixed models -library(lme4) -library(lmerTest) - -print("Running alternative mixed model using lmer()...") -start_time_lmer <- Sys.time() - -# Mixed model approach (much faster than aov with complex error structures) -mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + - (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN), - data = long_data_clean) - -end_time_lmer <- Sys.time() -print(paste("lmer model completed in:", round(end_time_lmer - start_time_lmer, 2), "seconds")) - -print("Mixed Model Results (lmer):") -print(summary(mixed_lmer_model)) - -# ANOVA table for lmer model -print("ANOVA Table for Mixed Model:") -print(anova(mixed_lmer_model)) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - 
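The simple-effects tables in this block have a natural visual companion: emmeans::emmip() plots estimated marginal means straight from a fitted model (a sketch reusing the mixed_anova_model object above; ggplot2 is already loaded, and emmip() returns a ggplot that labs() can modify):

# Profile plot of the TIME x DOMAIN cell means, one trace line per TIME level
emmip(mixed_anova_model, TIME ~ DOMAIN) +
  labs(y = "Estimated marginal mean of MEAN_DIFFERENCE")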
-print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915120900.r b/.history/eohi1/mixed anova - domain means_20250915120900.r deleted file mode 100644 index 8dd7462..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915120900.r +++ /dev/null @@ -1,659 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) 
-library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting (avoiding pivot_longer issues) -pivot_domain_means <- function(data, domain_mapping) { - # Pre-allocate the result data frame for better performance - n_rows <- nrow(data) * nrow(domain_mapping) - long_data <- data.frame( - pID = character(n_rows), - ResponseId = character(n_rows), - TEMPORAL_DO = character(n_rows), - TIME = character(n_rows), - DOMAIN = character(n_rows), - MEAN_DIFFERENCE = numeric(n_rows), - stringsAsFactors = FALSE - ) - - row_idx <- 1 - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Get the number of rows for this variable - n_data_rows <- nrow(data) - - # Fill in the data for this variable - long_data$pID[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$pID) - long_data$ResponseId[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$ResponseId) - long_data$TEMPORAL_DO[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$TEMPORAL_DO) - long_data$TIME[row_idx:(row_idx + n_data_rows - 1)] <- time_level - long_data$DOMAIN[row_idx:(row_idx + n_data_rows - 1)] <- domain_level - long_data$MEAN_DIFFERENCE[row_idx:(row_idx + n_data_rows - 1)] <- data[[var_name]] - - row_idx <- row_idx + n_data_rows - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - 
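For reference, the same long format can be expressed with tidyr instead of the manual loop (a sketch; the loop above is retained because of the pivot_longer issues its comment mentions):

long_data_alt <- data %>%
  select(pID, ResponseId, TEMPORAL_DO, all_of(domain_mapping$variable)) %>%
  pivot_longer(all_of(domain_mapping$variable),
               names_to = "variable", values_to = "MEAN_DIFFERENCE") %>%
  # Attach the TIME/DOMAIN labels defined in domain_mapping
  left_join(domain_mapping, by = "variable") %>%
  transmute(pID = as.factor(pID), ResponseId,
            TEMPORAL_DO = as.factor(TEMPORAL_DO),
            TIME = factor(time, levels = c("Past", "Future")),
            DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")),
            MEAN_DIFFERENCE)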
-print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample 
<- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Load required 
library for sphericity tests -library(ez) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Load effectsize package for proper effect size calculations -library(effectsize) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model; print as-is, since round() errors - # on the mixed-type effectsize table (it has a character Parameter column) - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(aov_effects) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Report effect sizes from the ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # the ges column returned by ezANOVA is already generalized eta squared - ez_effects <- ez_corrected$ANOVA - print("Generalized Eta Squared from ezANOVA:") - print(data.frame(Effect = ez_effects$Effect, ges = round(ez_effects$ges, 5))) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - 
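-# The same partial eta squared values can be recovered by hand from the aov
-# strata: eta_p^2 = SS_effect / (SS_effect + SS_residuals of that stratum).
-# A minimal sketch, assuming the standard summary.aovlist layout (it should
-# agree with the effectsize partial estimates up to rounding):
-for (stratum in anova_summary) {
-  tab <- stratum[[1]]
-  effect_rows <- !grepl("Residuals", rownames(tab))
-  ss_error <- sum(tab[["Sum Sq"]][!effect_rows])
-  eta_p <- tab[["Sum Sq"]][effect_rows] / (tab[["Sum Sq"]][effect_rows] + ss_error)
-  if (length(eta_p) > 0) {
-    print(round(setNames(eta_p, trimws(rownames(tab)[effect_rows])), 5))
-  }
-}
-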
-# ============================================================================= -# ALTERNATIVE: MIXED MODEL USING LMER (FASTER) -# ============================================================================= - -# Load required libraries for mixed models -library(lme4) -library(lmerTest) - -print("Running alternative mixed model using lmer()...") -start_time_lmer <- Sys.time() - -# Mixed model approach (much faster than aov with complex error structures) -mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + - (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN), - data = long_data_clean) - -end_time_lmer <- Sys.time() -print(paste("lmer model completed in:", round(end_time_lmer - start_time_lmer, 2), "seconds")) - -print("Mixed Model Results (lmer):") -print(summary(mixed_lmer_model)) - -# ANOVA table for lmer model -print("ANOVA Table for Mixed Model:") -print(anova(mixed_lmer_model)) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- 
pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915121033.r b/.history/eohi1/mixed anova - domain means_20250915121033.r deleted file mode 100644 index c872f59..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915121033.r +++ /dev/null @@ -1,685 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset 
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting (avoiding pivot_longer issues) -pivot_domain_means <- function(data, domain_mapping) { - # Pre-allocate the result data frame for better performance - n_rows <- nrow(data) * nrow(domain_mapping) - long_data <- data.frame( - pID = character(n_rows), - ResponseId = character(n_rows), - TEMPORAL_DO = character(n_rows), - TIME = character(n_rows), - DOMAIN = character(n_rows), - MEAN_DIFFERENCE = numeric(n_rows), - stringsAsFactors = FALSE - ) - - row_idx <- 1 - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Get the number of rows for this variable - n_data_rows <- nrow(data) - - # Fill in the data for this variable - long_data$pID[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$pID) - long_data$ResponseId[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$ResponseId) - long_data$TEMPORAL_DO[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$TEMPORAL_DO) - long_data$TIME[row_idx:(row_idx + n_data_rows - 1)] <- time_level - long_data$DOMAIN[row_idx:(row_idx + n_data_rows - 1)] <- domain_level - long_data$MEAN_DIFFERENCE[row_idx:(row_idx + n_data_rows - 1)] <- data[[var_name]] - - row_idx <- row_idx + n_data_rows - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = 
round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample 
<- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Load required 
library for sphericity tests -library(ez) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print(ez_corrected$`Sphericity Corrections`) -} else { - print("No sphericity corrections available - checking if sphericity is met") -} - -# Alternative: Get detailed sphericity information -print("\nDetailed Sphericity Information:") -print("Mauchly's Test Results:") -print(ez_corrected$Mauchly) - -# Check if we need to manually apply corrections -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Load effectsize package for proper effect size calculations -library(effectsize) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared 
for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model; print as-is, since round() errors - # on the mixed-type effectsize table (it has a character Parameter column) - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(aov_effects) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Report effect sizes from the ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # the ges column returned by ezANOVA is already generalized eta squared - ez_effects <- ez_corrected$ANOVA - print("Generalized Eta Squared from ezANOVA:") - print(data.frame(Effect = ez_effects$Effect, ges = round(ez_effects$ges, 5))) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# ALTERNATIVE: MIXED MODEL USING LMER (FASTER) -# ============================================================================= - -# Load required libraries for mixed models -library(lme4) -library(lmerTest) - -print("Running alternative mixed model using lmer()...") -start_time_lmer <- Sys.time() - -# Mixed model approach (much faster than aov with complex error structures) -mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + - (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN), - data = long_data_clean) - -end_time_lmer <- Sys.time() -print(paste("lmer model completed in:", round(end_time_lmer - start_time_lmer, 2), "seconds")) - -print("Mixed Model Results (lmer):") -print(summary(mixed_lmer_model)) - -# ANOVA table for lmer model -print("ANOVA Table for Mixed Model:") -print(anova(mixed_lmer_model)) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", 
adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915121043.r b/.history/eohi1/mixed anova - domain means_20250915121043.r deleted file mode 100644 index 35fe930..0000000 --- 
a/.history/eohi1/mixed anova - domain means_20250915121043.r +++ /dev/null @@ -1,736 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting (avoiding pivot_longer issues) -pivot_domain_means <- function(data, domain_mapping) { - # Pre-allocate the result data frame for better performance - n_rows <- nrow(data) * nrow(domain_mapping) - long_data <- data.frame( - pID = character(n_rows), - ResponseId = character(n_rows), - TEMPORAL_DO = character(n_rows), - TIME = character(n_rows), - DOMAIN = character(n_rows), - MEAN_DIFFERENCE = numeric(n_rows), - stringsAsFactors = FALSE - ) - - row_idx <- 1 - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Get the number of rows for this variable - n_data_rows <- nrow(data) - - # Fill in the data for this variable - long_data$pID[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$pID) - long_data$ResponseId[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$ResponseId) - long_data$TEMPORAL_DO[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$TEMPORAL_DO) - long_data$TIME[row_idx:(row_idx + n_data_rows - 1)] <- time_level - long_data$DOMAIN[row_idx:(row_idx + n_data_rows - 1)] <- domain_level - long_data$MEAN_DIFFERENCE[row_idx:(row_idx + n_data_rows - 1)] <- data[[var_name]] - - row_idx <- row_idx + n_data_rows - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) 
- long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate 
observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 
levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Load required library for sphericity tests -library(ez) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print(ez_corrected$`Sphericity Corrections`) -} else { - print("No sphericity corrections available - checking if sphericity is met") -} - -# Alternative: Get detailed sphericity information -print("\nDetailed Sphericity Information:") -print("Mauchly's Test Results:") -print(ez_corrected$Mauchly) - -# Check if we need to manually apply corrections -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be 
displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: MANUAL EPSILON CALCULATIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== MANUAL EPSILON CALCULATIONS ===") - -# Create a wide-format data for car package -library(car) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Load effectsize package for proper effect size calculations -library(effectsize) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# ALTERNATIVE: MIXED MODEL USING LMER (FASTER) -# 
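# A note on the effect sizes above: the ges column that ezANOVA
# returns is already generalized eta squared (in the sense of
# Bakeman, 2005), so no conversion from partial eta squared is
# needed and the self-assignment ez_effects$ges <- ez_effects$ges is
# a no-op; the column can be reported as-is. The anova_summary
# object stored above is likewise not used further in this script.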
============================================================================= - -# Load required libraries for mixed models -library(lme4) -library(lmerTest) - -print("Running alternative mixed model using lmer()...") -start_time_lmer <- Sys.time() - -# Mixed model approach (much faster than aov with complex error structures) -mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + - (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN), - data = long_data_clean) - -end_time_lmer <- Sys.time() -print(paste("lmer model completed in:", round(end_time_lmer - start_time_lmer, 2), "seconds")) - -print("Mixed Model Results (lmer):") -print(summary(mixed_lmer_model)) - -# ANOVA table for lmer model -print("ANOVA Table for Mixed Model:") -print(anova(mixed_lmer_model)) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects 
of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915121049.r b/.history/eohi1/mixed anova - domain means_20250915121049.r deleted file mode 100644 index 35fe930..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915121049.r +++ /dev/null @@ -1,736 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# 
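# A tidyr-based alternative to the pivot_domain_means() helper that
# is defined further below (a sketch, relying on the required_vars
# vector and domain_mapping table that the script creates next):
#
#   long_data <- data %>%
#     select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>%
#     pivot_longer(all_of(required_vars),
#                  names_to = "variable",
#                  values_to = "MEAN_DIFFERENCE") %>%
#     left_join(domain_mapping, by = "variable") %>%
#     mutate(TIME = factor(time, levels = c("Past", "Future")),
#            DOMAIN = factor(domain, levels = c("Preferences",
#                            "Personality", "Values", "Life")),
#            pID = as.factor(pID),
#            TEMPORAL_DO = as.factor(TEMPORAL_DO)) %>%
#     select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE)
#
# The header comment below mentions pivot_longer issues; the manual
# helper avoids them at the cost of length. First, though, the script will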
Verify the specific variables we need
-required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required domain mean variables found!")
-}
-
-# Define domain mapping
-domain_mapping <- data.frame(
-  variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-               "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
-  time = c(rep("Past", 4), rep("Future", 4)),
-  domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
-  stringsAsFactors = FALSE
-)
-
-# Domain mapping created
-
-# More efficient data pivoting (avoiding pivot_longer issues)
-pivot_domain_means <- function(data, domain_mapping) {
-  # Pre-allocate the result data frame for better performance
-  n_rows <- nrow(data) * nrow(domain_mapping)
-  long_data <- data.frame(
-    pID = character(n_rows),
-    ResponseId = character(n_rows),
-    TEMPORAL_DO = character(n_rows),
-    TIME = character(n_rows),
-    DOMAIN = character(n_rows),
-    MEAN_DIFFERENCE = numeric(n_rows),
-    stringsAsFactors = FALSE
-  )
-
-  row_idx <- 1
-
-  for (i in 1:nrow(domain_mapping)) {
-    var_name <- domain_mapping$variable[i]
-    time_level <- domain_mapping$time[i]
-    domain_level <- domain_mapping$domain[i]
-
-    # Check if variable exists
-    if (!var_name %in% colnames(data)) {
-      print(paste("Warning: Variable", var_name, "not found in data"))
-      next
-    }
-
-    # Get the number of rows for this variable
-    n_data_rows <- nrow(data)
-
-    # Fill in the data for this variable
-    long_data$pID[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$pID)
-    long_data$ResponseId[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$ResponseId)
-    long_data$TEMPORAL_DO[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$TEMPORAL_DO)
-    long_data$TIME[row_idx:(row_idx + n_data_rows - 1)] <- time_level
-    long_data$DOMAIN[row_idx:(row_idx + n_data_rows - 1)] <- domain_level
-    long_data$MEAN_DIFFERENCE[row_idx:(row_idx + n_data_rows - 1)] <- data[[var_name]]
-
-    row_idx <- row_idx + n_data_rows
-  }
-
-  # Convert to factors with proper levels
-  long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future"))
-  long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life"))
-  long_data$pID <- as.factor(long_data$pID)
-  long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO)
-
-  return(long_data)
-}
-
-# Pivot data to long format
-tryCatch({
-  long_data <- pivot_domain_means(data, domain_mapping)
-}, error = function(e) {
-  print(paste("Error in data pivoting:", e$message))
-  stop("Cannot proceed without proper data structure")
-})
-
-print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x")))
-print(paste("Number of participants:", length(unique(long_data$pID))))
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME and DOMAIN
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    median
= round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
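# A streamlined form of the normality block above (a sketch): each
# cell currently calls ad.test() twice, once for the statistic and
# once for the p-value; group_modify() runs it once per cell.
#
#   normality_results <- long_data_clean %>%
#     group_by(TIME, DOMAIN) %>%
#     group_modify(~ {
#       ad <- ad.test(.x$MEAN_DIFFERENCE)   # nortest::ad.test, one call
#       tibble(n = nrow(.x),
#              ad_statistic = unname(ad$statistic),
#              ad_p_value = ad$p.value)
#     }) %>%
#     ungroup()
#
# A related caution for the Levene block that follows: leveneTest()
# receives a bare formula inside summarise() and so must resolve the
# columns through dplyr's data mask; writing, e.g.,
# leveneTest(MEAN_DIFFERENCE ~ TIME, data = pick(everything()))
# makes that dependence explicit (pick() requires dplyr >= 1.1).
# Assumption check 4, resuming below, covers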
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample 
<- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Load required 
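# Two notes before the ez-based sphericity tests. First, TIME has
# only 2 levels, so sphericity holds for it by definition (a single
# difference score has no covariance structure to violate) and
# Mauchly's test is uninformative there; DOMAIN (4 levels) and the
# TIME × DOMAIN cells are the terms worth checking. Second, a single
# ezANOVA() call on the full design (the ez_corrected fit further
# below) already reports Mauchly's test and the GG/HF corrections,
# so it subsumes the three piecewise fits. The script first loads the required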
library for sphericity tests -library(ez) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print(ez_corrected$`Sphericity Corrections`) -} else { - print("No sphericity corrections available - checking if sphericity is met") -} - -# Alternative: Get detailed sphericity information -print("\nDetailed Sphericity Information:") -print("Mauchly's Test Results:") -print(ez_corrected$Mauchly) - -# Check if we need to manually apply corrections -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: MANUAL EPSILON CALCULATIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== MANUAL EPSILON CALCULATIONS ===") - -# Create a wide-format data for car package -library(car) - -tryCatch({ - # Convert to wide format for car package 
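# Before the reshape announced above, a fallback (a sketch): if the
# epsilon() helper called further below is not available in the
# installed version of car, the Greenhouse-Geisser estimate can be
# computed directly from the covariance matrix of the eight
# within-subject cells.
gg_epsilon <- function(Y) {
  # Y: numeric matrix, one column per within-subject cell
  k <- ncol(Y)
  S <- cov(Y, use = "complete.obs")
  C <- qr.Q(qr(contr.helmert(k)))  # orthonormal contrasts, k x (k-1)
  D <- t(C) %*% S %*% C            # covariance of the contrasts
  sum(diag(D))^2 / ((k - 1) * sum(D^2))  # Box's epsilon-hat (GG)
}
# Usage once rm_design exists below: gg_epsilon(rm_design).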
- wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Load effectsize package for proper effect size calculations -library(effectsize) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# ALTERNATIVE: MIXED MODEL USING LMER (FASTER) -# ============================================================================= - -# Load required libraries for mixed models -library(lme4) -library(lmerTest) - -print("Running alternative mixed model using lmer()...") -start_time_lmer <- Sys.time() - -# Mixed model approach (much faster than aov with complex error structures) -mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + - (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN), - data = long_data_clean) - -end_time_lmer <- Sys.time() -print(paste("lmer model completed in:", round(end_time_lmer - 
start_time_lmer, 2), "seconds")) - -print("Mixed Model Results (lmer):") -print(summary(mixed_lmer_model)) - -# ANOVA table for lmer model -print("ANOVA Table for Mixed Model:") -print(anova(mixed_lmer_model)) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * 
DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915121051.r b/.history/eohi1/mixed anova - domain means_20250915121051.r deleted file mode 100644 index 35fe930..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915121051.r +++ /dev/null @@ -1,736 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", 
"NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting (avoiding pivot_longer issues) -pivot_domain_means <- function(data, domain_mapping) { - # Pre-allocate the result data frame for better performance - n_rows <- nrow(data) * nrow(domain_mapping) - long_data <- data.frame( - pID = character(n_rows), - ResponseId = character(n_rows), - TEMPORAL_DO = character(n_rows), - TIME = character(n_rows), - DOMAIN = character(n_rows), - MEAN_DIFFERENCE = numeric(n_rows), - stringsAsFactors = FALSE - ) - - row_idx <- 1 - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Get the number of rows for this variable - n_data_rows <- nrow(data) - - # Fill in the data for this variable - long_data$pID[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$pID) - long_data$ResponseId[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$ResponseId) - long_data$TEMPORAL_DO[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$TEMPORAL_DO) - long_data$TIME[row_idx:(row_idx + n_data_rows - 1)] <- time_level - long_data$DOMAIN[row_idx:(row_idx + n_data_rows - 1)] <- domain_level - long_data$MEAN_DIFFERENCE[row_idx:(row_idx + n_data_rows - 1)] <- data[[var_name]] - - row_idx <- row_idx + n_data_rows - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = 
round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample 
<- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Load required 
library for sphericity tests -library(ez) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print(ez_corrected$`Sphericity Corrections`) -} else { - print("No sphericity corrections available - checking if sphericity is met") -} - -# Alternative: Get detailed sphericity information -print("\nDetailed Sphericity Information:") -print("Mauchly's Test Results:") -print(ez_corrected$Mauchly) - -# Check if we need to manually apply corrections -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: MANUAL EPSILON CALCULATIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== MANUAL EPSILON CALCULATIONS ===") - -# Create a wide-format data for car package -library(car) - -tryCatch({ - # Convert to wide format for car package 
- wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Load effectsize package for proper effect size calculations -library(effectsize) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# ALTERNATIVE: MIXED MODEL USING LMER (FASTER) -# ============================================================================= - -# Load required libraries for mixed models -library(lme4) -library(lmerTest) - -print("Running alternative mixed model using lmer()...") -start_time_lmer <- Sys.time() - -# Mixed model approach (much faster than aov with complex error structures) -mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + - (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN), - data = long_data_clean) - -end_time_lmer <- Sys.time() -print(paste("lmer model completed in:", round(end_time_lmer - 
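A note on the effect-size chunk above: partial and generalized eta squared are different indices, and, as far as I know, effectsize::eta_squared() treats generalized = TRUE as overriding partial = TRUE, so requesting both in one call is at best ambiguous. The round() on the returned table is also a likely reason the tryCatch reports an error, since the table contains a character Parameter column. A sketch that requests the two indices separately (assumes mixed_anova_model from above):

library(effectsize)
eta_part <- eta_squared(mixed_anova_model, partial = TRUE)      # partial eta^2
eta_gen  <- eta_squared(mixed_anova_model, generalized = TRUE)  # generalized eta^2
print(eta_part)  # print as returned; round()-ing the whole table can fail on its character columns
print(eta_gen)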
start_time_lmer, 2), "seconds")) - -print("Mixed Model Results (lmer):") -print(summary(mixed_lmer_model)) - -# ANOVA table for lmer model -print("ANOVA Table for Mixed Model:") -print(anova(mixed_lmer_model)) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * 
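The post-hoc blocks in this script first build an emmeans() grid and then call pairs(..., by = ...). emmeans also accepts a two-sided formula that does both in one step, which cuts the boilerplate roughly in half. A sketch for the simple effects of TIME within TEMPORAL_DO (assumes mixed_anova_model from above):

library(emmeans)
emm <- emmeans(mixed_anova_model, pairwise ~ TIME | TEMPORAL_DO,
               adjust = "bonferroni")
emm$emmeans    # estimated marginal means per TEMPORAL_DO x TIME cell
emm$contrasts  # Bonferroni-adjusted simple effects of TIME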
DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915121136.r b/.history/eohi1/mixed anova - domain means_20250915121136.r deleted file mode 100644 index 4c43b96..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915121136.r +++ /dev/null @@ -1,745 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", 
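One design choice in the preamble above deserves a comment: options(contrasts = c("contr.sum", "contr.poly")) is set because Type III sums of squares are only meaningful under sum-to-zero contrasts. Since this changes a global option, it is safer to capture the old setting and restore it afterwards; a minimal sketch:

old_opts <- options(contrasts = c("contr.sum", "contr.poly"))
# ... fit the ANOVA models here ...
options(old_opts)  # restore the caller's contrast settings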
"NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting (avoiding pivot_longer issues) -pivot_domain_means <- function(data, domain_mapping) { - # Pre-allocate the result data frame for better performance - n_rows <- nrow(data) * nrow(domain_mapping) - long_data <- data.frame( - pID = character(n_rows), - ResponseId = character(n_rows), - TEMPORAL_DO = character(n_rows), - TIME = character(n_rows), - DOMAIN = character(n_rows), - MEAN_DIFFERENCE = numeric(n_rows), - stringsAsFactors = FALSE - ) - - row_idx <- 1 - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Get the number of rows for this variable - n_data_rows <- nrow(data) - - # Fill in the data for this variable - long_data$pID[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$pID) - long_data$ResponseId[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$ResponseId) - long_data$TEMPORAL_DO[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$TEMPORAL_DO) - long_data$TIME[row_idx:(row_idx + n_data_rows - 1)] <- time_level - long_data$DOMAIN[row_idx:(row_idx + n_data_rows - 1)] <- domain_level - long_data$MEAN_DIFFERENCE[row_idx:(row_idx + n_data_rows - 1)] <- data[[var_name]] - - row_idx <- row_idx + n_data_rows - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = 
round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
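The normality chunk above runs ad.test() twice per group, once for the statistic and once for the p-value. Storing the htest object in a list column runs each test only once; a sketch (assumes long_data_clean and the nortest/purrr loads from the preamble):

normality_results2 <- long_data_clean %>%
  group_by(TIME, DOMAIN) %>%
  summarise(n = n(), ad = list(ad.test(MEAN_DIFFERENCE)), .groups = "drop") %>%
  mutate(ad_statistic = map_dbl(ad, ~ unname(.x$statistic)),
         ad_p_value   = map_dbl(ad, ~ .x$p.value)) %>%
  select(TIME, DOMAIN, n, ad_statistic, ad_p_value)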
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample 
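The grouped leveneTest() calls above pass a bare formula with no data argument, so they depend on dplyr's data mask still being visible when car builds the model frame; that can work in recent versions but is fragile. Passing each group's data explicitly is more robust; a sketch with group_modify():

homogeneity_time2 <- long_data_clean %>%
  group_by(DOMAIN) %>%
  group_modify(~ {
    lt <- leveneTest(MEAN_DIFFERENCE ~ TIME, data = .x)  # explicit data argument
    tibble(levene_F = lt$`F value`[1], levene_p = lt$`Pr(>F)`[1])
  }) %>%
  ungroup()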
<- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Load required 
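A small output bug that recurs throughout these scripts, starting with the sphericity section that follows: print() displays "\n" literally rather than as a line break, so the intended blank lines before the === headers never appear. cat() interprets the escape:

print("\n=== SPHERICITY TESTS ===")  # shows: [1] "\n=== SPHERICITY TESTS ==="
cat("\n=== SPHERICITY TESTS ===\n")  # prints a blank line, then the header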
library for sphericity tests -library(ez) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Try different ways to access sphericity corrections -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print(ez_corrected$`Sphericity Corrections`) -} else if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print(ez_corrected$`Sphericity Corrections`) -} else if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print(ez_corrected$`Sphericity Corrections`) -} else { - print("No sphericity corrections found in standard ezANOVA output") - print("This may be because ezANOVA doesn't always display corrections") -} - -# Alternative: Get detailed sphericity information -print("\nDetailed Sphericity Information:") -print("Mauchly's Test Results:") -print(ez_corrected$Mauchly) - -# Check if we need to manually apply corrections -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if 
corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: MANUAL EPSILON CALCULATIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== MANUAL EPSILON CALCULATIONS ===") - -# Create a wide-format data for car package -library(car) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Load effectsize package for proper effect size calculations -library(effectsize) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# ALTERNATIVE: MIXED MODEL USING LMER (FASTER) -# ============================================================================= - -# Load required libraries for mixed models -library(lme4) 
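In the "Sphericity-Corrected Results" chunk above, the else if branches re-test exactly the same ez_corrected$`Sphericity Corrections` element as the first if, so they can never run; one NULL check covers every case. A sketch (assumes ez_corrected from the ezANOVA call above):

sc <- ez_corrected$`Sphericity Corrections`
if (!is.null(sc)) {
  print(sc)  # Greenhouse-Geisser and Huynh-Feldt corrected tests
} else {
  cat("No sphericity-correction table returned;",
      "see Mauchly's test output above.\n")
}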
-library(lmerTest) - -print("Running alternative mixed model using lmer()...") -start_time_lmer <- Sys.time() - -# Mixed model approach (much faster than aov with complex error structures) -mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + - (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN), - data = long_data_clean) - -end_time_lmer <- Sys.time() -print(paste("lmer model completed in:", round(end_time_lmer - start_time_lmer, 2), "seconds")) - -print("Mixed Model Results (lmer):") -print(summary(mixed_lmer_model)) - -# ANOVA table for lmer model -print("ANOVA Table for Mixed Model:") -print(anova(mixed_lmer_model)) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") 
-print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915121141.r b/.history/eohi1/mixed anova - domain means_20250915121141.r deleted file mode 100644 index 2c498e6..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915121141.r +++ /dev/null @@ -1,745 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - 
"NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting (avoiding pivot_longer issues) -pivot_domain_means <- function(data, domain_mapping) { - # Pre-allocate the result data frame for better performance - n_rows <- nrow(data) * nrow(domain_mapping) - long_data <- data.frame( - pID = character(n_rows), - ResponseId = character(n_rows), - TEMPORAL_DO = character(n_rows), - TIME = character(n_rows), - DOMAIN = character(n_rows), - MEAN_DIFFERENCE = numeric(n_rows), - stringsAsFactors = FALSE - ) - - row_idx <- 1 - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Get the number of rows for this variable - n_data_rows <- nrow(data) - - # Fill in the data for this variable - long_data$pID[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$pID) - long_data$ResponseId[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$ResponseId) - long_data$TEMPORAL_DO[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$TEMPORAL_DO) - long_data$TIME[row_idx:(row_idx + n_data_rows - 1)] <- time_level - long_data$DOMAIN[row_idx:(row_idx + n_data_rows - 1)] <- domain_level - long_data$MEAN_DIFFERENCE[row_idx:(row_idx + n_data_rows - 1)] <- data[[var_name]] - - row_idx <- row_idx + n_data_rows - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = 
round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample 
<- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Load required 
library for sphericity tests -library(ez) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Try different ways to access sphericity corrections -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print(ez_corrected$`Sphericity Corrections`) -} else if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print(ez_corrected$`Sphericity Corrections`) -} else if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print(ez_corrected$`Sphericity Corrections`) -} else { - print("No sphericity corrections found in standard ezANOVA output") - print("This may be because ezANOVA doesn't always display corrections") -} - -# Alternative: Get detailed sphericity information -print("\nDetailed Sphericity Information:") -print("Mauchly's Test Results:") -print(ez_corrected$Mauchly) - -# Check if we need to manually apply corrections -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if 
corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package -library(car) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Load effectsize package for proper effect size calculations -library(effectsize) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# ALTERNATIVE: MIXED MODEL USING LMER (FASTER) -# ============================================================================= - -# Load required libraries for mixed models -library(lme4) 
-
-# =============================================================================
-# ALTERNATIVE: MIXED MODEL USING LMER (FASTER)
-# =============================================================================
-
-# Load required libraries for mixed models
-library(lme4)
-library(lmerTest)
-
-print("Running alternative mixed model using lmer()...")
-start_time_lmer <- Sys.time()
-
-# Mixed model approach (much faster than aov with complex error structures)
-mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN +
-                           (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN),
-                         data = long_data_clean)
-
-end_time_lmer <- Sys.time()
-print(paste("lmer model completed in:",
-            round(as.numeric(difftime(end_time_lmer, start_time_lmer, units = "secs")), 2),
-            "seconds"))
-
-print("Mixed Model Results (lmer):")
-print(summary(mixed_lmer_model))
-
-# ANOVA table for lmer model (lmerTest supplies Satterthwaite df and p-values)
-print("ANOVA Table for Mixed Model:")
-print(anova(mixed_lmer_model))
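# A minimal sketch (not part of the original script): quick residual diagnostics
# for the lmer fit above, using base graphics only.
plot(fitted(mixed_lmer_model), resid(mixed_lmer_model),
     xlab = "Fitted values", ylab = "Residuals",
     main = "lmer: residuals vs fitted")
abline(h = 0, lty = 2)
qqnorm(resid(mixed_lmer_model), main = "lmer: normal Q-Q of residuals")
qqline(resid(mixed_lmer_model))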
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(mixed_anova_model, ~ TIME)
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN)
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001)
-print("\n=== TEMPORAL_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO:")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each TIME:")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# TIME × DOMAIN Interaction (Significant: p = 0.012)
-print("\n=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TIME:")
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-
-print("\nSimple Effects of TIME within each DOMAIN:")
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(time_domain_simple2)
-
-# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058)
-print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===")
-temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN)
-print("Estimated Marginal Means:")
-print(temporal_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:")
-temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:")
-temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(temporal_domain_simple2)
-
-# =============================================================================
-# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS
-# =============================================================================
-
-# All pairwise comparisons for the three-way interaction
-print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===")
-three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-print("Estimated Marginal Means for all combinations:")
-print(three_way_emmeans)
-
-# Pairwise comparisons within each TEMPORAL_DO × TIME combination
-print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:")
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni")
-print(three_way_contrasts)
-
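# A minimal sketch (not part of the original script): visualising the
# TEMPORAL_DO × TIME interaction probed above. as.data.frame() on an emmGrid
# exposes emmean, lower.CL and upper.CL columns for plotting.
emm_df <- as.data.frame(temporal_time_emmeans)
p_int <- ggplot(emm_df, aes(x = TIME, y = emmean,
                            group = TEMPORAL_DO, colour = TEMPORAL_DO)) +
  geom_point() +
  geom_line() +
  geom_errorbar(aes(ymin = lower.CL, ymax = upper.CL), width = 0.1) +
  labs(y = "Estimated marginal mean difference")
print(p_int)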
"NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting (avoiding pivot_longer issues) -pivot_domain_means <- function(data, domain_mapping) { - # Pre-allocate the result data frame for better performance - n_rows <- nrow(data) * nrow(domain_mapping) - long_data <- data.frame( - pID = character(n_rows), - ResponseId = character(n_rows), - TEMPORAL_DO = character(n_rows), - TIME = character(n_rows), - DOMAIN = character(n_rows), - MEAN_DIFFERENCE = numeric(n_rows), - stringsAsFactors = FALSE - ) - - row_idx <- 1 - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Get the number of rows for this variable - n_data_rows <- nrow(data) - - # Fill in the data for this variable - long_data$pID[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$pID) - long_data$ResponseId[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$ResponseId) - long_data$TEMPORAL_DO[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$TEMPORAL_DO) - long_data$TIME[row_idx:(row_idx + n_data_rows - 1)] <- time_level - long_data$DOMAIN[row_idx:(row_idx + n_data_rows - 1)] <- domain_level - long_data$MEAN_DIFFERENCE[row_idx:(row_idx + n_data_rows - 1)] <- data[[var_name]] - - row_idx <- row_idx + n_data_rows - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = 
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME and DOMAIN
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5),
-    min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME and DOMAIN:")
-print(desc_stats)
-
-# Descriptive statistics by the between-subjects factor
-desc_stats_by_temporal <- long_data %>%
-  group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:")
-print(desc_stats_by_temporal)
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing
-long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ]
-print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x ")))
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(MEAN_DIFFERENCE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME and DOMAIN:")
-print(missing_summary)
-
-# 2. Outlier detection (IQR method)
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test (streamlined)
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic,
-    ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# Round only the numeric columns
-normality_results_rounded <- normality_results %>%
-  mutate(across(where(is.numeric), ~ round(.x, 5)))
-print(normality_results_rounded)
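# A minimal sketch (not part of the original script): visual normality check to
# complement the Anderson-Darling tests, which flag even tiny deviations at
# large n.
p_qq <- ggplot(long_data_clean, aes(sample = MEAN_DIFFERENCE)) +
  stat_qq() +
  stat_qq_line() +
  facet_grid(TIME ~ DOMAIN)
print(p_qq)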
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# For a mixed ANOVA, homogeneity is tested across the between-subjects factor
-# (TEMPORAL_DO) within each combination of the within-subjects factors (TIME × DOMAIN)
-
-# First, check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# Bootstrap function for Hartley's F-max critical values
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    group_by(!!sym(group_var)) %>%
-    summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    pull(var)
-
-  # Guard against zero or missing variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Extract each group's non-missing responses once, as plain vectors
-  # (double-bracket indexing keeps this correct for tibbles as well as data frames)
-  group_data_list <- map(groups, ~ {
-    group_data <- data[[response_var]][data[[group_var]] == .x]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Resample each group independently, with replacement
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Critical value = 95th percentile of the bootstrap distribution
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
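# A minimal sketch (not part of the original script): sanity check of the
# bootstrap above on simulated equal-variance groups; the observed ratio should
# sit near 1 and well below the bootstrap critical value.
set.seed(42)
sim <- data.frame(grp = rep(c("A", "B"), each = 200), y = rnorm(400))
sim_check <- bootstrap_hartley_critical(sim, "grp", "y", n_iter = 500)
print(paste("Simulated F-max:", round(sim_check$observed_ratio, 3),
            "| bootstrap 95% critical value:", round(sim_check$critical_95, 3)))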
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-print("\n=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123)  # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check for missing data patterns
-print(table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany"))
-
-# Check data balance
-print(xtabs(~ pID + TIME + DOMAIN, data = long_data_clean))
-
-# Check data dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$pID))))
-print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME))))
-print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN))))
-print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO))))
-
-# Check for complete cases
-complete_cases <- long_data_clean[complete.cases(long_data_clean), ]
-print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean)))
-
-# Check if the design is balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-print(summary(as.vector(design_balance)))
-
-# Check for any participants with missing combinations
-missing_combos <- long_data_clean %>%
-  group_by(pID) %>%
-  summarise(
-    n_combinations = n(),
-    expected_combinations = 8,  # 2 TIME × 4 DOMAIN = 8
-    missing_combinations = 8 - n_combinations,
-    .groups = 'drop'
-  )
-
-print("Missing combinations per participant:")
-print(missing_combos[missing_combos$missing_combinations > 0, ])
-
-# Mixed ANOVA using aov()
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                         data = long_data_clean)
-
-print("Mixed ANOVA Results:")
-print(summary(mixed_anova_model))
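# A minimal sketch (not part of the original script): the same design via the
# afex package, which applies Greenhouse-Geisser corrections automatically.
# Assumes afex is installed; argument names follow afex::aov_ez.
if (requireNamespace("afex", quietly = TRUE)) {
  afex_model <- afex::aov_ez(id = "pID", dv = "MEAN_DIFFERENCE", data = long_data_clean,
                             between = "TEMPORAL_DO", within = c("TIME", "DOMAIN"))
  print(afex_model)
}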
-
-# =============================================================================
-# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS
-# =============================================================================
-
-# Load required library for sphericity tests
-library(ez)
-
-print("\n=== SPHERICITY TESTS ===")
-
-# Test sphericity for DOMAIN (4 levels - within-subjects)
-print("Mauchly's Test of Sphericity for DOMAIN:")
-tryCatch({
-  # Create a temporary data frame for ezANOVA
-  temp_data <- long_data_clean
-  temp_data$id <- as.numeric(as.factor(temp_data$pID))
-
-  # Run ezANOVA to get sphericity tests
-  ez_domain <- ezANOVA(data = temp_data,
-                       dv = MEAN_DIFFERENCE,
-                       wid = id,
-                       between = TEMPORAL_DO,
-                       within = DOMAIN,
-                       type = 3,
-                       detailed = TRUE)
-
-  print("DOMAIN Sphericity Test:")
-  print(ez_domain$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in DOMAIN sphericity test:", e$message))
-})
-
-# Test sphericity for TIME (2 levels - within-subjects)
-# With only two levels sphericity holds trivially, so Mauchly's test is uninformative here
-print("\nMauchly's Test of Sphericity for TIME:")
-tryCatch({
-  ez_time <- ezANOVA(data = temp_data,
-                     dv = MEAN_DIFFERENCE,
-                     wid = id,
-                     between = TEMPORAL_DO,
-                     within = TIME,
-                     type = 3,
-                     detailed = TRUE)
-
-  print("TIME Sphericity Test:")
-  print(ez_time$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in TIME sphericity test:", e$message))
-})
-
-# Test sphericity for TIME × DOMAIN interaction
-print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:")
-tryCatch({
-  ez_interaction <- ezANOVA(data = temp_data,
-                            dv = MEAN_DIFFERENCE,
-                            wid = id,
-                            between = TEMPORAL_DO,
-                            within = .(TIME, DOMAIN),
-                            type = 3,
-                            detailed = TRUE)
-
-  print("TIME × DOMAIN Sphericity Test:")
-  print(ez_interaction$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in TIME × DOMAIN sphericity test:", e$message))
-})
-
-# =============================================================================
-# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Get corrected results from ezANOVA
-ez_corrected <- ezANOVA(data = long_data_clean,
-                        dv = MEAN_DIFFERENCE,
-                        wid = pID,
-                        between = TEMPORAL_DO,
-                        within = .(TIME, DOMAIN),
-                        type = 3,
-                        detailed = TRUE)
-
-print("Corrected ANOVA Results with Sphericity Corrections:")
-print(ez_corrected$ANOVA)
-
-# Mauchly's test results that the corrections are based on
-print("\nMauchly's Test Results:")
-print(ez_corrected$Mauchly)
-
-print("Available elements in ez_corrected object:")
-print(names(ez_corrected))
-
-# Print the Greenhouse-Geisser and Huynh-Feldt corrections if ezANOVA returned them
-if(!is.null(ez_corrected$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(ez_corrected$`Sphericity Corrections`)
-} else {
-  print("No sphericity corrections found in standard ezANOVA output")
-  print("Note: corrections are not reported when sphericity is met")
-  print("Check the Mauchly's test p-values above to determine if corrections are needed")
-}
-
-# =============================================================================
-# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE
-# =============================================================================
-
-print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===")
-
-# Create wide-format data for the car package
-library(car)
-
-tryCatch({
-  # Convert to wide format for car package
-  wide_data <- long_data_clean %>%
-    select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>%
-    pivot_wider(names_from = c(TIME, DOMAIN),
-                values_from = MEAN_DIFFERENCE,
-                names_sep = "_")
-
-  # Columns that make up the repeated measures design
-  within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life",
-                   "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life")
-
-  # Check if all columns exist
-  missing_cols <- within_vars[!within_vars %in% colnames(wide_data)]
-  if(length(missing_cols) > 0) {
-    print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", ")))
-  } else {
-    # Create the repeated measures design matrix
-    rm_design <- as.matrix(wide_data[, within_vars])
-
-    # Calculate epsilon values
-    # NOTE: car does not export an epsilon() function, so these calls only work if
-    # such a helper is defined elsewhere; otherwise execution drops to the error
-    # handler below
-    print("Epsilon Values from car package:")
-    epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser")
-    epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt")
-
-    print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4)))
-    print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4)))
-
-    # Interpretation (common heuristic: Greenhouse-Geisser when epsilon <= .75,
-    # Huynh-Feldt when epsilon > .75)
-    if(epsilon_gg <= 0.75) {
-      print("Recommendation: Use Greenhouse-Geisser correction (epsilon <= 0.75)")
-    } else {
-      print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)")
-    }
-
-    # =========================================================================
-    # MANUAL SPHERICITY CORRECTIONS
-    # =========================================================================
-
-    print("\n=== MANUAL SPHERICITY CORRECTIONS ===")
-
-    # Apply the epsilon corrections to the degrees of freedom of the original ANOVA
-    print("Applying Greenhouse-Geisser corrections to DOMAIN effects:")
-
-    # DOMAIN main effect (DFn = 3, DFd = 3183)
-    domain_df_corrected_gg <- 3 * epsilon_gg
-    domain_df_corrected_hf <- 3 * epsilon_hf
-
-    print("DOMAIN: Original df = 3, 3183")
-    print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2)))
-    print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2)))
-
-    # TIME × DOMAIN interaction (DFn = 3, DFd = 3183)
-    interaction_df_corrected_gg <- 3 * epsilon_gg
-    interaction_df_corrected_hf <- 3 * epsilon_hf
-
-    print("TIME × DOMAIN: Original df = 3, 3183")
-    print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2)))
-    print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2)))
-
-    # Note: corrected p-values require re-evaluating the F statistics against the
-    # corrected degrees of freedom (see the sketch after this block); ezANOVA does
-    # this automatically but may not display the corrections
-  }
-
-}, error = function(e) {
-  print(paste("Error in manual epsilon calculation:", e$message))
-})
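# A minimal sketch (not part of the original script): recomputing a corrected
# p-value from epsilon-scaled degrees of freedom. The F value below is an
# assumed placeholder, not a result from this study; substitute the DOMAIN F
# from the aov summary.
if (exists("epsilon_gg")) {
  f_domain <- 5.10  # assumed F(3, 3183) for DOMAIN (illustrative only)
  p_gg <- pf(f_domain, 3 * epsilon_gg, 3183 * epsilon_gg, lower.tail = FALSE)
  print(paste("GG-corrected p for DOMAIN (assumed F):", round(p_gg, 5)))
}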
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-# Load effectsize package for proper effect size calculations
-library(effectsize)
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Calculate generalized eta squared for the aov model
-print("Effect Sizes from aov() model:")
-tryCatch({
-  aov_effects <- eta_squared(mixed_anova_model, generalized = TRUE)
-  print(aov_effects, digits = 5)
-}, error = function(e) {
-  print(paste("Error calculating effect sizes from aov:", e$message))
-})
-
-# Extract effect sizes from the ezANOVA results
-print("\nEffect Sizes from ezANOVA model:")
-tryCatch({
-  # ezANOVA already reports generalized eta squared in the ges column
-  ez_effects <- ez_corrected$ANOVA
-  ez_effects$ges <- round(ez_effects$ges, 5)
-  print("Generalized Eta Squared from ezANOVA:")
-  print(ez_effects[, c("Effect", "ges")])
-}, error = function(e) {
-  print(paste("Error extracting effect sizes from ezANOVA:", e$message))
-})
-
-# Keep the full aov summary at hand for reporting
-anova_summary <- summary(mixed_anova_model)
-
-# =============================================================================
-# ALTERNATIVE: MIXED MODEL USING LMER (FASTER)
-# =============================================================================
-
-# Load required libraries for mixed models
-library(lme4)
-library(lmerTest)
-
-print("Running alternative mixed model using lmer()...")
-start_time_lmer <- Sys.time()
-
-# Mixed model approach (much faster than aov with complex error structures)
-mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN +
-                           (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN),
-                         data = long_data_clean)
-
-end_time_lmer <- Sys.time()
-print(paste("lmer model completed in:",
-            round(as.numeric(difftime(end_time_lmer, start_time_lmer, units = "secs")), 2),
-            "seconds"))
-
-print("Mixed Model Results (lmer):")
-print(summary(mixed_lmer_model))
-
-# ANOVA table for lmer model (lmerTest supplies Satterthwaite df and p-values)
-print("ANOVA Table for Mixed Model:")
-print(anova(mixed_lmer_model))
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(mixed_anova_model, ~ TIME)
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN)
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001)
-print("\n=== TEMPORAL_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO:")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each TIME:")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# TIME × DOMAIN Interaction (Significant: p = 0.012)
-print("\n=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TIME:")
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-
-print("\nSimple Effects of TIME within each DOMAIN:")
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(time_domain_simple2)
-
-# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058)
-print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===")
-temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN)
-print("Estimated Marginal Means:")
-print(temporal_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:")
-temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:")
-temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(temporal_domain_simple2)
-
-# =============================================================================
-# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS
-# =============================================================================
-
-# All pairwise comparisons for the three-way interaction
-print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===")
-three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-print("Estimated Marginal Means for all combinations:")
-print(three_way_emmeans)
-
-# Pairwise comparisons within each TEMPORAL_DO × TIME combination
-print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:")
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni")
-print(three_way_contrasts)
-
c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915121201.r b/.history/eohi1/mixed anova - domain means_20250915121201.r deleted file mode 100644 index 60fdf68..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915121201.r +++ /dev/null @@ -1,774 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting (avoiding pivot_longer issues) -pivot_domain_means <- function(data, domain_mapping) { - # Pre-allocate the result data frame for better performance - n_rows <- nrow(data) * nrow(domain_mapping) - long_data <- data.frame( - pID = character(n_rows), - ResponseId = character(n_rows), - TEMPORAL_DO = character(n_rows), - TIME = character(n_rows), - DOMAIN = character(n_rows), - MEAN_DIFFERENCE = numeric(n_rows), - stringsAsFactors = FALSE - ) - - row_idx <- 1 - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Get the number of rows for this variable - n_data_rows <- nrow(data) - - # Fill in the data for this variable - long_data$pID[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$pID) - long_data$ResponseId[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$ResponseId) - long_data$TEMPORAL_DO[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$TEMPORAL_DO) - long_data$TIME[row_idx:(row_idx + n_data_rows - 1)] <- time_level - long_data$DOMAIN[row_idx:(row_idx + n_data_rows - 
1)] <- domain_level - long_data$MEAN_DIFFERENCE[row_idx:(row_idx + n_data_rows - 1)] <- data[[var_name]] - - row_idx <- row_idx + n_data_rows - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate 
observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 
levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Load required library for sphericity tests -library(ez) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Try different ways to access sphericity corrections -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print(ez_corrected$`Sphericity Corrections`) -} else if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print(ez_corrected$`Sphericity Corrections`) -} else if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print(ez_corrected$`Sphericity Corrections`) -} else { - print("No sphericity corrections found in standard ezANOVA output") - print("This may be because ezANOVA doesn't always display corrections") -} - -# Alternative: Get 
detailed sphericity information -print("\nDetailed Sphericity Information:") -print("Mauchly's Test Results:") -print(ez_corrected$Mauchly) - -# Check if we need to manually apply corrections -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package -library(car) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 
2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Load effectsize package for proper effect size calculations -library(effectsize) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# ALTERNATIVE: MIXED MODEL USING LMER (FASTER) -# ============================================================================= - -# Load required libraries for mixed models -library(lme4) -library(lmerTest) - -print("Running alternative mixed model using lmer()...") -start_time_lmer <- Sys.time() - -# Mixed model approach (much faster than aov with complex error structures) -mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + - (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN), - data = long_data_clean) - -end_time_lmer <- Sys.time() -print(paste("lmer model completed in:", round(end_time_lmer - start_time_lmer, 2), "seconds")) - -print("Mixed Model Results (lmer):") -print(summary(mixed_lmer_model)) - -# ANOVA table for lmer model -print("ANOVA Table for Mixed Model:") -print(anova(mixed_lmer_model)) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") 
-print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") 
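# A note on the by = argument used above: pairs(..., by = "TEMPORAL_DO") slices
# the contrasts within each TEMPORAL_DO level. The same simple-effects layout
# can be requested up front with emmeans' conditioning syntax ("|"); a sketch
# reusing the mixed_anova_model object from this script:
temporal_time_by <- emmeans(mixed_anova_model, ~ TIME | TEMPORAL_DO)
pairs(temporal_time_by, adjust = "bonferroni")  # TIME contrasts within each TEMPORAL_DO level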
-print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915121212.r b/.history/eohi1/mixed anova - domain means_20250915121212.r deleted file mode 100644 index 60fdf68..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915121212.r +++ /dev/null @@ -1,774 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting (avoiding pivot_longer issues) -pivot_domain_means <- function(data, domain_mapping) { - # Pre-allocate the result data frame for better performance - n_rows <- nrow(data) * nrow(domain_mapping) - long_data <- data.frame( - pID = character(n_rows), - ResponseId = character(n_rows), - TEMPORAL_DO = character(n_rows), - TIME = character(n_rows), - DOMAIN = character(n_rows), - MEAN_DIFFERENCE = numeric(n_rows), - stringsAsFactors = FALSE - ) - - row_idx <- 1 - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - 
next - } - - # Get the number of rows for this variable - n_data_rows <- nrow(data) - - # Fill in the data for this variable - long_data$pID[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$pID) - long_data$ResponseId[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$ResponseId) - long_data$TEMPORAL_DO[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$TEMPORAL_DO) - long_data$TIME[row_idx:(row_idx + n_data_rows - 1)] <- time_level - long_data$DOMAIN[row_idx:(row_idx + n_data_rows - 1)] <- domain_level - long_data$MEAN_DIFFERENCE[row_idx:(row_idx + n_data_rows - 1)] <- data[[var_name]] - - row_idx <- row_idx + n_data_rows - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- 
function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - 
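# A quick self-contained sanity check of the bootstrap Hartley routine defined
# above (it uses the tidyverse helpers already loaded in this script): on
# simulated two-group data with equal true variances, the observed F-max ratio
# should rarely exceed the bootstrap critical value. Toy data only; the object
# names here are illustrative.
set.seed(42)
toy <- data.frame(grp = rep(c("A", "B"), each = 50), y = rnorm(100))
toy_res <- bootstrap_hartley_critical(toy, "grp", "y", n_iter = 500)
toy_res$observed_ratio                        # F-max ratio in the toy sample
toy_res$critical_95                           # bootstrap 95th-percentile cutoff
toy_res$observed_ratio > toy_res$critical_95  # expected FALSE under equal variances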
-print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Load required library for sphericity tests -library(ez) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Try 
to read the sphericity corrections element (present only when ezANOVA computed corrections)
-if(!is.null(ez_corrected$`Sphericity Corrections`)) {
-  print(ez_corrected$`Sphericity Corrections`)
-} else {
-  print("No sphericity corrections found in standard ezANOVA output")
-  print("This may be because ezANOVA doesn't always display corrections")
-}
-
-# Alternative: Get detailed sphericity information
-print("\nDetailed Sphericity Information:")
-print("Mauchly's Test Results:")
-print(ez_corrected$Mauchly)
-
-# Check if we need to manually apply corrections
-if(!is.null(ez_corrected$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(ez_corrected$`Sphericity Corrections`)
-} else {
-  print("\nNote: Sphericity corrections may not be displayed if sphericity is met")
-  print("Check the Mauchly's test p-values above to determine if corrections are needed")
-}
-
-# =============================================================================
-# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE
-# =============================================================================
-
-print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===")
-
-# Create wide-format data for the car package
-library(car)
-
-tryCatch({
-  # Convert to wide format for car package
-  wide_data <- long_data_clean %>%
-    select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>%
-    pivot_wider(names_from = c(TIME, DOMAIN),
-                values_from = MEAN_DIFFERENCE,
-                names_sep = "_")
-
-  # Columns of the repeated-measures design
-  within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life",
-                   "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life")
-
-  # Check if all columns exist
-  missing_cols <- within_vars[!within_vars %in% colnames(wide_data)]
-  if(length(missing_cols) > 0) {
-    print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", ")))
-  } else {
-    # Create the repeated measures design matrix
-    rm_design <- as.matrix(wide_data[, within_vars])
-
-    # Calculate epsilon values
-    # (an epsilon() helper is assumed to be available here; car does not appear
-    # to export one, so this call may error and land in the handler below)
-    print("Epsilon Values from car package:")
-    epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser")
-    epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt")
-
-    print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4)))
-    print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4)))
-
-    # Interpretation
-    if(epsilon_gg < 0.75) {
-      print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)")
-    } else if(epsilon_hf > 0.75) {
-      print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)")
-    } else {
-      print("Recommendation: Use Greenhouse-Geisser correction (conservative)")
-    }
-
-    # =============================================================================
-    # MANUAL SPHERICITY CORRECTIONS
-    # =============================================================================
-
-    print("\n=== MANUAL SPHERICITY CORRECTIONS ===")
-
-    # Apply corrections to the original ANOVA results
-    print("Applying Greenhouse-Geisser corrections to DOMAIN effects:")
-
-    # DOMAIN main effect (DFn = 3, DFd = 3183)
-    domain_df_corrected_gg <- 3 * epsilon_gg
-    domain_df_corrected_hf <- 3 * epsilon_hf
-
-    print("DOMAIN: Original df = 3, 3183")
-    print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2)))
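# The note just below about recalculating p-values can be made concrete: under
# a sphericity correction, both F-test degrees of freedom are multiplied by
# epsilon and the p-value is re-read from the F distribution. A minimal sketch;
# f_obs is a hypothetical F statistic, not a value from these results.
f_obs <- 5.0
p_gg  <- pf(f_obs, df1 = 3 * epsilon_gg, df2 = 3183 * epsilon_gg, lower.tail = FALSE)
print(paste("Illustrative GG-corrected p-value:", signif(p_gg, 4)))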
print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Load effectsize package for proper effect size calculations -library(effectsize) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# ALTERNATIVE: MIXED MODEL USING LMER (FASTER) -# ============================================================================= - -# Load required libraries for mixed models -library(lme4) -library(lmerTest) - -print("Running alternative mixed model using lmer()...") -start_time_lmer <- Sys.time() - -# Mixed model approach (much faster than aov with complex error structures) -mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + - (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN), - data = long_data_clean) - -end_time_lmer <- Sys.time() -print(paste("lmer model completed in:", round(end_time_lmer - start_time_lmer, 2), "seconds")) - -print("Mixed Model Results (lmer):") -print(summary(mixed_lmer_model)) - -# ANOVA table for lmer model -print("ANOVA Table for Mixed Model:") -print(anova(mixed_lmer_model)) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of 
TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) 
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915121317.r b/.history/eohi1/mixed anova - domain means_20250915121317.r deleted file mode 100644 index 60fdf68..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915121317.r +++ /dev/null @@ -1,774 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# More efficient data pivoting (avoiding pivot_longer issues) -pivot_domain_means <- function(data, domain_mapping) { - # Pre-allocate the result data frame for better performance - n_rows <- nrow(data) * 
nrow(domain_mapping) - long_data <- data.frame( - pID = character(n_rows), - ResponseId = character(n_rows), - TEMPORAL_DO = character(n_rows), - TIME = character(n_rows), - DOMAIN = character(n_rows), - MEAN_DIFFERENCE = numeric(n_rows), - stringsAsFactors = FALSE - ) - - row_idx <- 1 - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Get the number of rows for this variable - n_data_rows <- nrow(data) - - # Fill in the data for this variable - long_data$pID[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$pID) - long_data$ResponseId[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$ResponseId) - long_data$TEMPORAL_DO[row_idx:(row_idx + n_data_rows - 1)] <- as.character(data$TEMPORAL_DO) - long_data$TIME[row_idx:(row_idx + n_data_rows - 1)] <- time_level - long_data$DOMAIN[row_idx:(row_idx + n_data_rows - 1)] <- domain_level - long_data$MEAN_DIFFERENCE[row_idx:(row_idx + n_data_rows - 1)] <- data[[var_name]] - - row_idx <- row_idx + n_data_rows - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for 
assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each 
TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number 
of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Load required library for sphericity tests -library(ez) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = 
TEMPORAL_DO,
-                        within = .(TIME, DOMAIN),
-                        type = 3,
-                        detailed = TRUE)
-
-print("Corrected ANOVA Results with Sphericity Corrections:")
-print(ez_corrected$ANOVA)
-
-# Show epsilon values for sphericity corrections
-print("\nEpsilon Values for Sphericity Corrections:")
-print(ez_corrected$Mauchly)
-
-# Show sphericity-corrected results
-print("\nSphericity-Corrected Results:")
-print("Available elements in ez_corrected object:")
-print(names(ez_corrected))
-
-# Try to read the sphericity corrections element (present only when ezANOVA computed corrections)
-if(!is.null(ez_corrected$`Sphericity Corrections`)) {
-  print(ez_corrected$`Sphericity Corrections`)
-} else {
-  print("No sphericity corrections found in standard ezANOVA output")
-  print("This may be because ezANOVA doesn't always display corrections")
-}
-
-# Alternative: Get detailed sphericity information
-print("\nDetailed Sphericity Information:")
-print("Mauchly's Test Results:")
-print(ez_corrected$Mauchly)
-
-# Check if we need to manually apply corrections
-if(!is.null(ez_corrected$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(ez_corrected$`Sphericity Corrections`)
-} else {
-  print("\nNote: Sphericity corrections may not be displayed if sphericity is met")
-  print("Check the Mauchly's test p-values above to determine if corrections are needed")
-}
-
-# =============================================================================
-# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE
-# =============================================================================
-
-print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===")
-
-# Create wide-format data for the car package
-library(car)
-
-tryCatch({
-  # Convert to wide format for car package
-  wide_data <- long_data_clean %>%
-    select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>%
-    pivot_wider(names_from = c(TIME, DOMAIN),
-                values_from = MEAN_DIFFERENCE,
-                names_sep = "_")
-
-  # Columns of the repeated-measures design
-  within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life",
-                   "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life")
-
-  # Check if all columns exist
-  missing_cols <- within_vars[!within_vars %in% colnames(wide_data)]
-  if(length(missing_cols) > 0) {
-    print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", ")))
-  } else {
-    # Create the repeated measures design matrix
-    rm_design <- as.matrix(wide_data[, within_vars])
-
-    # Calculate epsilon values
-    # (an epsilon() helper is assumed to be available here; car does not appear
-    # to export one, so this call may error and land in the handler below)
-    print("Epsilon Values from car package:")
-    epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser")
-    epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt")
-
-    print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4)))
-    print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4)))
-
-    # Interpretation
-    if(epsilon_gg < 0.75) {
-      print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)")
-    } else if(epsilon_hf > 0.75) {
-      print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)")
-    } else {
-      print("Recommendation: Use Greenhouse-Geisser correction (conservative)")
-    }
-
-    # =============================================================================
-    # MANUAL SPHERICITY CORRECTIONS
-    # =============================================================================
-
-    print("\n=== 
MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Load effectsize package for proper effect size calculations -library(effectsize) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# ALTERNATIVE: MIXED MODEL USING LMER (FASTER) -# ============================================================================= - -# Load required libraries for mixed models -library(lme4) -library(lmerTest) - -print("Running alternative mixed model using lmer()...") -start_time_lmer <- Sys.time() - -# Mixed model approach (much faster than aov with complex error structures) -mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + - (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN), - data = long_data_clean) - -end_time_lmer <- Sys.time() -print(paste("lmer model completed in:", round(end_time_lmer - start_time_lmer, 2), "seconds")) - -print("Mixed Model Results 
(lmer):") -print(summary(mixed_lmer_model)) - -# ANOVA table for lmer model -print("ANOVA Table for Mixed Model:") -print(anova(mixed_lmer_model)) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all 
combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915122220.r b/.history/eohi1/mixed anova - domain means_20250915122220.r deleted file mode 100644 index 0162fb8..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915122220.r +++ /dev/null @@ -1,736 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - 
"NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Remove any rows with missing values - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- 
function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - 
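# --- Editorial sketch (not from the original snapshot): a compact balance
# check, assuming `long_data_clean` as constructed above. Each participant
# should contribute exactly one observation per TIME x DOMAIN cell (8 rows
# each) for the Error(pID/(TIME * DOMAIN)) stratification to be trustworthy.
library(dplyr)

cell_counts <- long_data_clean %>%
  count(pID, TIME, DOMAIN, name = "n_obs")

if (all(cell_counts$n_obs == 1)) {
  print("Design fully crossed: one observation per pID x TIME x DOMAIN cell")
} else {
  print("Unbalanced cells (duplicated or missing measurements):")
  print(filter(cell_counts, n_obs != 1))
}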
-print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Load required library for sphericity tests -library(ez) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Try 
accessing the sphericity corrections returned by ezANOVA -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print(ez_corrected$`Sphericity Corrections`) -} else { - print("No sphericity corrections found in standard ezANOVA output") - print("This may be because ezANOVA doesn't always display corrections") -} - -# Alternative: Get detailed sphericity information -print("\nDetailed Sphericity Information:") -print("Mauchly's Test Results:") -print(ez_corrected$Mauchly) - -# Check if we need to manually apply corrections -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package -library(car) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - 
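# --- Editorial sketch (not from the original snapshot): car does not export
# an epsilon() helper, so the calls above will fall through to the tryCatch
# error handler. A route using car's documented repeated-measures interface,
# assuming `rm_design` from above (columns ordered DOMAIN-fastest within TIME):
idata <- expand.grid(
  DOMAIN = c("Preferences", "Personality", "Values", "Life"),
  TIME   = c("Past", "Future")
)
mlm <- lm(rm_design ~ 1)  # within-subjects structure only, for brevity
aov_mlm <- car::Anova(mlm, idata = idata, idesign = ~ TIME * DOMAIN, type = 3)
# summary() reports Mauchly tests plus GG/HF epsilons and corrected p-values
print(summary(aov_mlm, multivariate = FALSE))

# Hand-recomputing a GG-corrected p-value, assuming `epsilon_gg` is available;
# F_obs is a hypothetical uncorrected F statistic, not a reported result.
F_obs <- 4.20
p_gg <- pf(F_obs, 3 * epsilon_gg, 3183 * epsilon_gg, lower.tail = FALSE)
print(paste("GG-corrected p (hypothetical F):", round(p_gg, 5)))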
print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Load effectsize package for proper effect size calculations -library(effectsize) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# ALTERNATIVE: MIXED MODEL USING LMER (FASTER) -# ============================================================================= - -# Load required libraries for mixed models -library(lme4) -library(lmerTest) - -print("Running alternative mixed model using lmer()...") -start_time_lmer <- Sys.time() - -# Mixed model approach (much faster than aov with complex error structures) -mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + - (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN), - data = long_data_clean) - -end_time_lmer <- Sys.time() -print(paste("lmer model completed in:", round(end_time_lmer - start_time_lmer, 2), "seconds")) - -print("Mixed Model Results (lmer):") -print(summary(mixed_lmer_model)) - -# ANOVA table for lmer model -print("ANOVA Table for Mixed Model:") -print(anova(mixed_lmer_model)) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of 
TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) 
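# --- Editorial sketch (not from the original snapshot): what the
# adjust = "bonferroni" argument in pairs() amounts to, with toy numbers.
# The raw p-value of each contrast is multiplied by the number of tests
# in the family and capped at 1; the values below are illustrative only.
p_raw   <- 0.02          # hypothetical unadjusted p-value for one contrast
n_tests <- choose(4, 2)  # 6 pairwise contrasts among the 4 DOMAIN levels
p_bonf  <- min(1, p_raw * n_tests)
print(paste("Bonferroni-adjusted p:", p_bonf))  # 0.12 with these toy numbers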
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915122231.r b/.history/eohi1/mixed anova - domain means_20250915122231.r deleted file mode 100644 index 88698a6..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915122231.r +++ /dev/null @@ -1,719 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = 
"MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Remove any rows with missing values - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate 
observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 
levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Load required library for sphericity tests -library(ez) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# 
============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package -library(car) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Load effectsize package for proper effect size calculations 
-library(effectsize) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# ALTERNATIVE: MIXED MODEL USING LMER (FASTER) -# ============================================================================= - -# Load required libraries for mixed models -library(lme4) -library(lmerTest) - -print("Running alternative mixed model using lmer()...") -start_time_lmer <- Sys.time() - -# Mixed model approach (much faster than aov with complex error structures) -mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + - (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN), - data = long_data_clean) - -end_time_lmer <- Sys.time() -print(paste("lmer model completed in:", round(end_time_lmer - start_time_lmer, 2), "seconds")) - -print("Mixed Model Results (lmer):") -print(summary(mixed_lmer_model)) - -# ANOVA table for lmer model -print("ANOVA Table for Mixed Model:") -print(anova(mixed_lmer_model)) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - 
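# --- Editorial sketch (not from the original snapshot): a visual companion
# to the simple-effects tables, assuming `mixed_anova_model` from above.
# emmip() plots the estimated marginal means, which often makes the
# TEMPORAL_DO x TIME pattern easier to read than the contrast tables alone.
library(emmeans)
emmip(mixed_anova_model, TEMPORAL_DO ~ TIME, CIs = TRUE)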
-print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Two-way interactions -print("\nTIME × DOMAIN Interaction:") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -time_domain_contrasts <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_contrasts) - -print("\nTEMPORAL_DO × TIME Interaction:") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -temporal_time_contrasts <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_contrasts) - -print("\nTEMPORAL_DO × DOMAIN Interaction:") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -temporal_domain_contrasts <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_contrasts) - -# Three-way interaction -print("\nTEMPORAL_DO × TIME × DOMAIN Interaction:") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915122243.r 
b/.history/eohi1/mixed anova - domain means_20250915122243.r deleted file mode 100644 index 1c32062..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915122243.r +++ /dev/null @@ -1,691 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Remove any rows with missing values - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 
0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample 
<- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Load required 
library for sphericity tests -library(ez) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package -library(car) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- 
c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Load effectsize package for proper effect size calculations -library(effectsize) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from 
ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# ALTERNATIVE: MIXED MODEL USING LMER (FASTER) -# ============================================================================= - -# Load required libraries for mixed models -library(lme4) -library(lmerTest) - -print("Running alternative mixed model using lmer()...") -start_time_lmer <- Sys.time() - -# Mixed model approach (much faster than aov with complex error structures) -mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + - (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN), - data = long_data_clean) - -end_time_lmer <- Sys.time() -print(paste("lmer model completed in:", round(end_time_lmer - start_time_lmer, 2), "seconds")) - -print("Mixed Model Results (lmer):") -print(summary(mixed_lmer_model)) - -# ANOVA table for lmer model -print("ANOVA Table for Mixed Model:") -print(anova(mixed_lmer_model)) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = 
"TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - diff --git a/.history/eohi1/mixed anova - domain means_20250915122255.r b/.history/eohi1/mixed anova - domain means_20250915122255.r deleted file mode 100644 index 834ff6c..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915122255.r +++ /dev/null @@ -1,750 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", 
"NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Remove any rows with missing values - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- 
function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - 
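# A minimal, self-contained sketch of the bootstrap logic implemented by
# bootstrap_hartley_critical() above, on simulated two-group data. All names
# here (g1, g2, boot_ratios) are illustrative, not from the original script.
# Each group is resampled with replacement, the max/min variance ratio is
# recomputed per draw, and the 95th percentile of those ratios serves as the
# critical value against which the observed F-max ratio is compared.
set.seed(123)
g1 <- rnorm(100, sd = 1.0)   # hypothetical group 1
g2 <- rnorm(100, sd = 1.3)   # hypothetical group 2
observed <- max(var(g1), var(g2)) / min(var(g1), var(g2))
boot_ratios <- replicate(1000, {
  v1 <- var(sample(g1, replace = TRUE))  # resample group 1, recompute variance
  v2 <- var(sample(g2, replace = TRUE))  # resample group 2, recompute variance
  max(v1, v2) / min(v1, v2)
})
critical_95 <- quantile(boot_ratios, 0.95)
# Flag heterogeneity if the observed ratio exceeds the bootstrap critical value
observed > critical_95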
-print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Load required library for sphericity tests -library(ez) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# 
Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package -library(car) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to 
recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Load effectsize package for proper effect size calculations -library(effectsize) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# ALTERNATIVE: MIXED MODEL USING LMER (FASTER) -# ============================================================================= - -# Load required libraries for mixed models -library(lme4) -library(lmerTest) - -print("Running alternative mixed model using lmer()...") -start_time_lmer <- Sys.time() - -# Mixed model approach (much faster than aov with complex error structures) -mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + - (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN), - data = long_data_clean) - -end_time_lmer <- Sys.time() -print(paste("lmer model completed in:", round(end_time_lmer - start_time_lmer, 2), "seconds")) - -print("Mixed Model Results (lmer):") -print(summary(mixed_lmer_model)) - -# ANOVA table for lmer model -print("ANOVA Table for Mixed Model:") -print(anova(mixed_lmer_model)) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# 
============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS -# ============================================================================= - -# Load effsize package for Cohen's d calculations -library(effsize) - -print("\n=== COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS ===") - -# Extract significant comparisons from three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_pairs <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - 
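# Note on the parsing step below: it assumes emmeans contrast labels of the
# form "LevelA - LevelB" (e.g. "Preferences - Personality"), which is the
# default labelling produced by pairs(), so splitting on " - " recovers the
# two DOMAIN levels being compared. If a factor level itself ever contained
# " - ", this split would need a stricter pattern.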
contrast_name <- as.character(comparison$contrast) - temporal_do <- as.character(comparison$TEMPORAL_DO) - time <- as.character(comparison$TIME) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - domain1 <- trimws(contrast_parts[1]) - domain2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - data1 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain1] - - data2 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain2] - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s | %s, %s\n", contrast_name, temporal_do, time)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } -} else { - cat("No significant pairwise comparisons found in three-way interaction.\n") -} - diff --git a/.history/eohi1/mixed anova - domain means_20250915122301.r b/.history/eohi1/mixed anova - domain means_20250915122301.r deleted file mode 100644 index 094308f..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915122301.r +++ /dev/null @@ -1,754 +0,0 @@ -# mixed anova not working -# 12/09/2025 -# add sum contrasts - -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(lme4) # For mixed models -library(lmerTest) # For mixed model significance tests - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = 
rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Remove any rows with missing values - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- 
function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - 
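-# design_balance is a three-way contingency table of observation counts; in a
-# fully balanced design every pID × TIME × DOMAIN cell equals exactly 1, so a
-# quick check such as all(design_balance == 1) flags duplicated or missing cells.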
-print(summary(as.vector(design_balance)))
-
-# Check for any participants with missing combinations
-missing_combos <- long_data_clean %>%
-  group_by(pID) %>%
-  summarise(
-    n_combinations = n(),
-    expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8
-    missing_combinations = 8 - n_combinations,
-    .groups = 'drop'
-  )
-
-print("Missing combinations per participant:")
-print(missing_combos[missing_combos$missing_combinations > 0, ])
-
-# Mixed ANOVA using aov()
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-
-mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                         data = long_data_clean)
-
-print("Mixed ANOVA Results:")
-print(summary(mixed_anova_model))
-
-# =============================================================================
-# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS
-# =============================================================================
-
-# Load required library for sphericity tests
-library(ez)
-
-print("\n=== SPHERICITY TESTS ===")
-
-# Test sphericity for DOMAIN (4 levels - within-subjects)
-print("Mauchly's Test of Sphericity for DOMAIN:")
-tryCatch({
-  # Create a temporary data frame for ezANOVA
-  temp_data <- long_data_clean
-  temp_data$id <- as.numeric(as.factor(temp_data$pID))
-
-  # Run ezANOVA to get sphericity tests
-  ez_domain <- ezANOVA(data = temp_data,
-                       dv = MEAN_DIFFERENCE,
-                       wid = id,
-                       between = TEMPORAL_DO,
-                       within = DOMAIN,
-                       type = 3,
-                       detailed = TRUE)
-
-  print("DOMAIN Sphericity Test:")
-  print(ez_domain$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in DOMAIN sphericity test:", e$message))
-})
-
-# Test sphericity for TIME (2 levels - within-subjects)
-print("\nMauchly's Test of Sphericity for TIME:")
-tryCatch({
-  ez_time <- ezANOVA(data = temp_data,
-                     dv = MEAN_DIFFERENCE,
-                     wid = id,
-                     between = TEMPORAL_DO,
-                     within = TIME,
-                     type = 3,
-                     detailed = TRUE)
-
-  print("TIME Sphericity Test:")
-  print(ez_time$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in TIME sphericity test:", e$message))
-})
-
-# Test sphericity for TIME × DOMAIN interaction
-print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:")
-tryCatch({
-  ez_interaction <- ezANOVA(data = temp_data,
-                            dv = MEAN_DIFFERENCE,
-                            wid = id,
-                            between = TEMPORAL_DO,
-                            within = .(TIME, DOMAIN),
-                            type = 3,
-                            detailed = TRUE)
-
-  print("TIME × DOMAIN Sphericity Test:")
-  print(ez_interaction$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in TIME × DOMAIN sphericity test:", e$message))
-})
-
-# =============================================================================
-# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Get corrected results from ezANOVA
-ez_corrected <- ezANOVA(data = long_data_clean,
-                        dv = MEAN_DIFFERENCE,
-                        wid = pID,
-                        between = TEMPORAL_DO,
-                        within = .(TIME, DOMAIN),
-                        type = 3,
-                        detailed = TRUE)
-
-print("Corrected ANOVA Results with Sphericity Corrections:")
-print(ez_corrected$ANOVA)
-
-# Show epsilon values for sphericity corrections
-print("\nEpsilon Values for Sphericity Corrections:")
-print(ez_corrected$Mauchly)
-
-# Show sphericity-corrected results
-print("\nSphericity-Corrected Results:")
-print("Available elements in ez_corrected object:")
-print(names(ez_corrected))
-
-# Check if sphericity corrections are available
-if(!is.null(ez_corrected$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(ez_corrected$`Sphericity Corrections`)
-} else {
-  print("\nNote: Sphericity corrections may not be displayed if sphericity is met")
-  print("Check the Mauchly's test p-values above to determine if corrections are needed")
-}
-
-# =============================================================================
-# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE
-# =============================================================================
-
-print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===")
-
-# Create a wide-format data for car package
-library(car)
-
-tryCatch({
-  # Convert to wide format for car package
-  wide_data <- long_data_clean %>%
-    select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>%
-    pivot_wider(names_from = c(TIME, DOMAIN),
-                values_from = MEAN_DIFFERENCE,
-                names_sep = "_")
-
-  # Create the repeated measures design
-  within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life",
-                   "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life")
-
-  # Check if all columns exist
-  missing_cols <- within_vars[!within_vars %in% colnames(wide_data)]
-  if(length(missing_cols) > 0) {
-    print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", ")))
-  } else {
-    # Create the repeated measures design
-    rm_design <- as.matrix(wide_data[, within_vars])
-
-    # Calculate epsilon values
-    print("Epsilon Values from car package:")
-    epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser")
-    epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt")
-
-    print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4)))
-    print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4)))
-
-    # Interpretation
-    if(epsilon_gg < 0.75) {
-      print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)")
-    } else if(epsilon_hf > 0.75) {
-      print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)")
-    } else {
-      print("Recommendation: Use Greenhouse-Geisser correction (conservative)")
-    }
-
-    # =============================================================================
-    # MANUAL SPHERICITY CORRECTIONS
-    # =============================================================================
-
-    print("\n=== MANUAL SPHERICITY CORRECTIONS ===")
-
-    # Apply corrections to the original ANOVA results
-    print("Applying Greenhouse-Geisser corrections to DOMAIN effects:")
-
-    # DOMAIN main effect (DFn = 3, DFd = 3183)
-    domain_df_corrected_gg <- 3 * epsilon_gg
-    domain_df_corrected_hf <- 3 * epsilon_hf
-
-    print(paste("DOMAIN: Original df = 3, 3183"))
-    print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2)))
-    print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2)))
-
-    # TIME × DOMAIN interaction (DFn = 3, DFd = 3183)
-    interaction_df_corrected_gg <- 3 * epsilon_gg
-    interaction_df_corrected_hf <- 3 * epsilon_hf
-
-    print(paste("TIME × DOMAIN: Original df = 3, 3183"))
-    print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2)))
-    print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2)))
-
-    # Note: You would need to recalculate p-values with these corrected dfs
-    print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom")
-    print("The ezANOVA function should handle this automatically, but may not display the corrections")
-  }
-
-}, error = function(e) {
-  print(paste("Error in manual epsilon calculation:", e$message))
-})
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-# Load effectsize package for proper effect size calculations
-library(effectsize)
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Calculate generalized eta squared for the aov model
-print("Effect Sizes from aov() model:")
-tryCatch({
-  # Extract effect sizes from aov model
-  aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE)
-  print(round(aov_effects, 5))
-}, error = function(e) {
-  print(paste("Error calculating effect sizes from aov:", e$message))
-})
-
-# Calculate effect sizes for ezANOVA model
-print("\nEffect Sizes from ezANOVA model:")
-tryCatch({
-  # ezANOVA provides partial eta squared, convert to generalized
-  ez_effects <- ez_corrected$ANOVA
-  ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared
-  print("Generalized Eta Squared from ezANOVA:")
-  print(round(ez_effects[, c("Effect", "ges")], 5))
-}, error = function(e) {
-  print(paste("Error extracting effect sizes from ezANOVA:", e$message))
-})
-
-# Extract effect sizes (generalized eta squared)
-# For aov() objects, we need to extract from the summary
-anova_summary <- summary(mixed_anova_model)
-
-# =============================================================================
-# ALTERNATIVE: MIXED MODEL USING LMER (FASTER)
-# =============================================================================
-
-# Load required libraries for mixed models
-library(lme4)
-library(lmerTest)
-
-print("Running alternative mixed model using lmer()...")
-start_time_lmer <- Sys.time()
-
-# Mixed model approach (much faster than aov with complex error structures)
-mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN +
-                           (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN),
-                         data = long_data_clean)
-
-end_time_lmer <- Sys.time()
-print(paste("lmer model completed in:", round(end_time_lmer - start_time_lmer, 2), "seconds"))
-
-print("Mixed Model Results (lmer):")
-print(summary(mixed_lmer_model))
-
-# ANOVA table for lmer model
-print("ANOVA Table for Mixed Model:")
-print(anova(mixed_lmer_model))
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(mixed_anova_model, ~ TIME)
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN)
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001)
-print("\n=== TEMPORAL_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO:")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each TIME:")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# TIME × DOMAIN Interaction (Significant: p = 0.012)
-print("\n=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TIME:")
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-
-print("\nSimple Effects of TIME within each DOMAIN:")
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(time_domain_simple2)
-
-# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058)
-print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===")
-temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN)
-print("Estimated Marginal Means:")
-print(temporal_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:")
-temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:")
-temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(temporal_domain_simple2)
-
-# =============================================================================
-# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS
-# =============================================================================
-
-# All pairwise comparisons for the three-way interaction
-print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===")
-three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-print("Estimated Marginal Means for all combinations:")
-print(three_way_emmeans)
-
-# Pairwise comparisons within each TEMPORAL_DO × TIME combination
-print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:")
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni")
-print(three_way_contrasts)
-
-# =============================================================================
-# COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS
-# =============================================================================
-
-# Load effsize package for Cohen's d calculations
-library(effsize)
-
-print("\n=== COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS ===")
-
-# Extract significant comparisons from three-way interaction
-three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-significant_pairs <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ]
-
-if(nrow(significant_pairs) > 0) {
-  cat("Significant pairwise comparisons (p < 0.05):\n")
-  print(significant_pairs)
-
-  cat("\nCohen's d calculated from raw data:\n")
-
-  for(i in seq_len(nrow(significant_pairs))) {
-    comparison <- significant_pairs[i, ]
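-    # Each row of significant_pairs is one Bonferroni-adjusted DOMAIN contrast
-    # within a single TEMPORAL_DO × TIME cell; the fields read below
-    # (contrast, TEMPORAL_DO, TIME, p.value) come from as.data.frame(three_way_contrasts)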
-    contrast_name <- as.character(comparison$contrast)
-    temporal_do <- as.character(comparison$TEMPORAL_DO)
-    time <- as.character(comparison$TIME)
-
-    # Parse the contrast
-    contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-    if(length(contrast_parts) == 2) {
-      domain1 <- trimws(contrast_parts[1])
-      domain2 <- trimws(contrast_parts[2])
-
-      # Get raw data for both conditions
-      data1 <- long_data_clean$MEAN_DIFFERENCE[
-        long_data_clean$TEMPORAL_DO == temporal_do &
-        long_data_clean$TIME == time &
-        long_data_clean$DOMAIN == domain1]
-
-      data2 <- long_data_clean$MEAN_DIFFERENCE[
-        long_data_clean$TEMPORAL_DO == temporal_do &
-        long_data_clean$TIME == time &
-        long_data_clean$DOMAIN == domain2]
-
-      if(length(data1) > 0 && length(data2) > 0) {
-        # Calculate Cohen's d using effsize package
-        cohens_d_result <- cohen.d(data1, data2)
-
-        cat(sprintf("Comparison: %s | %s, %s\n", contrast_name, temporal_do, time))
-        cat(sprintf("  n1 = %d, n2 = %d\n", length(data1), length(data2)))
-        cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-        cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-        cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-        cat("\n")
-      }
-    }
-  }
-} else {
-  cat("No significant pairwise comparisons found in three-way interaction.\n")
-}
-
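A note on the manual-correction printout in the deleted script above: once an epsilon is in hand, a sphericity-corrected p-value can be recomputed directly by shrinking both degrees of freedom before evaluating the F distribution. A minimal sketch, assuming a hypothetical observed statistic f_obs for the DOMAIN effect together with the epsilon_gg value the script computes (f_obs is not defined in any snapshot):

p_gg <- pf(f_obs, df1 = 3 * epsilon_gg, df2 = 3183 * epsilon_gg, lower.tail = FALSE)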
domain1] - - data2 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain2] - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s | %s, %s\n", contrast_name, temporal_do, time)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } -} else { - cat("No significant pairwise comparisons found in three-way interaction.\n") -} - diff --git a/.history/eohi1/mixed anova - domain means_20250915122307.r b/.history/eohi1/mixed anova - domain means_20250915122307.r deleted file mode 100644 index 914f788..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915122307.r +++ /dev/null @@ -1,748 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(lme4) # For mixed models -library(lmerTest) # For mixed model significance tests - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = 
factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Remove any rows with missing values - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample 
<- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests 
-# =============================================================================
-# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS
-# =============================================================================
-
-# Sphericity tests using ezANOVA (library already loaded)
-
-cat("\n=== SPHERICITY TESTS ===\n")
-
-# Test sphericity for DOMAIN (4 levels - within-subjects)
-print("Mauchly's Test of Sphericity for DOMAIN:")
-tryCatch({
-  # Create a temporary data frame for ezANOVA
-  temp_data <- long_data_clean
-  temp_data$id <- as.numeric(as.factor(temp_data$pID))
-
-  # Run ezANOVA to get sphericity tests
-  ez_domain <- ezANOVA(data = temp_data,
-                       dv = MEAN_DIFFERENCE,
-                       wid = id,
-                       between = TEMPORAL_DO,
-                       within = DOMAIN,
-                       type = 3,
-                       detailed = TRUE)
-
-  print("DOMAIN Sphericity Test:")
-  print(ez_domain$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in DOMAIN sphericity test:", e$message))
-})
-
-# Test sphericity for TIME (2 levels - within-subjects)
-# With only 2 levels there is a single difference variance, so sphericity holds
-# trivially and ezANOVA returns no Mauchly table for TIME alone
-cat("\nMauchly's Test of Sphericity for TIME:\n")
-tryCatch({
-  # temp_data persists from the DOMAIN block above
-  ez_time <- ezANOVA(data = temp_data,
-                     dv = MEAN_DIFFERENCE,
-                     wid = id,
-                     between = TEMPORAL_DO,
-                     within = TIME,
-                     type = 3,
-                     detailed = TRUE)
-
-  print("TIME Sphericity Test:")
-  print(ez_time$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in TIME sphericity test:", e$message))
-})
-
-# Test sphericity for TIME × DOMAIN interaction
-cat("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:\n")
-tryCatch({
-  ez_interaction <- ezANOVA(data = temp_data,
-                            dv = MEAN_DIFFERENCE,
-                            wid = id,
-                            between = TEMPORAL_DO,
-                            within = .(TIME, DOMAIN),
-                            type = 3,
-                            detailed = TRUE)
-
-  print("TIME × DOMAIN Sphericity Test:")
-  print(ez_interaction$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in TIME × DOMAIN sphericity test:", e$message))
-})
-
-# =============================================================================
-# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-cat("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===\n")
-
-# Get corrected results from ezANOVA
-ez_corrected <- ezANOVA(data = long_data_clean,
-                        dv = MEAN_DIFFERENCE,
-                        wid = pID,
-                        between = TEMPORAL_DO,
-                        within = .(TIME, DOMAIN),
-                        type = 3,
-                        detailed = TRUE)
-
-print("Corrected ANOVA Results with Sphericity Corrections:")
-print(ez_corrected$ANOVA)
-
-# Show Mauchly's test output (the GG/HF epsilon values themselves live in the
-# 'Sphericity Corrections' element, not here)
-cat("\nMauchly's Test of Sphericity:\n")
-print(ez_corrected$Mauchly)
-
-# Show sphericity-corrected results
-cat("\nSphericity-Corrected Results:\n")
-print("Available elements in ez_corrected object:")
-print(names(ez_corrected))
-
-# Check if sphericity corrections are available
-if(!is.null(ez_corrected$`Sphericity Corrections`)) {
-  cat("\nGreenhouse-Geisser and Huynh-Feldt Corrections:\n")
-  print(ez_corrected$`Sphericity Corrections`)
-} else {
-  cat("\nNote: Sphericity corrections may not be displayed if sphericity is met\n")
-  print("Check the Mauchly's test p-values above to determine if corrections are needed")
-}
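# The section below assumes an epsilon() helper that, as far as we know, car
# does not export. A minimal manual sketch of the standard Greenhouse-Geisser
# estimator (from the sample covariance of the k repeated measures) and the
# single-group Huynh-Feldt adjustment; for designs with a between-subjects
# factor the HF denominator uses n minus the number of groups rather than n - 1.
gg_epsilon <- function(rm_matrix) {
  k <- ncol(rm_matrix)
  C <- t(contr.helmert(k))      # (k-1) x k contrasts orthogonal to the mean
  C <- C / sqrt(rowSums(C^2))   # orthonormalize the rows
  D <- C %*% cov(rm_matrix, use = "complete.obs") %*% t(C)
  sum(diag(D))^2 / ((k - 1) * sum(D^2))
}
hf_epsilon <- function(rm_matrix) {
  n <- nrow(rm_matrix); k <- ncol(rm_matrix)
  e <- gg_epsilon(rm_matrix)
  min(1, (n * (k - 1) * e - 2) / ((k - 1) * (n - 1 - (k - 1) * e)))
}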
c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Load effectsize package for proper effect size calculations -library(effectsize) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from 
ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# ALTERNATIVE: MIXED MODEL USING LMER (FASTER) -# ============================================================================= - -# Load required libraries for mixed models -library(lme4) -library(lmerTest) - -print("Running alternative mixed model using lmer()...") -start_time_lmer <- Sys.time() - -# Mixed model approach (much faster than aov with complex error structures) -mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + - (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN), - data = long_data_clean) - -end_time_lmer <- Sys.time() -print(paste("lmer model completed in:", round(end_time_lmer - start_time_lmer, 2), "seconds")) - -print("Mixed Model Results (lmer):") -print(summary(mixed_lmer_model)) - -# ANOVA table for lmer model -print("ANOVA Table for Mixed Model:") -print(anova(mixed_lmer_model)) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = 
"TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS -# ============================================================================= - -# Load effsize package for Cohen's d calculations -library(effsize) - -print("\n=== COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS ===") - -# Extract significant comparisons from three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_pairs <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - temporal_do <- as.character(comparison$TEMPORAL_DO) - time <- as.character(comparison$TIME) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - domain1 <- trimws(contrast_parts[1]) - domain2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - data1 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain1] - - data2 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain2] - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s | %s, %s\n", contrast_name, temporal_do, time)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - 
cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } -} else { - cat("No significant pairwise comparisons found in three-way interaction.\n") -} - diff --git a/.history/eohi1/mixed anova - domain means_20250915122312.r b/.history/eohi1/mixed anova - domain means_20250915122312.r deleted file mode 100644 index 02e6f43..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915122312.r +++ /dev/null @@ -1,747 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(lme4) # For mixed models -library(lmerTest) # For mixed model significance tests - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Remove any rows with missing values - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# 
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME and DOMAIN
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5),
-    min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME and DOMAIN:")
-print(desc_stats)
-
-# Descriptive statistics by between-subjects factors
-desc_stats_by_temporal <- long_data %>%
-  group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:")
-print(desc_stats_by_temporal)
-
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing
-long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ]
-print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x ")))
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(MEAN_DIFFERENCE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME and DOMAIN:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test (streamlined)
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic,
-    ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# Round only the numeric columns
-normality_results_rounded <- normality_results %>%
-  mutate(across(where(is.numeric), ~ round(.x, 5)))
-print(normality_results_rounded)
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-# First, let's check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
-cat("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===\n")
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # Calculate F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# More efficient bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    group_by(!!sym(group_var)) %>%
-    summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once, extracted with [[ ]] so each element is a plain
-  # numeric vector rather than a one-column tibble from pick()
-  group_data_list <- map(groups, ~ {
-    group_data <- data[[response_var]][data[[group_var]] == .x]
-    group_data[!is.na(group_data)]
-  })
<- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests 
using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- 
c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # 
ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# ALTERNATIVE: MIXED MODEL USING LMER (FASTER) -# ============================================================================= - -# Load required libraries for mixed models -library(lme4) -library(lmerTest) - -print("Running alternative mixed model using lmer()...") -start_time_lmer <- Sys.time() - -# Mixed model approach (much faster than aov with complex error structures) -mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + - (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN), - data = long_data_clean) - -end_time_lmer <- Sys.time() -print(paste("lmer model completed in:", round(end_time_lmer - start_time_lmer, 2), "seconds")) - -print("Mixed Model Results (lmer):") -print(summary(mixed_lmer_model)) - -# ANOVA table for lmer model -print("ANOVA Table for Mixed Model:") -print(anova(mixed_lmer_model)) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") 
-print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS -# ============================================================================= - -# Load effsize package for Cohen's d calculations -library(effsize) - -print("\n=== COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS ===") - -# Extract significant comparisons from three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_pairs <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - temporal_do <- as.character(comparison$TEMPORAL_DO) - time <- as.character(comparison$TIME) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - domain1 <- trimws(contrast_parts[1]) - domain2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - data1 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain1] - - data2 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain2] - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s | %s, %s\n", contrast_name, temporal_do, time)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size 
interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } -} else { - cat("No significant pairwise comparisons found in three-way interaction.\n") -} - diff --git a/.history/eohi1/mixed anova - domain means_20250915122315.r b/.history/eohi1/mixed anova - domain means_20250915122315.r deleted file mode 100644 index b6eca40..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915122315.r +++ /dev/null @@ -1,745 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(lme4) # For mixed models -library(lmerTest) # For mixed model significance tests - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Remove any rows with missing values - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# 
============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample 
<- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests 
using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- 
c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # 
ezANOVA reports generalized eta squared (ges) directly, so no conversion from partial eta squared is needed - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- round(ez_effects$ges, 5) # round only the numeric column; round() on the data frame would fail on the character Effect column - print("Generalized Eta Squared from ezANOVA:") - print(ez_effects[, c("Effect", "ges")]) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Keep the full aov() summary on hand; the per-stratum sums of squares needed for hand-computed effect sizes live here -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# ALTERNATIVE: MIXED MODEL USING LMER (FASTER) -# ============================================================================= - -# Mixed models (libraries already loaded) - -print("Running alternative mixed model using lmer()...") -start_time_lmer <- Sys.time() - -# Mixed model approach (much faster than aov with complex error structures) -mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + - (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN), - data = long_data_clean) - -end_time_lmer <- Sys.time() -print(paste("lmer model completed in:", round(as.numeric(difftime(end_time_lmer, start_time_lmer, units = "secs")), 2), "seconds")) # force seconds so the "seconds" label is always accurate - -print("Mixed Model Results (lmer):") -print(summary(mixed_lmer_model)) - -# ANOVA table for lmer model -print("ANOVA Table for Mixed Model:") -print(anova(mixed_lmer_model)) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - 
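
A hedged sketch of how these Bonferroni-adjusted contrasts could also be reported as standardized effect sizes: emmeans::eff_size() needs an error SD and its degrees of freedom, which a multi-stratum aov() fit does not expose as a single number, so this sketch reuses the lmer fit from above. Treating the lmer residual SD as the standardizer is one defensible choice (it ignores the random-effect variance components), not the script's own method; the object name lmer_td_emm is new to this aside.

# Standardized effect sizes for the TIME × DOMAIN contrasts, via the lmer fit
lmer_td_emm <- emmeans(mixed_lmer_model, ~ TIME * DOMAIN)
eff_size(lmer_td_emm,
         sigma = sigma(mixed_lmer_model),        # residual SD as standardizer (an assumption, see above)
         edf   = df.residual(mixed_lmer_model))  # degrees of freedom for that SD
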
-print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS -# ============================================================================= - -# Load effsize package for Cohen's d calculations -library(effsize) - -print("\n=== COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS ===") - -# Extract significant comparisons from three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_pairs <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - temporal_do <- as.character(comparison$TEMPORAL_DO) - time <- as.character(comparison$TIME) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - domain1 <- trimws(contrast_parts[1]) - domain2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - data1 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain1] - - data2 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain2] - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s | %s, %s\n", contrast_name, temporal_do, time)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", 
cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } -} else { - cat("No significant pairwise comparisons found in three-way interaction.\n") -} - diff --git a/.history/eohi1/mixed anova - domain means_20250915122318.r b/.history/eohi1/mixed anova - domain means_20250915122318.r deleted file mode 100644 index bebe7e3..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915122318.r +++ /dev/null @@ -1,744 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(lme4) # For mixed models -library(lmerTest) # For mixed model significance tests - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Remove any rows with missing values - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# 
Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - group_by(!!sym(group_var)) %>% - summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample 
<- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests 
using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- 
c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # 
ezANOVA reports generalized eta squared (ges) directly, so no conversion from partial eta squared is needed - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- round(ez_effects$ges, 5) # round only the numeric column; round() on the data frame would fail on the character Effect column - print("Generalized Eta Squared from ezANOVA:") - print(ez_effects[, c("Effect", "ges")]) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Keep the full aov() summary on hand; the per-stratum sums of squares needed for hand-computed effect sizes live here -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# ALTERNATIVE: MIXED MODEL USING LMER (FASTER) -# ============================================================================= - -# Mixed models (libraries already loaded) - -print("Running alternative mixed model using lmer()...") -start_time_lmer <- Sys.time() - -# Mixed model approach (much faster than aov with complex error structures) -mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + - (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN), - data = long_data_clean) - -end_time_lmer <- Sys.time() -print(paste("lmer model completed in:", round(as.numeric(difftime(end_time_lmer, start_time_lmer, units = "secs")), 2), "seconds")) # force seconds so the "seconds" label is always accurate - -print("Mixed Model Results (lmer):") -print(summary(mixed_lmer_model)) - -# ANOVA table for lmer model -print("ANOVA Table for Mixed Model:") -print(anova(mixed_lmer_model)) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - 
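
The adjust = "bonferroni" argument used throughout these post-hoc calls simply multiplies each raw p-value by the number of comparisons in its by-group family, capped at 1. A minimal sanity check of that behavior, assuming the time_domain_emmeans object defined just above (td_raw is a name new to this aside):

# Recompute the Bonferroni adjustment by hand for one contrast family
td_raw <- as.data.frame(pairs(time_domain_emmeans, by = "TIME", adjust = "none"))
td_raw %>%
  group_by(TIME) %>%                                            # one family of comparisons per TIME level
  mutate(p_bonf = p.adjust(p.value, method = "bonferroni")) %>% # should match pairs(..., adjust = "bonferroni")
  ungroup()
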
-print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS ===") - -# Extract significant comparisons from three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_pairs <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - temporal_do <- as.character(comparison$TEMPORAL_DO) - time <- as.character(comparison$TIME) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - domain1 <- trimws(contrast_parts[1]) - domain2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - data1 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain1] - - data2 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain2] - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s | %s, %s\n", contrast_name, temporal_do, time)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - 
cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } -} else { - cat("No significant pairwise comparisons found in three-way interaction.\n") -} - diff --git a/.history/eohi1/mixed anova - domain means_20250915122328.r b/.history/eohi1/mixed anova - domain means_20250915122328.r deleted file mode 100644 index d3eac74..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915122328.r +++ /dev/null @@ -1,744 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(lme4) # For mixed models -library(lmerTest) # For mixed model significance tests - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Remove any rows with missing values - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by 
TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ 
{ - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - 
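
Before the piecewise Mauchly tests below, one hedged aside: the afex package wraps this same between × within design in a single call and applies Greenhouse-Geisser corrections to the within-subjects tests by default, which makes it a convenient cross-check on the aov()/ezANOVA results. afex is an extra dependency that this script's header does not load:

# Same design via afex; within-subjects tests come GG-corrected by default
library(afex)
aov_ez(id = "pID", dv = "MEAN_DIFFERENCE", data = long_data_clean,
       between = "TEMPORAL_DO", within = c("TIME", "DOMAIN"))
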
-# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - 
within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") 
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-# Effect size calculations (library already loaded)
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Calculate generalized eta squared for the aov model
-print("Effect Sizes from aov() model:")
-tryCatch({
-  # partial and generalized eta squared are alternatives; request the
-  # generalized version to match ezANOVA's ges column
-  aov_effects <- eta_squared(mixed_anova_model, generalized = TRUE)
-  print(aov_effects)
-}, error = function(e) {
-  print(paste("Error calculating effect sizes from aov:", e$message))
-})
-
-# Effect sizes from the ezANOVA model
-print("\nEffect Sizes from ezANOVA model:")
-tryCatch({
-  # ezANOVA already reports generalized eta squared in its ges column
-  ez_effects <- ez_corrected$ANOVA
-  ez_effects$ges <- round(ez_effects$ges, 5)
-  print("Generalized Eta Squared from ezANOVA:")
-  print(ez_effects[, c("Effect", "ges")])
-}, error = function(e) {
-  print(paste("Error extracting effect sizes from ezANOVA:", e$message))
-})
-
-# Keep the full aov summary at hand for reporting
-anova_summary <- summary(mixed_anova_model)
-
-# =============================================================================
-# ALTERNATIVE: MIXED MODEL USING LMER (FASTER)
-# =============================================================================
-
-# Mixed models (libraries already loaded)
-
-print("Running alternative mixed model using lmer()...")
-start_time_lmer <- Sys.time()
-
-# Mixed model approach (much faster than aov with complex error structures)
-mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN +
-                           (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN),
-                         data = long_data_clean)
-
-end_time_lmer <- Sys.time()
-print(paste("lmer model completed in:", round(end_time_lmer - start_time_lmer, 2), "seconds"))
-
-print("Mixed Model Results (lmer):")
-print(summary(mixed_lmer_model))
-
-# ANOVA table for lmer model (lmerTest supplies Satterthwaite F tests)
-print("ANOVA Table for Mixed Model:")
-print(anova(mixed_lmer_model))
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(mixed_anova_model, ~ TIME)
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN)
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001)
-print("\n=== TEMPORAL_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO:")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each TIME:")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# TIME × DOMAIN Interaction (Significant: p = 0.012)
-print("\n=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TIME:")
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
- -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS ===") - -# Extract significant comparisons from three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_pairs <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - temporal_do <- as.character(comparison$TEMPORAL_DO) - time <- as.character(comparison$TIME) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - domain1 <- trimws(contrast_parts[1]) - domain2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - data1 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain1] - - data2 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain2] - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s | %s, %s\n", contrast_name, temporal_do, time)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - 
cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } -} else { - cat("No significant pairwise comparisons found in three-way interaction.\n") -} - diff --git a/.history/eohi1/mixed anova - domain means_20250915122355.r b/.history/eohi1/mixed anova - domain means_20250915122355.r deleted file mode 100644 index d3eac74..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915122355.r +++ /dev/null @@ -1,744 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(lme4) # For mixed models -library(lmerTest) # For mixed model significance tests - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Remove any rows with missing values - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by 
-# Overall descriptive statistics by TIME and DOMAIN
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5),
-    min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME and DOMAIN:")
-print(desc_stats)
-
-# Descriptive statistics by between-subjects factors
-desc_stats_by_temporal <- long_data %>%
-  group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:")
-print(desc_stats_by_temporal)
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing (long_data was already
-# NA-filtered above, so this doubles as a consistency check)
-long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ]
-print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x ")))
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(MEAN_DIFFERENCE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME and DOMAIN:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test (streamlined)
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic,
-    ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# Round only the numeric columns
-normality_results_rounded <- normality_results %>%
-  mutate(across(where(is.numeric), ~ round(.x, 5)))
-print(normality_results_rounded)
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN; pick() hands the current
-# group's columns to leveneTest() explicitly
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME, data = pick(MEAN_DIFFERENCE, TIME))$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME, data = pick(MEAN_DIFFERENCE, TIME))$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN, data = pick(MEAN_DIFFERENCE, DOMAIN))$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN, data = pick(MEAN_DIFFERENCE, DOMAIN))$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Reference implementation of Hartley's F-max ratio (the bootstrap below
-# computes the ratio inline)
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: test homogeneity across the BETWEEN-subjects factor
-# (TEMPORAL_DO) within each combination of within-subjects factors (TIME × DOMAIN)
-
-# First, check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# More efficient bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get the unique groups
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!sym(group_var)) %>%
-    dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Extract each group's response vector once (plain vectors, not tibbles,
-  # so sample() and var() behave predictably)
-  group_data_list <- map(groups, ~ {
-    group_data <- data[[response_var]][data[[group_var]] == .x]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-print("\n=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check for missing data patterns
-table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany")
-
-# Check data balance
-xtabs(~ pID + TIME + DOMAIN, data = long_data_clean)
-
-# Check data dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$pID))))
-print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME))))
-print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN))))
-print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO))))
-
-# Check for complete cases
-complete_cases <- long_data_clean[complete.cases(long_data_clean), ]
-print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean)))
-
-# Check if design is balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-
-print(summary(as.vector(design_balance)))
-
-# Check for any participants with missing combinations
-missing_combos <- long_data_clean %>%
-  group_by(pID) %>%
-  summarise(
-    n_combinations = n(),
-    expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8
-    missing_combinations = 8 - n_combinations,
-    .groups = 'drop'
-  )
-
-print("Missing combinations per participant:")
-print(missing_combos[missing_combos$missing_combinations > 0, ])
-
-# Mixed ANOVA using aov()
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-
-mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                         data = long_data_clean)
-
-print("Mixed ANOVA Results:")
-print(summary(mixed_anova_model))
-
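-# A compact cross-check of the aov() fit above (a sketch, assuming the afex
-# package is installed): aov_ez() fits the same between x within design and
-# applies Greenhouse-Geisser corrections automatically.
-# library(afex)
-# afex_model <- aov_ez(id = "pID", dv = "MEAN_DIFFERENCE", data = long_data_clean,
-#                      between = "TEMPORAL_DO", within = c("TIME", "DOMAIN"))
-# print(afex_model)
-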
-# =============================================================================
-# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS
-# =============================================================================
-
-# Sphericity tests using ezANOVA (library already loaded)
-
-print("\n=== SPHERICITY TESTS ===")
-
-# Create a temporary data frame for ezANOVA once, up front, so all three
-# calls below can share it (ezANOVA wants a simple subject id column)
-temp_data <- long_data_clean
-temp_data$id <- as.numeric(as.factor(temp_data$pID))
-
-# Test sphericity for DOMAIN (4 levels - within-subjects)
-print("Mauchly's Test of Sphericity for DOMAIN:")
-tryCatch({
-  # Run ezANOVA to get sphericity tests
-  ez_domain <- ezANOVA(data = temp_data,
-                       dv = MEAN_DIFFERENCE,
-                       wid = id,
-                       between = TEMPORAL_DO,
-                       within = DOMAIN,
-                       type = 3,
-                       detailed = TRUE)
-
-  print("DOMAIN Sphericity Test:")
-  print(ez_domain$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in DOMAIN sphericity test:", e$message))
-})
-
-# TIME has only 2 levels, so sphericity holds by definition and no Mauchly
-# test is produced; the call is kept for completeness
-print("\nMauchly's Test of Sphericity for TIME:")
-tryCatch({
-  ez_time <- ezANOVA(data = temp_data,
-                     dv = MEAN_DIFFERENCE,
-                     wid = id,
-                     between = TEMPORAL_DO,
-                     within = TIME,
-                     type = 3,
-                     detailed = TRUE)
-
-  print("TIME Sphericity Test:")
-  print(ez_time$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in TIME sphericity test:", e$message))
-})
-
-# Test sphericity for TIME × DOMAIN interaction
-print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:")
-tryCatch({
-  ez_interaction <- ezANOVA(data = temp_data,
-                            dv = MEAN_DIFFERENCE,
-                            wid = id,
-                            between = TEMPORAL_DO,
-                            within = .(TIME, DOMAIN),
-                            type = 3,
-                            detailed = TRUE)
-
-  print("TIME × DOMAIN Sphericity Test:")
-  print(ez_interaction$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in TIME × DOMAIN sphericity test:", e$message))
-})
-
-# =============================================================================
-# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Get corrected results from ezANOVA
-ez_corrected <- ezANOVA(data = long_data_clean,
-                        dv = MEAN_DIFFERENCE,
-                        wid = pID,
-                        between = TEMPORAL_DO,
-                        within = .(TIME, DOMAIN),
-                        type = 3,
-                        detailed = TRUE)
-
-print("Corrected ANOVA Results with Sphericity Corrections:")
-print(ez_corrected$ANOVA)
-
-# Mauchly's tests for the full model; the epsilon values themselves live in
-# the `Sphericity Corrections` element printed below
-print("\nMauchly's Tests for the Full Model:")
-print(ez_corrected$Mauchly)
-
-print("\nSphericity-Corrected Results:")
-print("Available elements in ez_corrected object:")
-print(names(ez_corrected))
-
-# Check if sphericity corrections are available
-if(!is.null(ez_corrected$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(ez_corrected$`Sphericity Corrections`)
-} else {
-  print("\nNote: Sphericity corrections may not be displayed if sphericity is met")
-  print("Check the Mauchly's test p-values above to determine if corrections are needed")
-}
within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") 
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-# Effect size calculations (library already loaded)
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Calculate generalized eta squared for the aov model
-print("Effect Sizes from aov() model:")
-tryCatch({
-  # partial and generalized eta squared are alternatives; request the
-  # generalized version to match ezANOVA's ges column
-  aov_effects <- eta_squared(mixed_anova_model, generalized = TRUE)
-  print(aov_effects)
-}, error = function(e) {
-  print(paste("Error calculating effect sizes from aov:", e$message))
-})
-
-# Effect sizes from the ezANOVA model
-print("\nEffect Sizes from ezANOVA model:")
-tryCatch({
-  # ezANOVA already reports generalized eta squared in its ges column
-  ez_effects <- ez_corrected$ANOVA
-  ez_effects$ges <- round(ez_effects$ges, 5)
-  print("Generalized Eta Squared from ezANOVA:")
-  print(ez_effects[, c("Effect", "ges")])
-}, error = function(e) {
-  print(paste("Error extracting effect sizes from ezANOVA:", e$message))
-})
-
-# Keep the full aov summary at hand for reporting
-anova_summary <- summary(mixed_anova_model)
-
-# =============================================================================
-# ALTERNATIVE: MIXED MODEL USING LMER (FASTER)
-# =============================================================================
-
-# Mixed models (libraries already loaded)
-
-print("Running alternative mixed model using lmer()...")
-start_time_lmer <- Sys.time()
-
-# Mixed model approach (much faster than aov with complex error structures)
-mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN +
-                           (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN),
-                         data = long_data_clean)
-
-end_time_lmer <- Sys.time()
-print(paste("lmer model completed in:", round(end_time_lmer - start_time_lmer, 2), "seconds"))
-
-print("Mixed Model Results (lmer):")
-print(summary(mixed_lmer_model))
-
-# ANOVA table for lmer model (lmerTest supplies Satterthwaite F tests)
-print("ANOVA Table for Mixed Model:")
-print(anova(mixed_lmer_model))
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(mixed_anova_model, ~ TIME)
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN)
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001)
-print("\n=== TEMPORAL_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO:")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each TIME:")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# TIME × DOMAIN Interaction (Significant: p = 0.012)
-print("\n=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TIME:")
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
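-
-# A quick visual companion to these cell means (a sketch; emmip() ships with
-# emmeans and returns a ggplot object, assuming ggplot2 is available):
-# print(emmip(mixed_anova_model, TIME ~ DOMAIN, CIs = TRUE))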
- -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS ===") - -# Extract significant comparisons from three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_pairs <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - temporal_do <- as.character(comparison$TEMPORAL_DO) - time <- as.character(comparison$TIME) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - domain1 <- trimws(contrast_parts[1]) - domain2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - data1 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain1] - - data2 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain2] - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s | %s, %s\n", contrast_name, temporal_do, time)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - 
cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } -} else { - cat("No significant pairwise comparisons found in three-way interaction.\n") -} - diff --git a/.history/eohi1/mixed anova - domain means_20250915122357.r b/.history/eohi1/mixed anova - domain means_20250915122357.r deleted file mode 100644 index d3eac74..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915122357.r +++ /dev/null @@ -1,744 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(lme4) # For mixed models -library(lmerTest) # For mixed model significance tests - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Remove any rows with missing values - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by 
TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ 
{ - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - 
-# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - 
within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") 
-tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# ALTERNATIVE: MIXED MODEL USING LMER (FASTER) -# ============================================================================= - -# Mixed models (libraries already loaded) - -print("Running alternative mixed model using lmer()...") -start_time_lmer <- Sys.time() - -# Mixed model approach (much faster than aov with complex error structures) -mixed_lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + - (1|pID) + (1|pID:TIME) + (1|pID:DOMAIN), - data = long_data_clean) - -end_time_lmer <- Sys.time() -print(paste("lmer model completed in:", round(end_time_lmer - start_time_lmer, 2), "seconds")) - -print("Mixed Model Results (lmer):") -print(summary(mixed_lmer_model)) - -# ANOVA table for lmer model -print("ANOVA Table for Mixed Model:") -print(anova(mixed_lmer_model)) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) 
- -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS ===") - -# Extract significant comparisons from three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_pairs <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - temporal_do <- as.character(comparison$TEMPORAL_DO) - time <- as.character(comparison$TIME) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - domain1 <- trimws(contrast_parts[1]) - domain2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - data1 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain1] - - data2 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain2] - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s | %s, %s\n", contrast_name, temporal_do, time)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - 
cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } -} else { - cat("No significant pairwise comparisons found in three-way interaction.\n") -} - diff --git a/.history/eohi1/mixed anova - domain means_20250915122438.r b/.history/eohi1/mixed anova - domain means_20250915122438.r deleted file mode 100644 index 16cf8fe..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915122438.r +++ /dev/null @@ -1,734 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(lme4) # For mixed models -library(lmerTest) # For mixed model significance tests - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Remove any rows with missing values - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by 
TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ 
{ - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - 
-# Sphericity tests using ezANOVA (library already loaded)
-
-print("\n=== SPHERICITY TESTS ===")
-
-# Create a temporary data frame with a numeric participant id for ezANOVA
-# (defined outside tryCatch so all three tests below can use it)
-temp_data <- long_data_clean
-temp_data$id <- as.numeric(as.factor(temp_data$pID))
-
-# Test sphericity for DOMAIN (4 levels - within-subjects)
-print("Mauchly's Test of Sphericity for DOMAIN:")
-tryCatch({
-  # Run ezANOVA to get sphericity tests
-  ez_domain <- ezANOVA(data = temp_data,
-                       dv = MEAN_DIFFERENCE,
-                       wid = id,
-                       between = TEMPORAL_DO,
-                       within = DOMAIN,
-                       type = 3,
-                       detailed = TRUE)
-
-  print("DOMAIN Sphericity Test:")
-  print(ez_domain$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in DOMAIN sphericity test:", e$message))
-})
-
-# Test sphericity for TIME (2 levels - within-subjects)
-# Note: with only two levels sphericity holds trivially, so no Mauchly test
-# is produced here; this block mainly serves as a sanity check
-print("\nMauchly's Test of Sphericity for TIME:")
-tryCatch({
-  ez_time <- ezANOVA(data = temp_data,
-                     dv = MEAN_DIFFERENCE,
-                     wid = id,
-                     between = TEMPORAL_DO,
-                     within = TIME,
-                     type = 3,
-                     detailed = TRUE)
-
-  print("TIME Sphericity Test:")
-  print(ez_time$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in TIME sphericity test:", e$message))
-})
-
-# Test sphericity for TIME × DOMAIN interaction
-print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:")
-tryCatch({
-  ez_interaction <- ezANOVA(data = temp_data,
-                            dv = MEAN_DIFFERENCE,
-                            wid = id,
-                            between = TEMPORAL_DO,
-                            within = .(TIME, DOMAIN),
-                            type = 3,
-                            detailed = TRUE)
-
-  print("TIME × DOMAIN Sphericity Test:")
-  print(ez_interaction$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in TIME × DOMAIN sphericity test:", e$message))
-})
-
-# =============================================================================
-# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Get corrected results from ezANOVA
-ez_corrected <- ezANOVA(data = long_data_clean,
-                        dv = MEAN_DIFFERENCE,
-                        wid = pID,
-                        between = TEMPORAL_DO,
-                        within = .(TIME, DOMAIN),
-                        type = 3,
-                        detailed = TRUE)
-
-print("Corrected ANOVA Results with Sphericity Corrections:")
-print(ez_corrected$ANOVA)
-
-# Show Mauchly's tests; the epsilon values themselves (GGe/HFe) appear in
-# the Sphericity Corrections table below, not in the Mauchly element
-print("\nMauchly's Tests:")
-print(ez_corrected$Mauchly)
-
-# Show sphericity-corrected results
-print("\nSphericity-Corrected Results:")
-print("Available elements in ez_corrected object:")
-print(names(ez_corrected))
-
-# Check if sphericity corrections are available
-if(!is.null(ez_corrected$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(ez_corrected$`Sphericity Corrections`)
-} else {
-  print("\nNote: ezANOVA omits the corrections table when sphericity is met")
-  print("Check the Mauchly's test p-values above to determine if corrections are needed")
-}
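-# --- Editor's sketch (optional cross-check; not part of the original run) ---
-# afex::aov_ez wraps car::Anova() and prints Greenhouse-Geisser/Huynh-Feldt
-# corrected tests directly. A minimal sketch, assuming long_data_clean as
-# built above; afex only appears in a later snapshot's library list, so
-# loading it here is an extra dependency.
-library(afex)
-afex_model <- aov_ez(
-  id = "pID",
-  dv = "MEAN_DIFFERENCE",
-  data = long_data_clean,
-  between = "TEMPORAL_DO",
-  within = c("TIME", "DOMAIN")
-)
-summary(afex_model)  # Mauchly tests plus GG/HF-corrected univariate results
-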
-# =============================================================================
-# ALTERNATIVE: MANUAL SPHERICITY EPSILON CALCULATION
-# =============================================================================
-
-print("\n=== MANUAL SPHERICITY EPSILON CALCULATION ===")
-
-# Build wide-format data to compute the sphericity epsilons by hand
-# (car itself has no epsilon() helper; its route is shown in the sketch below)
-
-tryCatch({
-  # Convert to wide format (one row per participant, eight within cells)
-  wide_data <- long_data_clean %>%
-    select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>%
-    pivot_wider(names_from = c(TIME, DOMAIN),
-                values_from = MEAN_DIFFERENCE,
-                names_sep = "_")
-
-  # Columns of the repeated-measures design
-  within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life",
-                   "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life")
-
-  # Check if all columns exist
-  missing_cols <- within_vars[!within_vars %in% colnames(wide_data)]
-  if(length(missing_cols) > 0) {
-    print(paste("Missing columns for epsilon calculation:", paste(missing_cols, collapse = ", ")))
-  } else {
-    # Repeated-measures response matrix (participants x 8 within cells)
-    rm_design <- as.matrix(wide_data[, within_vars])
-
-    # Compute epsilons from the covariance matrix of the within cells.
-    # Caveat: this uses the pooled sample covariance, ignoring the between
-    # groups; packages pool within groups, so values may differ slightly.
-    cc <- complete.cases(rm_design)
-    n <- sum(cc)
-    k <- ncol(rm_design)
-    S <- cov(rm_design[cc, , drop = FALSE])
-
-    # Orthonormal contrasts spanning the within-subject contrast space
-    C <- qr.Q(qr(cbind(rep(1, k), contr.helmert(k))))[, -1, drop = FALSE]
-    A <- t(C) %*% S %*% C
-
-    # Greenhouse-Geisser (Box) epsilon
-    epsilon_gg <- sum(diag(A))^2 / ((k - 1) * sum(A * A))
-
-    # Huynh-Feldt epsilon (g = number of between-subject groups; exact
-    # formulas differ slightly across sources and software)
-    g <- nlevels(wide_data$TEMPORAL_DO)
-    epsilon_hf <- min(1, (n * (k - 1) * epsilon_gg - 2) /
-                         ((k - 1) * (n - g - (k - 1) * epsilon_gg)))
-
-    print("Epsilon values computed from the covariance matrix:")
-    print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4)))
-    print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4)))
-
-    # Interpretation
-    if(epsilon_gg < 0.75) {
-      print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)")
-    } else if(epsilon_hf > 0.75) {
-      print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)")
-    } else {
-      print("Recommendation: Use Greenhouse-Geisser correction (conservative)")
-    }
-
-    # =========================================================================
-    # MANUAL SPHERICITY CORRECTIONS
-    # =========================================================================
-
-    print("\n=== MANUAL SPHERICITY CORRECTIONS ===")
-
-    # Apply corrections to the original ANOVA results
-    print("Applying Greenhouse-Geisser corrections to DOMAIN effects:")
-
-    # DOMAIN main effect (DFn = 3, DFd = 3183)
-    domain_df_corrected_gg <- 3 * epsilon_gg
-    domain_df_corrected_hf <- 3 * epsilon_hf
-
-    print(paste("DOMAIN: Original df = 3, 3183"))
-    print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2)))
-    print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2)))
-
-    # TIME × DOMAIN interaction (DFn = 3, DFd = 3183)
-    interaction_df_corrected_gg <- 3 * epsilon_gg
-    interaction_df_corrected_hf <- 3 * epsilon_hf
-
-    print(paste("TIME × DOMAIN: Original df = 3, 3183"))
-    print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2)))
-    print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2)))
-
-    # Note: corrected p-values require re-running the F-tests with these
-    # corrected degrees of freedom; ezANOVA reports them automatically in
-    # its Sphericity Corrections table above
-    print("\nNote: To get corrected p-values, recalculate the F-tests with the corrected degrees of freedom")
-  }
-
-}, error = function(e) {
-  print(paste("Error in manual epsilon calculation:", e$message))
-})
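-# --- Editor's sketch: the standard car route to GG/HF corrections ---
-# car exposes sphericity corrections through Anova() on a multivariate lm
-# fit, not through an epsilon() helper. A minimal sketch, assuming the
-# wide_data and within_vars objects created above (column order: DOMAIN
-# varies fastest within TIME, matching expand.grid below).
-mlm_fit <- lm(as.matrix(wide_data[, within_vars]) ~ TEMPORAL_DO, data = wide_data)
-idata <- expand.grid(
-  DOMAIN = c("Preferences", "Personality", "Values", "Life"),
-  TIME = c("Past", "Future")
-)
-rm_anova <- car::Anova(mlm_fit, idata = idata, idesign = ~ TIME * DOMAIN, type = 3)
-summary(rm_anova, multivariate = FALSE)  # univariate tests with GG and HF rows
-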
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-# Effect size calculations (library already loaded)
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Calculate generalized eta squared for the aov model
-print("Effect Sizes from aov() model:")
-tryCatch({
-  # Extract effect sizes from the aov model
-  aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE)
-  # round() fails on the character columns of the effectsize table,
-  # so print the table directly
-  print(aov_effects)
-}, error = function(e) {
-  print(paste("Error calculating effect sizes from aov:", e$message))
-})
-
-# Report effect sizes from the ezANOVA model
-print("\nEffect Sizes from ezANOVA model:")
-tryCatch({
-  # ezANOVA reports generalized eta squared directly in its "ges" column
-  ez_effects <- ez_corrected$ANOVA
-  ez_effects$ges <- round(ez_effects$ges, 5)
-  print("Generalized Eta Squared from ezANOVA:")
-  print(ez_effects[, c("Effect", "ges")])
-}, error = function(e) {
-  print(paste("Error extracting effect sizes from ezANOVA:", e$message))
-})
-
-# Keep the full aov() summary object for reference; the effect sizes above
-# come from effectsize::eta_squared() and ezANOVA's ges column
-anova_summary <- summary(mixed_anova_model)
-
-# =============================================================================
-# NOTE: MIXED MODELS (LMER) NOT NEEDED
-# =============================================================================
-
-# For this balanced repeated measures design, Type III ANOVA with proper
-# sphericity corrections (implemented above) is the most appropriate approach.
-# Mixed models (lmer) are typically used for:
-# - Unbalanced designs
-# - Missing data patterns
-# - Nested random effects
-# - Large, complex datasets
-#
-# This design is balanced and complete, making Type III ANOVA optimal.
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(mixed_anova_model, ~ TIME)
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN)
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
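-# --- Editor's sketch: quick visual check before the interaction contrasts ---
-# emmeans::emmip() plots estimated marginal means; handy for eyeballing the
-# TEMPORAL_DO x TIME and TIME x DOMAIN patterns unpacked below. Uses only
-# objects already defined above (ggplot2 comes in with tidyverse).
-emmip(mixed_anova_model, TEMPORAL_DO ~ TIME | DOMAIN, CIs = TRUE) +
-  labs(y = "Estimated marginal mean of MEAN_DIFFERENCE")
-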
===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS ===") - -# Extract significant comparisons from three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_pairs <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - temporal_do <- as.character(comparison$TEMPORAL_DO) - time <- as.character(comparison$TIME) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - domain1 <- trimws(contrast_parts[1]) - domain2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - data1 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain1] - - data2 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain2] - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s | %s, %s\n", contrast_name, temporal_do, time)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } -} else { - cat("No significant pairwise comparisons found in three-way interaction.\n") -} - diff --git a/.history/eohi1/mixed anova - domain means_20250915122442.r b/.history/eohi1/mixed anova - domain 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ 
{ - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results:") -print(summary(mixed_anova_model)) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - 
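# --- Editorial sketch, not part of the original snapshot ----------------------
# afex is already loaded above, and a later snapshot in this same patch fits
# the identical design with aov_ez(); its summary reports Mauchly's tests
# together with Greenhouse-Geisser / Huynh-Feldt corrected p-values, which can
# cross-check the ezANOVA-based tests below. The id/dv/factor names assume the
# long-format data built earlier in this script.
fit_afex <- aov_ez(id = "pID",
                   dv = "MEAN_DIFFERENCE",
                   data = long_data_clean,
                   between = "TEMPORAL_DO",
                   within = c("TIME", "DOMAIN"))
summary(fit_afex)                    # univariate tests, Mauchly, GG/HF epsilons
anova(fit_afex, correction = "GG")   # ANOVA table with GG-corrected p-values
# ------------------------------------------------------------------------------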
-# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - 
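# --- Editorial sketch, not part of the original snapshot ----------------------
# An epsilon() helper is called on rm_design below, but it may not be exported
# by every version of car. If that call fails, Greenhouse-Geisser epsilon can
# be computed directly from the double-centered covariance matrix of the
# within-subject columns (Box's formula):
#   eps_GG = tr(CSC)^2 / ((k - 1) * tr((CSC)^2)),  C = I - J/k
gg_epsilon <- function(X) {             # X: n x k matrix of repeated measures
  S <- cov(X, use = "complete.obs")     # covariance of the k conditions
  k <- ncol(S)
  C <- diag(k) - 1 / k                  # centering matrix
  Sc <- C %*% S %*% C                   # double-centered covariance
  sum(diag(Sc))^2 / ((k - 1) * sum(Sc^2))  # sum(Sc^2) = tr(Sc %*% Sc) for symmetric Sc
}
# e.g. gg_epsilon(rm_design) once rm_design is built below; 1/(k-1) <= eps <= 1
# ------------------------------------------------------------------------------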
within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") 
-tryCatch({
-  # ezANOVA's detailed output already includes generalized eta squared (ges)
-  ez_effects <- ez_corrected$ANOVA
-  ez_effects$ges <- round(ez_effects$ges, 5)
-  print("Generalized Eta Squared from ezANOVA:")
-  print(ez_effects[, c("Effect", "ges")])
-}, error = function(e) {
-  print(paste("Error extracting effect sizes from ezANOVA:", e$message))
-})
-
-# Extract effect sizes (generalized eta squared)
-# For aov() objects, we need to extract from the summary
-anova_summary <- summary(mixed_anova_model)
-
-# =============================================================================
-# NOTE: MIXED MODELS (LMER) NOT NEEDED
-# =============================================================================
-
-# For this balanced repeated measures design, Type III ANOVA with proper
-# sphericity corrections (implemented above) is the most appropriate approach.
-# Mixed models (lmer) are typically used for:
-# - Unbalanced designs
-# - Missing data patterns
-# - Nested random effects
-# - Large, complex datasets
-#
-# Your design is balanced and complete, making Type III ANOVA optimal.
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(mixed_anova_model, ~ TIME)
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN)
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001)
-print("\n=== TEMPORAL_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO:")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each TIME:")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# TIME × DOMAIN Interaction (Significant: p = 0.012)
-print("\n=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TIME:")
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-
-print("\nSimple Effects of TIME within each DOMAIN:")
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(time_domain_simple2)
-
-# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058)
-print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===")
-temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN)
-print("Estimated Marginal Means:")
-print(temporal_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:")
-temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:")
-temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(temporal_domain_simple2)
-
-# =============================================================================
-# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS
-# =============================================================================
-
-# All pairwise comparisons for the three-way interaction
-print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===")
-three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-print("Estimated Marginal Means for all combinations:")
-print(three_way_emmeans)
-
-# Pairwise comparisons within each TEMPORAL_DO × TIME combination
-print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:")
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni")
-print(three_way_contrasts)
-
-# =============================================================================
-# COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS
-# =============================================================================
-
-# Cohen's d calculations (library already loaded)
-
-print("\n=== COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS ===")
-
-# Extract significant comparisons from three-way interaction
-three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-significant_pairs <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ]
-
-if(nrow(significant_pairs) > 0) {
-  cat("Significant pairwise comparisons (p < 0.05):\n")
-  print(significant_pairs)
-
-  cat("\nCohen's d calculated from raw data:\n")
-
-  for(i in seq_len(nrow(significant_pairs))) {
-    comparison <- significant_pairs[i, ]
-    contrast_name <- as.character(comparison$contrast)
-    temporal_do <- as.character(comparison$TEMPORAL_DO)
-    time <- as.character(comparison$TIME)
-
-    # Parse the contrast
-    contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-    if(length(contrast_parts) == 2) {
-      domain1 <- trimws(contrast_parts[1])
-      domain2 <- trimws(contrast_parts[2])
-
-      # Get raw data for both conditions
-      data1 <- long_data_clean$MEAN_DIFFERENCE[
-        long_data_clean$TEMPORAL_DO == temporal_do &
-        long_data_clean$TIME == time &
-        long_data_clean$DOMAIN == domain1]
-
-      data2 <- long_data_clean$MEAN_DIFFERENCE[
-        long_data_clean$TEMPORAL_DO == temporal_do &
-        long_data_clean$TIME == time &
-        long_data_clean$DOMAIN == domain2]
-
-      if(length(data1) > 0 && length(data2) > 0) {
-        # Calculate Cohen's d using effsize package
-        cohens_d_result <- cohen.d(data1, data2)
-
-        cat(sprintf("Comparison: %s | %s, %s\n", contrast_name, temporal_do, time))
-        cat(sprintf("  n1 = %d, n2 = %d\n", length(data1), length(data2)))
-        cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-        cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-        cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-        cat("\n")
-      }
-    }
-  }
-} else {
-  cat("No significant pairwise comparisons found in three-way interaction.\n")
-}
-
diff --git a/.history/eohi1/mixed anova - domain
means_20250915122455.r deleted file mode 100644 index 2228623..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915122455.r +++ /dev/null @@ -1,744 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Remove any rows with missing values - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = 
TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ 
{ - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = 
"MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "DOMAIN")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR 
PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes 
from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. -# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple 
<- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS ===") - -# Extract significant comparisons from three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_pairs <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - temporal_do <- as.character(comparison$TEMPORAL_DO) - time <- as.character(comparison$TIME) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - domain1 <- trimws(contrast_parts[1]) - domain2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - data1 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain1] - - data2 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain2] - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s | %s, %s\n", contrast_name, temporal_do, time)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", 
cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } -} else { - cat("No significant pairwise comparisons found in three-way interaction.\n") -} - diff --git a/.history/eohi1/mixed anova - domain means_20250915122501.r b/.history/eohi1/mixed anova - domain means_20250915122501.r deleted file mode 100644 index 2228623..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915122501.r +++ /dev/null @@ -1,744 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Remove any rows with missing values - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# 
============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: test homogeneity across the BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-# First, check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# Bootstrap function for Hartley's F-max critical values
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!sym(group_var)) %>%
-    dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Guard against zero or missing variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Extract each group's response as a plain vector once; [[ ]] indexing
-  # matters here because `data` arrives as a tibble (via pick()), where
-  # data[rows, col] returns a one-column tibble and sample() would then
-  # resample columns instead of values
-  group_data_list <- map(groups, ~ {
-    group_data <- data[[response_var]][data[[group_var]] == .x]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ 
{ - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = 
"MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "DOMAIN")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR 
PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes 
from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. -# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple 
<- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS ===") - -# Extract significant comparisons from three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_pairs <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - temporal_do <- as.character(comparison$TEMPORAL_DO) - time <- as.character(comparison$TIME) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - domain1 <- trimws(contrast_parts[1]) - domain2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - data1 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain1] - - data2 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain2] - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s | %s, %s\n", contrast_name, temporal_do, time)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", 
cohens_d_result$estimate))
-        cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-        cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-        cat("\n")
-      }
-    }
-  }
-} else {
-  cat("No significant pairwise comparisons found in three-way interaction.\n")
-}
-
diff --git a/.history/eohi1/mixed anova - domain means_20250915122534.r b/.history/eohi1/mixed anova - domain means_20250915122534.r
deleted file mode 100644
index e505219..0000000
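
One caveat worth recording against the Cohen's d loop above: cohen.d(data1, data2) treats the two domain vectors as independent groups, but DOMAIN is a within-subjects factor, so the same participants contribute to both vectors. A minimal sketch of a paired variant, assuming complete pairs can be matched on pID (align_pairs is a hypothetical helper, not part of these scripts):

library(effsize)

# Hypothetical helper: align the two domains participant-by-participant
align_pairs <- function(df, cond, t, dom1, dom2) {
  sub <- df[df$TEMPORAL_DO == cond & df$TIME == t, ]
  merge(sub[sub$DOMAIN == dom1, c("pID", "MEAN_DIFFERENCE")],
        sub[sub$DOMAIN == dom2, c("pID", "MEAN_DIFFERENCE")],
        by = "pID")  # yields columns MEAN_DIFFERENCE.x / MEAN_DIFFERENCE.y
}

pairs_df <- align_pairs(long_data_clean, "01PAST", "Past", "Values", "Life")
cohen.d(pairs_df$MEAN_DIFFERENCE.x, pairs_df$MEAN_DIFFERENCE.y, paired = TRUE)

The merge guarantees the two vectors are the same length and in the same participant order, which paired = TRUE requires.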
diff --git a/.history/eohi1/mixed anova - domain means_20250915122540.r b/.history/eohi1/mixed anova - domain means_20250915122540.r
deleted file mode 100644
index e505219..0000000
--- a/.history/eohi1/mixed anova - domain means_20250915122540.r
+++ /dev/null
@@ -1,743 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex)        # For aov_ez (cleaner ANOVA output)
-library(nortest)     # For normality tests
-library(emmeans)     # For post-hoc comparisons
-library(purrr)       # For map functions
-library(effsize)     # For Cohen's d calculations
-library(effectsize)  # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x ")))
-print(paste("Number of participants:", length(unique(data$pID))))
-
-# Verify the specific variables we need
-required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required domain mean variables found!")
-}
-
-# Define domain mapping; column names TIME and DOMAIN must match the factor
-# names used downstream (lowercase names here would break the select() below)
-domain_mapping <- data.frame(
-  variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-               "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
-  TIME = c(rep("Past", 4), rep("Future", 4)),
-  DOMAIN = rep(c("Preferences", "Personality", "Values", "Life"), 2),
-  stringsAsFactors = FALSE
-)
-
-# Domain mapping created
-
-# Efficient data pivoting using pivot_longer
-long_data <- data %>%
-  select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>%
-  pivot_longer(
-    cols = all_of(required_vars),
-    names_to = "variable",
-    values_to = "MEAN_DIFFERENCE"
-  ) %>%
-  left_join(domain_mapping, by = "variable") %>%
-  select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>%
-  # Convert to factors with proper levels
-  mutate(
-    TIME = factor(TIME, levels = c("Past", "Future")),
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")),
-    pID = as.factor(pID),
-    TEMPORAL_DO = as.factor(TEMPORAL_DO)
-  ) %>%
-  # Remove any rows with missing values
-  filter(!is.na(MEAN_DIFFERENCE))
-
-print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x ")))
-print(paste("Number of participants:", length(unique(long_data$pID))))
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME, data = pick(everything()))$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME, data = pick(everything()))$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN, data = pick(everything()))$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN, data = pick(everything()))$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once, as plain numeric vectors (single-bracket indexing on a - # tibble would return a one-column tibble, which breaks sample() below) - group_data_list <- map(groups, ~ { - group_data <- data[[response_var]][data[[group_var]] == .x] - group_data[!is.na(group_data)] - }) - -  # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ 
{ - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = 
"MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "DOMAIN")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR 
PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes 
from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. -# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple 
<- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS ===") - -# Extract significant comparisons from three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_pairs <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - temporal_do <- as.character(comparison$TEMPORAL_DO) - time <- as.character(comparison$TIME) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - domain1 <- trimws(contrast_parts[1]) - domain2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - data1 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain1] - - data2 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain2] - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s | %s, %s\n", contrast_name, temporal_do, time)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", 
cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } -} else { - cat("No significant pairwise comparisons found in three-way interaction.\n") -} - diff --git a/.history/eohi1/mixed anova - domain means_20250915122551.r b/.history/eohi1/mixed anova - domain means_20250915122551.r deleted file mode 100644 index e505219..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915122551.r +++ /dev/null @@ -1,743 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x "))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping (uppercase TIME/DOMAIN so the joined columns match the names used below) -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - TIME = c(rep("Past", 4), rep("Future", 4)), - DOMAIN = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Remove any rows with missing values - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x "))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - 
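# Quick structural sanity check of the reshaping above, on hypothetical toy data
# (two participants, constant values); it reuses required_vars and domain_mapping
# defined earlier and is illustrative only, not part of the analysis proper.
toy <- data.frame(pID = 1:2, ResponseId = c("R_1", "R_2"),
                  TEMPORAL_DO = c("01PAST", "02FUT"))
toy[required_vars] <- 0.5
toy_long <- toy %>%
  pivot_longer(cols = all_of(required_vars),
               names_to = "variable", values_to = "MEAN_DIFFERENCE") %>%
  left_join(domain_mapping, by = "variable")
stopifnot(nrow(toy_long) == 2 * 8)           # one row per participant x TIME x DOMAIN
print(table(toy_long$TIME, toy_long$DOMAIN)) # 2 observations in each of the 8 cells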
-# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME, data = pick(everything()))$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME, data = pick(everything()))$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN, data = pick(everything()))$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN, data = pick(everything()))$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once, as plain numeric vectors (single-bracket indexing on a - # tibble would return a one-column tibble, which breaks sample() below) - group_data_list <- map(groups, ~ { - group_data <- data[[response_var]][data[[group_var]] == .x] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ 
{ - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = 
"MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "DOMAIN")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR 
PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes 
from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. -# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple 
<- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS ===") - -# Extract significant comparisons from three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_pairs <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - temporal_do <- as.character(comparison$TEMPORAL_DO) - time <- as.character(comparison$TIME) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - domain1 <- trimws(contrast_parts[1]) - domain2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - data1 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain1] - - data2 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain2] - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s | %s, %s\n", contrast_name, temporal_do, time)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", 
cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } -} else { - cat("No significant pairwise comparisons found in three-way interaction.\n") -} - diff --git a/.history/eohi1/mixed anova - domain means_20250915122806.r b/.history/eohi1/mixed anova - domain means_20250915122806.r deleted file mode 100644 index a7196b2..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915122806.r +++ /dev/null @@ -1,743 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x "))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping (uppercase TIME/DOMAIN so the joined columns match the names used below) -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - TIME = c(rep("Past", 4), rep("Future", 4)), - DOMAIN = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x "))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# 
============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME, data = pick(everything()))$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME, data = pick(everything()))$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN, data = pick(everything()))$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN, data = pick(everything()))$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once, as plain numeric vectors (single-bracket indexing on a - # tibble would return a one-column tibble, which breaks sample() below) - group_data_list <- map(groups, ~ { - group_data <- data[[response_var]][data[[group_var]] == .x] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ 
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-# (defined for reference; the pipelines below compute the ratio inline)
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: test homogeneity across the BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-# First, let's check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # Calculate F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# Bootstrap function for Hartley's F-max test (pre-allocates storage for efficiency)
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get the unique groups
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!sym(group_var)) %>%
-    dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once
-  group_data_list <- map(groups, ~ {
-    group_data <- data[data[[group_var]] == .x, response_var]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-print("\n=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123)  # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
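As a quick sanity check, the function can be exercised on simulated data where both groups share one variance, so the observed ratio should usually sit below the bootstrapped critical value. A sketch under that assumption (toy and toy_result are hypothetical names):

set.seed(42)
toy <- data.frame(
  grp = rep(c("a", "b"), each = 100),  # two equal-variance groups
  y   = rnorm(200)
)
toy_result <- bootstrap_hartley_critical(toy, "grp", "y", n_iter = 500)
print(toy_result$observed_ratio)  # should be near 1 for equal variances
print(toy_result$critical_95)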
"MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "DOMAIN")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR 
PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes 
-# =============================================================================
-# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE
-# =============================================================================
-
-print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===")
-
-# Create wide-format data for the car package (library already loaded)
-
-tryCatch({
-  # Convert to wide format for the car package
-  wide_data <- long_data_clean %>%
-    select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>%
-    pivot_wider(names_from = c(TIME, DOMAIN),
-                values_from = MEAN_DIFFERENCE,
-                names_sep = "_")
-
-  # Columns of the repeated measures design
-  within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life",
-                   "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life")
-
-  # Check if all columns exist
-  missing_cols <- within_vars[!within_vars %in% colnames(wide_data)]
-  if(length(missing_cols) > 0) {
-    print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", ")))
-  } else {
-    # Create the repeated measures design matrix
-    rm_design <- as.matrix(wide_data[, within_vars])
-
-    # Calculate epsilon values
-    print("Epsilon Values from car package:")
-    epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser")
-    epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt")
-
-    print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4)))
-    print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4)))
-
-    # Interpretation
-    if(epsilon_gg < 0.75) {
-      print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)")
-    } else if(epsilon_hf > 0.75) {
-      print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)")
-    } else {
-      print("Recommendation: Use Greenhouse-Geisser correction (conservative)")
-    }
-
-    # =========================================================================
-    # MANUAL SPHERICITY CORRECTIONS
-    # =========================================================================
-
-    print("\n=== MANUAL SPHERICITY CORRECTIONS ===")
-
-    # Apply corrections to the original ANOVA results
-    print("Applying Greenhouse-Geisser corrections to DOMAIN effects:")
-
-    # DOMAIN main effect (DFn = 3, DFd = 3183)
-    domain_df_corrected_gg <- 3 * epsilon_gg
-    domain_df_corrected_hf <- 3 * epsilon_hf
-
-    print("DOMAIN: Original df = 3, 3183")
-    print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2)))
-    print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2)))
-
-    # TIME × DOMAIN interaction (DFn = 3, DFd = 3183)
-    interaction_df_corrected_gg <- 3 * epsilon_gg
-    interaction_df_corrected_hf <- 3 * epsilon_hf
-
-    print("TIME × DOMAIN: Original df = 3, 3183")
-    print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2)))
-    print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2)))
-
-    # Note: p-values must be recalculated with these corrected dfs
-    print("\nNote: To get corrected p-values, recalculate the F-tests with the corrected degrees of freedom")
-    print("The ezANOVA function should handle this automatically, but may not display the corrections")
-  }
-
-}, error = function(e) {
-  print(paste("Error in manual epsilon calculation:", e$message))
-})
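The recalculation described in the note is a one-liner once the epsilons are known: the F statistic itself is unchanged, and both degrees of freedom are multiplied by epsilon before taking the upper tail probability. A sketch (F_domain is a placeholder value, not part of the deleted file; take the actual F from the ANOVA table above):

F_domain <- 25.0  # hypothetical observed F for the DOMAIN effect
p_gg <- pf(F_domain, df1 = 3 * epsilon_gg, df2 = 3183 * epsilon_gg, lower.tail = FALSE)
p_hf <- pf(F_domain, df1 = 3 * epsilon_hf, df2 = 3183 * epsilon_hf, lower.tail = FALSE)
print(c(GG = p_gg, HF = p_hf))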
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-# Effect size calculations (library already loaded)
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Calculate generalized eta squared for the aov model
-print("Effect Sizes from aov() model:")
-tryCatch({
-  # Extract generalized eta squared from the aov model; print directly,
-  # since round() on the mixed-type effect size table would error
-  aov_effects <- eta_squared(mixed_anova_model, generalized = TRUE)
-  print(aov_effects)
-}, error = function(e) {
-  print(paste("Error calculating effect sizes from aov:", e$message))
-})
-
-# Effect sizes from the ezANOVA model
-print("\nEffect Sizes from ezANOVA model:")
-tryCatch({
-  # ezANOVA already reports generalized eta squared in the ges column;
-  # round only that numeric column (round() on the full frame would error)
-  ez_effects <- ez_corrected$ANOVA
-  ez_effects$ges <- round(ez_effects$ges, 5)
-  print("Generalized Eta Squared from ezANOVA:")
-  print(ez_effects[, c("Effect", "ges")])
-}, error = function(e) {
-  print(paste("Error extracting effect sizes from ezANOVA:", e$message))
-})
-
-# For aov() objects, effect sizes can also be extracted from the summary
-anova_summary <- summary(mixed_anova_model)
-
-# =============================================================================
-# NOTE: MIXED MODELS (LMER) NOT NEEDED
-# =============================================================================
-
-# For this balanced repeated measures design, Type III ANOVA with proper
-# sphericity corrections (implemented above) is the most appropriate approach.
-# Mixed models (lmer) are typically used for:
-# - Unbalanced designs
-# - Missing data patterns
-# - Nested random effects
-# - Large, complex datasets
-#
-# This design is balanced and complete, making Type III ANOVA optimal.
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(mixed_anova_model, ~ TIME)
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN)
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001)
-print("\n=== TEMPORAL_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO:")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each TIME:")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# TIME × DOMAIN Interaction (Significant: p = 0.012)
-print("\n=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TIME:")
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-
-print("\nSimple Effects of TIME within each DOMAIN:")
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(time_domain_simple2)
-
-# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058)
-print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===")
-temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN)
-print("Estimated Marginal Means:")
-print(temporal_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:")
-temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:")
-temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(temporal_domain_simple2)
-
-# =============================================================================
-# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS
-# =============================================================================
-
-# All pairwise comparisons for the three-way interaction
-print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===")
-three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-print("Estimated Marginal Means for all combinations:")
-print(three_way_emmeans)
-
-# Pairwise comparisons within each TEMPORAL_DO × TIME combination
-print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:")
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni")
-print(three_way_contrasts)
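Before stepping through dozens of pairwise contrasts, an omnibus view may help: emmeans::joint_tests() collapses the reference grid into one F test per model term, and should accept the multi-stratum aov fit above. A sketch:

# One omnibus F test per term, via the same emmeans machinery
print(joint_tests(mixed_anova_model))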
-# =============================================================================
-# COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS
-# =============================================================================
-
-# Cohen's d calculations (library already loaded)
-
-print("\n=== COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS ===")
-
-# Extract significant comparisons from the three-way interaction
-three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-significant_pairs <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ]
-
-if(nrow(significant_pairs) > 0) {
-  cat("Significant pairwise comparisons (p < 0.05):\n")
-  print(significant_pairs)
-
-  cat("\nCohen's d calculated from raw data:\n")
-
-  for(i in seq_len(nrow(significant_pairs))) {
-    comparison <- significant_pairs[i, ]
-    contrast_name <- as.character(comparison$contrast)
-    temporal_do <- as.character(comparison$TEMPORAL_DO)
-    time <- as.character(comparison$TIME)
-
-    # Parse the contrast into its two domain labels
-    contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-    if(length(contrast_parts) == 2) {
-      domain1 <- trimws(contrast_parts[1])
-      domain2 <- trimws(contrast_parts[2])
-
-      # Get raw data for both conditions
-      data1 <- long_data_clean$MEAN_DIFFERENCE[
-        long_data_clean$TEMPORAL_DO == temporal_do &
-        long_data_clean$TIME == time &
-        long_data_clean$DOMAIN == domain1]
-
-      data2 <- long_data_clean$MEAN_DIFFERENCE[
-        long_data_clean$TEMPORAL_DO == temporal_do &
-        long_data_clean$TIME == time &
-        long_data_clean$DOMAIN == domain2]
-
-      if(length(data1) > 0 && length(data2) > 0) {
-        # Calculate Cohen's d using the effsize package
-        cohens_d_result <- cohen.d(data1, data2)
-
-        cat(sprintf("Comparison: %s | %s, %s\n", contrast_name, temporal_do, time))
-        cat(sprintf("  n1 = %d, n2 = %d\n", length(data1), length(data2)))
-        cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-        cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-        cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-        cat("\n")
-      }
-    }
-  }
-} else {
-  cat("No significant pairwise comparisons found in three-way interaction.\n")
-}
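One caveat worth noting: effsize::cohen.d() on two numeric vectors computes an independent-groups d, while DOMAIN is a within-subjects factor here, so the pairing of each participant's two domain scores is ignored. A minimal illustration of the call's return values on simulated vectors (g1, g2, d_toy are hypothetical names):

set.seed(1)
g1 <- rnorm(60, mean = 0.4)  # simulated condition 1
g2 <- rnorm(60, mean = 0.0)  # simulated condition 2
d_toy <- cohen.d(g1, g2)
print(d_toy$estimate)   # standardized mean difference
print(d_toy$magnitude)  # e.g. "small", "medium", "large"

For a pairing-aware alternative, cohen.d() also accepts paired = TRUE when the two vectors are aligned by participant.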
diff --git a/.history/eohi1/mixed anova - domain means_20250915122811.r b/.history/eohi1/mixed anova - domain means_20250915122811.r
deleted file mode 100644
index a7196b2..0000000
--- a/.history/eohi1/mixed anova - domain means_20250915122811.r
+++ /dev/null
diff --git a/.history/eohi1/mixed anova - domain means_20250915122819.r b/.history/eohi1/mixed anova - domain means_20250915122819.r
deleted file mode 100644
index a7196b2..0000000
--- a/.history/eohi1/mixed anova - domain means_20250915122819.r
+++ /dev/null
============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ 
{ - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = 
"MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "DOMAIN")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR 
PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes 
from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. -# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple 
<- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS ===") - -# Extract significant comparisons from three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_pairs <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - temporal_do <- as.character(comparison$TEMPORAL_DO) - time <- as.character(comparison$TIME) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - domain1 <- trimws(contrast_parts[1]) - domain2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - data1 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain1] - - data2 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain2] - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s | %s, %s\n", contrast_name, temporal_do, time)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", 
cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } -} else { - cat("No significant pairwise comparisons found in three-way interaction.\n") -} - diff --git a/.history/eohi1/mixed anova - domain means_20250915122909.r b/.history/eohi1/mixed anova - domain means_20250915122909.r deleted file mode 100644 index c010c5e..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915122909.r +++ /dev/null @@ -1,743 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# 
============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ 
{ - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = 
"MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "DOMAIN")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR 
PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes 
from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. -# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple 
<- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS ===") - -# Extract significant comparisons from three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_pairs <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - temporal_do <- as.character(comparison$TEMPORAL_DO) - time <- as.character(comparison$TIME) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - domain1 <- trimws(contrast_parts[1]) - domain2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - data1 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain1] - - data2 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain2] - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s | %s, %s\n", contrast_name, temporal_do, time)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", 
cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } -} else { - cat("No significant pairwise comparisons found in three-way interaction.\n") -} - diff --git a/.history/eohi1/mixed anova - domain means_20250915122915.r b/.history/eohi1/mixed anova - domain means_20250915122915.r deleted file mode 100644 index c010c5e..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915122915.r +++ /dev/null @@ -1,743 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# 
============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ 
{ - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = 
"MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "DOMAIN")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR 
-# =============================================================================
-# ALTERNATIVE: SPHERICITY CORRECTIONS COMPUTED FROM THE COVARIANCE MATRIX
-# =============================================================================
-
-print("\n=== SPHERICITY CORRECTIONS FROM THE COVARIANCE MATRIX ===")
-
-# Reshape to wide format so each within-subjects cell is a column
-
-tryCatch({
-  # Convert to wide format
-  wide_data <- long_data_clean %>%
-    select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>%
-    pivot_wider(names_from = c(TIME, DOMAIN),
-                values_from = MEAN_DIFFERENCE,
-                names_sep = "_")
-
-  # The eight repeated-measures columns
-  within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life",
-                   "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life")
-
-  # Check if all columns exist
-  missing_cols <- within_vars[!within_vars %in% colnames(wide_data)]
-  if(length(missing_cols) > 0) {
-    print(paste("Missing columns for epsilon calculation:", paste(missing_cols, collapse = ", ")))
-  } else {
-    # Matrix of repeated measures (one row per participant)
-    rm_design <- as.matrix(wide_data[, within_vars])
-
-    # Calculate epsilon values by hand (car does not export an epsilon()
-    # helper): Box's epsilon from the double-centered covariance matrix
-    k <- ncol(rm_design)
-    S <- cov(rm_design, use = "complete.obs")
-    P <- diag(k) - matrix(1 / k, k, k)   # centering projector
-    A <- P %*% S %*% P                   # double-centered covariance
-    epsilon_gg <- sum(diag(A))^2 / ((k - 1) * sum(A * A))
-
-    # Huynh-Feldt epsilon, classic form (N = complete cases, g = number of
-    # between-subjects groups), capped at 1
-    N <- sum(complete.cases(rm_design))
-    g <- length(unique(wide_data$TEMPORAL_DO))
-    epsilon_hf <- min(1, (N * (k - 1) * epsilon_gg - 2) /
-                         ((k - 1) * (N - g - (k - 1) * epsilon_gg)))
-
-    print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4)))
-    print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4)))
-
-    # Interpretation: the usual rule of thumb is Greenhouse-Geisser when the
-    # GG epsilon falls below 0.75 and Huynh-Feldt when it is above
-    if(epsilon_gg < 0.75) {
-      print("Recommendation: Use Greenhouse-Geisser correction (GG epsilon < 0.75)")
-    } else {
-      print("Recommendation: Use Huynh-Feldt correction (GG epsilon >= 0.75)")
-    }
-
-    # =============================================================================
-    # MANUAL SPHERICITY CORRECTIONS
-    # =============================================================================
-
-    print("\n=== MANUAL SPHERICITY CORRECTIONS ===")
-
-    # Apply corrections to the original ANOVA results
-    print("Applying Greenhouse-Geisser corrections to DOMAIN effects:")
-
-    # DOMAIN main effect (DFn = 3, DFd = 3183 in the uncorrected table)
-    domain_df_corrected_gg <- 3 * epsilon_gg
-    domain_df_corrected_hf <- 3 * epsilon_hf
-
-    print("DOMAIN: Original df = 3, 3183")
-    print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2)))
-    print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2)))
-
-    # TIME × DOMAIN interaction (DFn = 3, DFd = 3183)
-    interaction_df_corrected_gg <- 3 * epsilon_gg
-    interaction_df_corrected_hf <- 3 * epsilon_hf
-
-    print("TIME × DOMAIN: Original df = 3, 3183")
-    print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2)))
-    print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2)))
-
-    # Note: corrected p-values come from re-evaluating the F statistics at
-    # these scaled degrees of freedom (a sketch follows below); ezANOVA reports
-    # the same corrections in its `Sphericity Corrections` element
-  }
-
-}, error = function(e) {
-  print(paste("Error in manual epsilon calculation:", e$message))
-})
-
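-# A minimal sketch of that recalculation, assuming the ez_corrected table and
-# epsilon_gg computed above: scale numerator and denominator df by epsilon and
-# take the upper tail of the F distribution (DFn/DFd/F are ezANOVA's columns,
-# and "TIME:DOMAIN" is ezANOVA's name for the interaction)
-within_fx <- subset(ez_corrected$ANOVA, Effect %in% c("DOMAIN", "TIME:DOMAIN"))
-within_fx$p_gg <- pf(within_fx$F, within_fx$DFn * epsilon_gg,
-                     within_fx$DFd * epsilon_gg, lower.tail = FALSE)
-print(within_fx[, c("Effect", "F", "p_gg")])
-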
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-# Effect size calculations (library already loaded)
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Calculate generalized eta squared for the aov model
-print("Effect Sizes from aov() model:")
-tryCatch({
-  # Extract generalized eta squared from the aov model
-  aov_effects <- eta_squared(mixed_anova_model, generalized = TRUE)
-  print(aov_effects, digits = 5)   # round() would fail on the mixed-type table
-}, error = function(e) {
-  print(paste("Error calculating effect sizes from aov:", e$message))
-})
-
-# Extract effect sizes from the ezANOVA model
-print("\nEffect Sizes from ezANOVA model:")
-tryCatch({
-  # ezANOVA already reports generalized eta squared in its ges column
-  ez_effects <- ez_corrected$ANOVA
-  print("Generalized Eta Squared from ezANOVA:")
-  print(ez_effects[, c("Effect", "ges")], digits = 5)
-}, error = function(e) {
-  print(paste("Error extracting effect sizes from ezANOVA:", e$message))
-})
-
-# Keep the full aov summary around for reference
-anova_summary <- summary(mixed_anova_model)
-
-# =============================================================================
-# NOTE: MIXED MODELS (LMER) NOT NEEDED
-# =============================================================================
-
-# For this balanced repeated measures design, Type III ANOVA with proper
-# sphericity corrections (implemented above) is the most appropriate approach.
-# Mixed models (lmer) are typically used for:
-# - Unbalanced designs
-# - Missing data patterns
-# - Nested random effects
-# - Large, complex datasets
-#
-# This design is balanced and complete, making Type III ANOVA optimal;
-# a reference sketch of the lmer alternative follows below.
-
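-# For reference, a minimal sketch of that alternative (left commented out;
-# assumes the lme4/lmerTest packages and uses only a by-participant random
-# intercept, the simplest analogue of the Error(pID/...) term):
-# library(lmerTest)
-# lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + (1 | pID),
-#                    data = long_data_clean)
-# anova(lmer_model)  # F tests with Satterthwaite degrees of freedom
-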
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(mixed_anova_model, ~ TIME)
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN)
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001)
-print("\n=== TEMPORAL_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO:")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each TIME:")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# TIME × DOMAIN Interaction (Significant: p = 0.012)
-print("\n=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TIME:")
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-
-print("\nSimple Effects of TIME within each DOMAIN:")
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(time_domain_simple2)
-
-# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058)
-print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===")
-temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN)
-print("Estimated Marginal Means:")
-print(temporal_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:")
-temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:")
-temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(temporal_domain_simple2)
-
-# =============================================================================
-# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS
-# =============================================================================
-
-# All pairwise comparisons for the three-way interaction
-print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===")
-three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-print("Estimated Marginal Means for all combinations:")
-print(three_way_emmeans)
-
-# Pairwise comparisons within each TEMPORAL_DO × TIME combination
-print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:")
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni")
-print(three_way_contrasts)
-
-# =============================================================================
-# COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS
-# =============================================================================
-
-# Cohen's d calculations (library already loaded)
-
-print("\n=== COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS ===")
-
-# Extract significant comparisons from three-way interaction
-three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-significant_pairs <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ]
-
-if(nrow(significant_pairs) > 0) {
-  cat("Significant pairwise comparisons (p < 0.05):\n")
-  print(significant_pairs)
-
-  cat("\nCohen's d calculated from raw data:\n")
-
-  for(i in seq_len(nrow(significant_pairs))) {
-    comparison <- significant_pairs[i, ]
-    contrast_name <- as.character(comparison$contrast)
-    temporal_do <- as.character(comparison$TEMPORAL_DO)
-    time <- as.character(comparison$TIME)
-
-    # Parse the contrast
-    contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-    if(length(contrast_parts) == 2) {
-      domain1 <- trimws(contrast_parts[1])
-      domain2 <- trimws(contrast_parts[2])
-
-      # Get raw data for both conditions
-      data1 <- long_data_clean$MEAN_DIFFERENCE[
-        long_data_clean$TEMPORAL_DO == temporal_do &
-        long_data_clean$TIME == time &
-        long_data_clean$DOMAIN == domain1]
-
-      data2 <- long_data_clean$MEAN_DIFFERENCE[
-        long_data_clean$TEMPORAL_DO == temporal_do &
-        long_data_clean$TIME == time &
-        long_data_clean$DOMAIN == domain2]
-
-      if(length(data1) > 0 && length(data2) > 0) {
-        # Calculate Cohen's d using effsize package
-        cohens_d_result <- cohen.d(data1, data2)
-
-        cat(sprintf("Comparison: %s | %s, %s\n", contrast_name, temporal_do, time))
-        cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2)))
-        cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate))
-        cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude))
-        cat(sprintf(" p-value: %.5f\n", comparison$p.value))
-        cat("\n")
-      }
-    }
-  }
-} else {
-  cat("No significant pairwise comparisons found in three-way interaction.\n")
-}
-
diff --git a/.history/eohi1/mixed anova - domain means_20250915122919.r b/.history/eohi1/mixed anova - domain means_20250915122919.r
deleted file mode 100644
index c010c5e..0000000
diff --git a/.history/eohi1/mixed anova - domain means_20250915132646.r b/.history/eohi1/mixed anova - domain means_20250915132646.r
deleted file mode 100644
index c010c5e..0000000
--- a/.history/eohi1/mixed anova - domain means_20250915132646.r
+++ /dev/null
@@ -1,743 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex)       # For aov_ez (cleaner ANOVA output)
-library(nortest)    # For normality tests
-library(emmeans)    # For post-hoc comparisons
-library(purrr)      # For map functions
-library(effsize)    # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x")))
-print(paste("Number of participants:", length(unique(data$pID))))
-
-# Verify the specific variables we need
-required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required domain mean variables found!")
-}
-
-# Define domain mapping
-domain_mapping <- data.frame(
-  variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-               "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
-  time = c(rep("Past", 4), rep("Future", 4)),
-  domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
-  stringsAsFactors = FALSE
-)
-
-# Domain mapping created
-
-# Efficient data pivoting using pivot_longer
-long_data <- data %>%
-  select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>%
-  pivot_longer(
-    cols = all_of(required_vars),
-    names_to = "variable",
-    values_to = "MEAN_DIFFERENCE"
-  ) %>%
-  left_join(domain_mapping, by = "variable") %>%
-  # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping)
-  mutate(
-    TIME = factor(time, levels = c("Past", "Future")),
-    DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")),
-    pID = as.factor(pID),
-    TEMPORAL_DO = as.factor(TEMPORAL_DO)
-  ) %>%
-  # Select final columns and remove any rows with missing values
-  select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>%
-  filter(!is.na(MEAN_DIFFERENCE))
-
-print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x")))
-print(paste("Number of participants:", length(unique(long_data$pID))))
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# 
============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ 
{ - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = 
"MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "DOMAIN")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR 
-# =============================================================================
-# ALTERNATIVE: SPHERICITY CORRECTIONS COMPUTED FROM THE COVARIANCE MATRIX
-# =============================================================================
-
-print("\n=== SPHERICITY CORRECTIONS FROM THE COVARIANCE MATRIX ===")
-
-# Reshape to wide format so each within-subjects cell is a column
-
-tryCatch({
-  # Convert to wide format
-  wide_data <- long_data_clean %>%
-    select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>%
-    pivot_wider(names_from = c(TIME, DOMAIN),
-                values_from = MEAN_DIFFERENCE,
-                names_sep = "_")
-
-  # The eight repeated-measures columns
-  within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life",
-                   "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life")
-
-  # Check if all columns exist
-  missing_cols <- within_vars[!within_vars %in% colnames(wide_data)]
-  if(length(missing_cols) > 0) {
-    print(paste("Missing columns for epsilon calculation:", paste(missing_cols, collapse = ", ")))
-  } else {
-    # Matrix of repeated measures (one row per participant)
-    rm_design <- as.matrix(wide_data[, within_vars])
-
-    # Calculate epsilon values by hand (car does not export an epsilon()
-    # helper): Box's epsilon from the double-centered covariance matrix
-    k <- ncol(rm_design)
-    S <- cov(rm_design, use = "complete.obs")
-    P <- diag(k) - matrix(1 / k, k, k)   # centering projector
-    A <- P %*% S %*% P                   # double-centered covariance
-    epsilon_gg <- sum(diag(A))^2 / ((k - 1) * sum(A * A))
-
-    # Huynh-Feldt epsilon, classic form (N = complete cases, g = number of
-    # between-subjects groups), capped at 1
-    N <- sum(complete.cases(rm_design))
-    g <- length(unique(wide_data$TEMPORAL_DO))
-    epsilon_hf <- min(1, (N * (k - 1) * epsilon_gg - 2) /
-                         ((k - 1) * (N - g - (k - 1) * epsilon_gg)))
-
-    print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4)))
-    print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4)))
-
-    # Interpretation: the usual rule of thumb is Greenhouse-Geisser when the
-    # GG epsilon falls below 0.75 and Huynh-Feldt when it is above
-    if(epsilon_gg < 0.75) {
-      print("Recommendation: Use Greenhouse-Geisser correction (GG epsilon < 0.75)")
-    } else {
-      print("Recommendation: Use Huynh-Feldt correction (GG epsilon >= 0.75)")
-    }
-
-    # =============================================================================
-    # MANUAL SPHERICITY CORRECTIONS
-    # =============================================================================
-
-    print("\n=== MANUAL SPHERICITY CORRECTIONS ===")
-
-    # Apply corrections to the original ANOVA results
-    print("Applying Greenhouse-Geisser corrections to DOMAIN effects:")
-
-    # DOMAIN main effect (DFn = 3, DFd = 3183 in the uncorrected table)
-    domain_df_corrected_gg <- 3 * epsilon_gg
-    domain_df_corrected_hf <- 3 * epsilon_hf
-
-    print("DOMAIN: Original df = 3, 3183")
-    print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2)))
-    print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2)))
-
-    # TIME × DOMAIN interaction (DFn = 3, DFd = 3183)
-    interaction_df_corrected_gg <- 3 * epsilon_gg
-    interaction_df_corrected_hf <- 3 * epsilon_hf
-
-    print("TIME × DOMAIN: Original df = 3, 3183")
-    print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2)))
-    print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2)))
-
-    # Note: corrected p-values come from re-evaluating the F statistics at
-    # these scaled degrees of freedom; ezANOVA reports the same GG/HF-corrected
-    # p-values in its `Sphericity Corrections` element
-  }
-
-}, error = function(e) {
-  print(paste("Error in manual epsilon calculation:", e$message))
-})
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-# Effect size calculations (library already loaded)
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Calculate generalized eta squared for the aov model
from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. -# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple 
<- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT PAIRWISE COMPARISONS ===") - -# Extract significant comparisons from three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_pairs <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - temporal_do <- as.character(comparison$TEMPORAL_DO) - time <- as.character(comparison$TIME) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - domain1 <- trimws(contrast_parts[1]) - domain2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - data1 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain1] - - data2 <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do & - long_data_clean$TIME == time & - long_data_clean$DOMAIN == domain2] - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s | %s, %s\n", contrast_name, temporal_do, time)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", 
cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } -} else { - cat("No significant pairwise comparisons found in three-way interaction.\n") -} - diff --git a/.history/eohi1/mixed anova - domain means_20250915133137.r b/.history/eohi1/mixed anova - domain means_20250915133137.r deleted file mode 100644 index 22df503..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915133137.r +++ /dev/null @@ -1,791 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# 
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME and DOMAIN
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5),
-    min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME and DOMAIN:")
-print(desc_stats)
-
-# Descriptive statistics by between-subjects factors
-desc_stats_by_temporal <- long_data %>%
-  group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:")
-print(desc_stats_by_temporal)
-
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing
-long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ]
-print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x")))
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(MEAN_DIFFERENCE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME and DOMAIN:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test (streamlined)
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic,
-    ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# Round only the numeric columns
-normality_results_rounded <- normality_results %>%
-  mutate(across(where(is.numeric), ~ round(.x, 5)))
-print(normality_results_rounded)
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-# First, let's check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # Calculate F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# More efficient bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!sym(group_var)) %>%
-    dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once
-  group_data_list <- map(groups, ~ {
-    group_data <- data[data[[group_var]] == .x, response_var]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-print("\n=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check for missing data patterns
-table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany")
-
-# Check data balance
-xtabs(~ pID + TIME + DOMAIN, data = long_data_clean)
-
-# Check data dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$pID))))
-print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME))))
-print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN))))
-print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO))))
-
-# Check for complete cases
-complete_cases <- long_data_clean[complete.cases(long_data_clean), ]
-print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean)))
-
-# Check if design is balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-
-print(summary(as.vector(design_balance)))
-
-# Check for any participants with missing combinations
-missing_combos <- long_data_clean %>%
-  group_by(pID) %>%
-  summarise(
-    n_combinations = n(),
-    expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8
-    missing_combinations = 8 - n_combinations,
-    .groups = 'drop'
-  )
-
-print("Missing combinations per participant:")
-print(missing_combos[missing_combos$missing_combinations > 0, ])
-
-# Mixed ANOVA using aov() - Traditional approach
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-
-mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                         data = long_data_clean)
-
-print("Mixed ANOVA Results (aov):")
-print(summary(mixed_anova_model))
-
-# Alternative: Using afex::aov_ez for cleaner output (optional)
-print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===")
-mixed_anova_afex <- aov_ez(id = "pID",
-                           dv = "MEAN_DIFFERENCE",
-                           data = long_data_clean,
-                           between = "TEMPORAL_DO",
-                           within = c("TIME", "DOMAIN"))
-
-print("Mixed ANOVA Results (afex):")
-print(mixed_anova_afex)
-
-# =============================================================================
-# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS
-# =============================================================================
-
-# Sphericity tests using ezANOVA (library already loaded)
-
-print("\n=== SPHERICITY TESTS ===")
-
-# Test sphericity for DOMAIN (4 levels - within-subjects)
-print("Mauchly's Test of Sphericity for DOMAIN:")
-tryCatch({
-  # Create a temporary data frame for ezANOVA
-  temp_data <- long_data_clean
-  temp_data$id <- as.numeric(as.factor(temp_data$pID))
-
-  # Run ezANOVA to get sphericity tests
-  ez_domain <- ezANOVA(data = temp_data,
-                       dv = MEAN_DIFFERENCE,
-                       wid = id,
-                       between = TEMPORAL_DO,
-                       within = DOMAIN,
-                       type = 3,
-                       detailed = TRUE)
-
-  print("DOMAIN Sphericity Test:")
-  print(ez_domain$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in DOMAIN sphericity test:", e$message))
-})
-
-# Test sphericity for TIME (2 levels - within-subjects)
-print("\nMauchly's Test of Sphericity for TIME:")
-tryCatch({
-  ez_time <- ezANOVA(data = temp_data,
-                     dv = MEAN_DIFFERENCE,
-                     wid = id,
-                     between = TEMPORAL_DO,
-                     within = TIME,
-                     type = 3,
-                     detailed = TRUE)
-
-  print("TIME Sphericity Test:")
-  print(ez_time$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in TIME sphericity test:", e$message))
-})
-
-# Test sphericity for TIME × DOMAIN interaction
-print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:")
-tryCatch({
-  ez_interaction <- ezANOVA(data = temp_data,
-                            dv = MEAN_DIFFERENCE,
-                            wid = id,
-                            between = TEMPORAL_DO,
-                            within = .(TIME, DOMAIN),
-                            type = 3,
-                            detailed = TRUE)
-
-  print("TIME × DOMAIN Sphericity Test:")
-  print(ez_interaction$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in TIME × DOMAIN sphericity test:", e$message))
-})
-
-# =============================================================================
-# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Get corrected results from ezANOVA
-ez_corrected <- ezANOVA(data = long_data_clean,
-                        dv = MEAN_DIFFERENCE,
-                        wid = pID,
-                        between = TEMPORAL_DO,
-                        within = .(TIME, DOMAIN),
-                        type = 3,
-                        detailed = TRUE)
-
-print("Corrected ANOVA Results with Sphericity Corrections:")
-print(ez_corrected$ANOVA)
-
-# Show epsilon values for sphericity corrections
-print("\nEpsilon Values for Sphericity Corrections:")
-print(ez_corrected$Mauchly)
-
-# Show sphericity-corrected results
-print("\nSphericity-Corrected Results:")
-print("Available elements in ez_corrected object:")
-print(names(ez_corrected))
-
-# Check if sphericity corrections are available
-if(!is.null(ez_corrected$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(ez_corrected$`Sphericity Corrections`)
-} else {
-  print("\nNote: Sphericity corrections may not be displayed if sphericity is met")
-  print("Check the Mauchly's test p-values above to determine if corrections are needed")
-}
-
-# =============================================================================
-# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE
-# =============================================================================
-
PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes 
from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. -# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple 
<- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = 
%d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. 
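For reference on what the cohen.d() calls above compute: effsize::cohen.d with its defaults for two independent samples uses the pooled standard deviation, i.e. (plain-text form)

    d = (mean1 - mean2) / s_pooled
    s_pooled = sqrt(((n1 - 1) * s1^2 + (n2 - 1) * s2^2) / (n1 + n2 - 2))
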
-# =============================================================================
-# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===")
-
-# Get simple effects of TEMPORAL_DO within each DOMAIN
-temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2)
-calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE")
-
-# Get simple effects of DOMAIN within each TEMPORAL_DO
-temporal_domain_simple_df <- as.data.frame(temporal_domain_simple)
-calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
diff --git a/.history/eohi1/mixed anova - domain means_20250915133143.r b/.history/eohi1/mixed anova - domain means_20250915133143.r
deleted file mode 100644
index 22df503..0000000
--- a/.history/eohi1/mixed anova - domain means_20250915133143.r
+++ /dev/null
@@ -1,791 +0,0 @@
columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ 
{ - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = 
"MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "DOMAIN")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR 
PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes 
from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. -# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple 
-# TIME × DOMAIN Interaction (Significant: p = 0.012)
-print("\n=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TIME:")
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-
-print("\nSimple Effects of TIME within each DOMAIN:")
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(time_domain_simple2)
-
-# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058)
-print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===")
-temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN)
-print("Estimated Marginal Means:")
-print(temporal_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:")
-temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:")
-temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(temporal_domain_simple2)
-
-# =============================================================================
-# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS
-# =============================================================================
-
-# All pairwise comparisons for the three-way interaction
-print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===")
-three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-print("Estimated Marginal Means for all combinations:")
-print(three_way_emmeans)
-
-# Pairwise comparisons within each TEMPORAL_DO × TIME combination
-print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:")
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni")
-print(three_way_contrasts)
-
-# =============================================================================
-# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS
-# =============================================================================
-
-# Cohen's d calculations (library already loaded)
-
-print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===")
-
-# Function to calculate Cohen's d for pairwise comparisons
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if(nrow(significant_pairs) > 0) {
-    cat("Significant pairwise comparisons (p < 0.05):\n")
-    print(significant_pairs)
-
-    cat("\nCohen's d calculated from raw data:\n")
-
-    for(i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- as.character(comparison$contrast)
-
-      # Parse the contrast
-      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-      if(length(contrast_parts) == 2) {
-        level1 <- trimws(contrast_parts[1])
-        level2 <- trimws(contrast_parts[2])
-
-        # Get raw data for both conditions
-        if(group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][
-            data[[group1_var]] == level1 &
-            data[[group2_var]] == group2_level]
-
-          data2 <- data[[response_var]][
-            data[[group1_var]] == level2 &
-            data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if(length(data1) > 0 && length(data2) > 0) {
-          # Calculate Cohen's d using effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          cat(sprintf("Comparison: %s", contrast_name))
-          if(group2_var %in% colnames(comparison)) {
-            cat(sprintf(" | %s", group2_level))
-          }
-          cat(sprintf("\n  n1 = 
%d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - diff --git a/.history/eohi1/mixed anova - domain means_20250915133150.r b/.history/eohi1/mixed anova - domain means_20250915133150.r deleted file mode 100644 index 22df503..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915133150.r +++ /dev/null @@ -1,791 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final 
columns and remove any rows with missing values
-  select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>%
-  filter(!is.na(MEAN_DIFFERENCE))
-
-print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x ")))
-print(paste("Number of participants:", length(unique(long_data$pID))))
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME and DOMAIN
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5),
-    min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME and DOMAIN:")
-print(desc_stats)
-
-# Descriptive statistics by between-subjects factors
-desc_stats_by_temporal <- long_data %>%
-  group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:")
-print(desc_stats_by_temporal)
-
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Guard against missing values for assumption testing. Note that long_data was
-# already filtered with filter(!is.na(MEAN_DIFFERENCE)) above, so this step and
-# the missing-values summary below serve as confirmation rather than cleaning.
-long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ]
-print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x ")))
-
-# 1. Missing values check (expected to show zero missing given the upstream filter)
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(MEAN_DIFFERENCE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME and DOMAIN:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test (streamlined)
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic,
-    ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# Round only the numeric columns
-normality_results_rounded <- normality_results %>%
-  mutate(across(where(is.numeric), ~ round(.x, 5)))
-print(normality_results_rounded)
-
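# Supplementary visual check, not in the original script: with cell sizes this
# large the Anderson-Darling test flags even trivial departures from normality,
# so per-cell QQ plots are a useful complement (ggplot2 is attached via tidyverse).
qq_plot <- ggplot(long_data_clean, aes(sample = MEAN_DIFFERENCE)) +
  stat_qq(alpha = 0.3) +           # sample quantiles vs. theoretical normal quantiles
  stat_qq_line(colour = "red") +   # reference line through the quartiles
  facet_grid(TIME ~ DOMAIN)        # one panel per TIME × DOMAIN cell
print(qq_plot)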
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: test homogeneity across the BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-# First, check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# Bootstrap function for Hartley's F-max critical values
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!sym(group_var)) %>%
-    dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Extract each group's response once, as a plain numeric vector
-  # (subsetting a tibble with [rows, column] returns a one-column tibble,
-  # which sample() and length() below would not handle as intended)
-  group_data_list <- map(groups, ~ {
-    group_data <- data[[response_var]][data[[group_var]] == .x]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ 
{ - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = 
"MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "DOMAIN")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR 
PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes 
from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. -# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple 
<- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = 
%d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - diff --git a/.history/eohi1/mixed anova - domain means_20250915133657.r b/.history/eohi1/mixed anova - domain means_20250915133657.r deleted file mode 100644 index 22df503..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915133657.r +++ /dev/null @@ -1,791 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final 
columns and remove any rows with missing values
-  select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>%
-  filter(!is.na(MEAN_DIFFERENCE))
-
-print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x ")))
-print(paste("Number of participants:", length(unique(long_data$pID))))
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME and DOMAIN
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5),
-    min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME and DOMAIN:")
-print(desc_stats)
-
-# Descriptive statistics by between-subjects factors
-desc_stats_by_temporal <- long_data %>%
-  group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:")
-print(desc_stats_by_temporal)
-
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Guard against missing values for assumption testing. Note that long_data was
-# already filtered with filter(!is.na(MEAN_DIFFERENCE)) above, so this step and
-# the missing-values summary below serve as confirmation rather than cleaning.
-long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ]
-print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x ")))
-
-# 1. Missing values check (expected to show zero missing given the upstream filter)
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(MEAN_DIFFERENCE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME and DOMAIN:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test (streamlined)
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic,
-    ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# Round only the numeric columns
-normality_results_rounded <- normality_results %>%
-  mutate(across(where(is.numeric), ~ round(.x, 5)))
-print(normality_results_rounded)
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: test homogeneity across the BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-# First, check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# Bootstrap function for Hartley's F-max critical values
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!sym(group_var)) %>%
-    dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Extract each group's response once, as a plain numeric vector
-  # (subsetting a tibble with [rows, column] returns a one-column tibble,
-  # which sample() and length() below would not handle as intended)
-  group_data_list <- map(groups, ~ {
-    group_data <- data[[response_var]][data[[group_var]] == .x]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ 
{ - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = 
"MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "DOMAIN")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR 
PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes 
from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. -# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple 
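Alongside the contrast tables in this block, a profile plot of the estimated marginal means can make the interaction pattern easier to see. A sketch using emmeans::emmip on the same fitted model (ggplot2 ships with tidyverse):

# Interaction profile: one line per TIME level across the four domains
emmip(mixed_anova_model, TIME ~ DOMAIN, CIs = TRUE)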
<- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = 
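For reference, the estimate effsize::cohen.d returns here (with its defaults) is the pooled-SD d. A self-contained sketch of that formula, useful as a cross-check:

# Cohen's d with pooled standard deviation
cohens_d_pooled <- function(x, y) {
  nx <- length(x); ny <- length(y)
  sp <- sqrt(((nx - 1) * var(x) + (ny - 1) * var(y)) / (nx + ny - 2))
  (mean(x) - mean(y)) / sp
}
# cohens_d_pooled(rnorm(30, mean = 0.3), rnorm(30))   # toy check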
%d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. 
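One caveat before the next comparison set: the helper above always computes a two-sample d from raw vectors, but for the within-subject TIME contrasts a paired effect size based on difference scores may be more appropriate. A hedged sketch on the same long data:

# Paired (within-subject) effect size: d_z = mean(diff) / sd(diff)
d_z_for_domain <- function(d, domain) {
  w <- d %>%
    filter(DOMAIN == domain) %>%
    select(pID, TIME, MEAN_DIFFERENCE) %>%
    pivot_wider(names_from = TIME, values_from = MEAN_DIFFERENCE)
  diffs <- w$Past - w$Future
  mean(diffs, na.rm = TRUE) / sd(diffs, na.rm = TRUE)
}
# d_z_for_domain(long_data_clean, "Values")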
TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - diff --git a/.history/eohi1/mixed anova - domain means_20250915133739.r b/.history/eohi1/mixed anova - domain means_20250915133739.r deleted file mode 100644 index 22df503..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915133739.r +++ /dev/null @@ -1,791 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final 
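To illustrate the reshaping step above on something small: the same pivot_longer pattern applied to a two-participant toy frame (toy values are hypothetical).

toy <- data.frame(pID = 1:2,
                  NPast_mean_pref = c(0.10, 0.20),
                  NFut_mean_pref  = c(0.30, 0.40))
toy %>%
  pivot_longer(-pID, names_to = "variable", values_to = "MEAN_DIFFERENCE")
# each pID now contributes one row per measured variable, ready for the join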
columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
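The Levene tests below are called inside summarise() and lean on dplyr's data mask to find the variables. An equivalent, more explicit call for a single cell passes the data argument directly (a sketch on one DOMAIN):

# Levene's test for TIME within the Values domain, with an explicit data argument
lev <- leveneTest(MEAN_DIFFERENCE ~ TIME,
                  data = subset(long_data_clean, DOMAIN == "Values"))
lev$`F value`[1]   # F statistic
lev$`Pr(>F)`[1]    # p-value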
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ 
{ - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = 
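Stepping back to the bootstrap helper defined above: a quick self-contained check on simulated groups with unequal spread, using fewer iterations than the default.

set.seed(42)
toy <- data.frame(g = rep(c("A", "B"), each = 50),
                  y = c(rnorm(50, sd = 1), rnorm(50, sd = 2)))
res <- bootstrap_hartley_critical(toy, "g", "y", n_iter = 500)
res$observed_ratio                      # sample F-max ratio
res$critical_95                         # bootstrap 95th-percentile cutoff
res$observed_ratio > res$critical_95    # the flag the script reports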
"MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "DOMAIN")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR 
PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes 
from aov() model:")
-tryCatch({
-  # Extract effect sizes from the aov model
-  aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE)
-  # round only the numeric columns; round() on the full table fails on the label column
-  aov_effects %>%
-    mutate(across(where(is.numeric), ~ round(.x, 5))) %>%
-    print()
-}, error = function(e) {
-  print(paste("Error calculating effect sizes from aov:", e$message))
-})
-
-# Calculate effect sizes for ezANOVA model
-print("\nEffect Sizes from ezANOVA model:")
-tryCatch({
-  # ezANOVA reports generalized eta squared directly in the "ges" column
-  ez_effects <- ez_corrected$ANOVA
-  ez_effects$ges <- round(ez_effects$ges, 5)
-  print("Generalized Eta Squared from ezANOVA:")
-  print(ez_effects[, c("Effect", "ges")])
-}, error = function(e) {
-  print(paste("Error extracting effect sizes from ezANOVA:", e$message))
-})
-
-# Extract effect sizes (generalized eta squared)
-# For aov() objects, we need to extract from the summary
-anova_summary <- summary(mixed_anova_model)
-
-# =============================================================================
-# NOTE: MIXED MODELS (LMER) NOT NEEDED
-# =============================================================================
-
-# For this balanced repeated measures design, Type III ANOVA with proper
-# sphericity corrections (implemented above) is the most appropriate approach.
-# Mixed models (lmer) are typically used for:
-# - Unbalanced designs
-# - Missing data patterns
-# - Nested random effects
-# - Large, complex datasets
-#
-# Your design is balanced and complete, making Type III ANOVA optimal.
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(mixed_anova_model, ~ TIME)
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN)
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001)
-print("\n=== TEMPORAL_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO:")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each TIME:")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# TIME × DOMAIN Interaction (Significant: p = 0.012)
-print("\n=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TIME:")
-time_domain_simple
<- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = 
%d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. 
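For reading the magnitude labels printed by this helper: effsize's qualitative bands follow, to my knowledge, Cohen's (1988) conventions, which can be reproduced as:

# Map |d| onto the conventional Cohen (1988) bands
d_label <- function(d) {
  cut(abs(d), breaks = c(0, 0.2, 0.5, 0.8, Inf),
      labels = c("negligible", "small", "medium", "large"),
      include.lowest = TRUE)
}
# d_label(c(0.1, 0.35, 0.6, 1.2))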
TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - diff --git a/.history/eohi1/mixed anova - domain means_20250915133916.r b/.history/eohi1/mixed anova - domain means_20250915133916.r deleted file mode 100644 index 22df503..0000000 --- a/.history/eohi1/mixed anova - domain means_20250915133916.r +++ /dev/null @@ -1,791 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final 
columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ 
{ - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = 
"MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "DOMAIN")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR 
PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes 
from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. -# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple 
<- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = 
%d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - diff --git a/.history/eohi1/mixed anova - domain means_20250916095306.r b/.history/eohi1/mixed anova - domain means_20250916095306.r deleted file mode 100644 index 22df503..0000000 --- a/.history/eohi1/mixed anova - domain means_20250916095306.r +++ /dev/null @@ -1,791 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final 
columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ 
{ - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = 
"MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "DOMAIN")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR 
PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes 
from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. -# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple 
<- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = 
%d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - diff --git a/.history/eohi1/mixed anova - domain means_20250916100623.r b/.history/eohi1/mixed anova - domain means_20250916100623.r deleted file mode 100644 index a5cb647..0000000 --- a/.history/eohi1/mixed anova - domain means_20250916100623.r +++ /dev/null @@ -1,830 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final 
columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ 
{ - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = 
"MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "DOMAIN")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - # Get the sphericity corrections - sphericity_corr <- ez_corrected$`Sphericity Corrections` - - # Extract original degrees of freedom from ANOVA table - anova_table <- ez_corrected$ANOVA - - # Calculate corrected degrees of freedom - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = 
anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Also show the corrected F-values and p-values with degrees of freedom - cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n") - for(i in 1:nrow(corrected_df)) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # 
=============================================================================
-
- print("\n=== MANUAL SPHERICITY CORRECTIONS ===")
-
- # Apply corrections to the original ANOVA results
- print("Applying Greenhouse-Geisser corrections to DOMAIN effects:")
-
- # DOMAIN main effect (DFn = 3, DFd = 3183)
- domain_df_corrected_gg <- 3 * epsilon_gg
- domain_df_corrected_hf <- 3 * epsilon_hf
-
- print(paste("DOMAIN: Original df = 3, 3183"))
- print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2)))
- print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2)))
-
- # TIME × DOMAIN interaction (DFn = 3, DFd = 3183)
- interaction_df_corrected_gg <- 3 * epsilon_gg
- interaction_df_corrected_hf <- 3 * epsilon_hf
-
- print(paste("TIME × DOMAIN: Original df = 3, 3183"))
- print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2)))
- print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2)))
-
- # Note: You would need to recalculate p-values with these corrected dfs
- print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom")
- print("The ezANOVA function should handle this automatically, but may not display the corrections")
- }
-
-}, error = function(e) {
- print(paste("Error in manual epsilon calculation:", e$message))
-})
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-# Effect size calculations (library already loaded)
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Calculate generalized eta squared for the aov model
-print("Effect Sizes from aov() model:")
-tryCatch({
- # Extract effect sizes from aov model
- aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE)
- print(round(aov_effects, 5))
-}, error = function(e) {
- print(paste("Error calculating effect sizes from aov:", e$message))
-})
-
-# Calculate effect sizes for ezANOVA model
-print("\nEffect Sizes from ezANOVA model:")
-tryCatch({
- # ezANOVA provides partial eta squared, convert to generalized
- ez_effects <- ez_corrected$ANOVA
- ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared
- print("Generalized Eta Squared from ezANOVA:")
- print(round(ez_effects[, c("Effect", "ges")], 5))
-}, error = function(e) {
- print(paste("Error extracting effect sizes from ezANOVA:", e$message))
-})
-
-# Extract effect sizes (generalized eta squared)
-# For aov() objects, we need to extract from the summary
-anova_summary <- summary(mixed_anova_model)
-
-# =============================================================================
-# NOTE: MIXED MODELS (LMER) NOT NEEDED
-# =============================================================================
-
-# For this balanced repeated measures design, Type III ANOVA with proper
-# sphericity corrections (implemented above) is the most appropriate approach.
-# Mixed models (lmer) are typically used for:
-# - Unbalanced designs
-# - Missing data patterns
-# - Nested random effects
-# - Large, complex datasets
-#
-# Your design is balanced and complete, making Type III ANOVA optimal.
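
The manual-correction block above stops one step short of usable p-values. As a minimal sketch of that missing step (the helper name and example numbers are illustrative, not values from this script): once an F statistic, its original degrees of freedom, and an epsilon estimate are in hand, the corrected p-value follows from base R's pf() with both degrees of freedom scaled by epsilon.

# Sketch: p-value for a sphericity-corrected F-test.
# f_value, df_num, df_den come from the uncorrected ANOVA table;
# epsilon is a Greenhouse-Geisser or Huynh-Feldt estimate.
corrected_p <- function(f_value, df_num, df_den, epsilon) {
  pf(f_value, df_num * epsilon, df_den * epsilon, lower.tail = FALSE)
}
# e.g. corrected_p(4.2, 3, 3183, 0.91)  # hypothetical inputs
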
- -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of 
DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. 
TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - diff --git a/.history/eohi1/mixed anova - domain means_20250916100629.r b/.history/eohi1/mixed anova - domain means_20250916100629.r deleted file mode 100644 index a5cb647..0000000 --- a/.history/eohi1/mixed anova - domain means_20250916100629.r +++ /dev/null @@ -1,830 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", 
"Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- 
function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - 
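# In a complete 2 TIME × 4 DOMAIN design, every pID × TIME × DOMAIN cell of
# the table above should equal 1, so the summary printed next should show
# Min. = Max. = 1; any 0 cells correspond to the missing combinations
# enumerated just below. A compact equivalent check (a sketch, assuming dplyr
# is attached via tidyverse):
#   long_data_clean %>% count(pID) %>% filter(n != 8)  # expect zero rows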
-print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = "MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "DOMAIN")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results 
with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - # Get the sphericity corrections - sphericity_corr <- ez_corrected$`Sphericity Corrections` - - # Extract original degrees of freedom from ANOVA table - anova_table <- ez_corrected$ANOVA - - # Calculate corrected degrees of freedom - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Also show the corrected F-values and p-values with degrees of freedom - cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n") - for(i in 1:nrow(corrected_df)) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for 
car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - 
print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. -# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") 
-print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. 
TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - diff --git a/.history/eohi1/mixed anova - domain means_20250916100739.r b/.history/eohi1/mixed anova - domain means_20250916100739.r deleted file mode 100644 index a5cb647..0000000 --- a/.history/eohi1/mixed anova - domain means_20250916100739.r +++ /dev/null @@ -1,830 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# 
Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. 
Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = 
TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for 
complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = "MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "DOMAIN")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== 
CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - # Get the sphericity corrections - sphericity_corr <- ez_corrected$`Sphericity Corrections` - - # Extract original degrees of freedom from ANOVA table - anova_table <- ez_corrected$ANOVA - - # Calculate corrected degrees of freedom - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Also show the corrected F-values and p-values with degrees of freedom - cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n") - for(i in 1:nrow(corrected_df)) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- 
c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life",
-                   "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life")
-
-  # Check if all columns exist
-  missing_cols <- within_vars[!within_vars %in% colnames(wide_data)]
-  if(length(missing_cols) > 0) {
-    print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", ")))
-  } else {
-    # Create the repeated measures design
-    rm_design <- as.matrix(wide_data[, within_vars])
-
-    # car does not export an epsilon() helper; instead, fit a multivariate
-    # linear model and let car::Anova() compute the Greenhouse-Geisser and
-    # Huynh-Feldt epsilons. The idata rows must follow the column order of
-    # rm_design (Past then Future, four domains within each)
-    idata <- data.frame(
-      TIME = factor(rep(c("Past", "Future"), each = 4), levels = c("Past", "Future")),
-      DOMAIN = factor(rep(c("Preferences", "Personality", "Values", "Life"), 2),
-                      levels = c("Preferences", "Personality", "Values", "Life"))
-    )
-    mlm_fit <- lm(rm_design ~ TEMPORAL_DO, data = wide_data)
-    car_anova <- Anova(mlm_fit, idata = idata, idesign = ~ TIME * DOMAIN, type = 3)
-    car_summary <- summary(car_anova, multivariate = FALSE)
-
-    # pval.adjustments holds one row per multi-df within-subjects effect, with
-    # "GG eps" and "HF eps" columns (labels as in car >= 3.0; adjust the row
-    # and column names if your car version labels them differently)
-    eps_table <- car_summary$pval.adjustments
-    print("Epsilon Values from car package:")
-    print(eps_table)
-
-    epsilon_gg <- eps_table["DOMAIN", "GG eps"]
-    epsilon_hf <- eps_table["DOMAIN", "HF eps"]
-
-    print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4)))
-    print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4)))
-
-    # Interpretation
-    if(epsilon_gg < 0.75) {
-      print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)")
-    } else if(epsilon_hf > 0.75) {
-      print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)")
-    } else {
-      print("Recommendation: Use Greenhouse-Geisser correction (conservative)")
-    }
-
-    # =============================================================================
-    # MANUAL SPHERICITY CORRECTIONS
-    # =============================================================================
-
-    print("\n=== MANUAL SPHERICITY CORRECTIONS ===")
-
-    # Apply corrections to the original ANOVA results
-    print("Applying Greenhouse-Geisser corrections to DOMAIN effects:")
-
-    # DOMAIN main effect (DFn = 3, DFd = 3183)
-    domain_df_corrected_gg <- 3 * epsilon_gg
-    domain_df_corrected_hf <- 3 * epsilon_hf
-
-    print(paste("DOMAIN: Original df = 3, 3183"))
-    print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2)))
-    print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2)))
-
-    # TIME × DOMAIN interaction (DFn = 3, DFd = 3183)
-    interaction_df_corrected_gg <- 3 * epsilon_gg
-    interaction_df_corrected_hf <- 3 * epsilon_hf
-
-    print(paste("TIME × DOMAIN: Original df = 3, 3183"))
-    print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2)))
-    print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2)))
-
-    # Note: You would need to recalculate p-values with these corrected dfs
-    print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom")
-    print("The ezANOVA function should handle this automatically, but may not display the corrections")
-  }
-
-}, error = function(e) {
-  print(paste("Error in manual epsilon calculation:", e$message))
-})
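-# Following the note above: a sketch of how a corrected p-value would be
-# recomputed by hand with base R's pf(), using placeholder numbers
-# (F = 2.5 on 3 and 3183 df, epsilon = 0.9) rather than values from this
-# dataset; 'f_illustrative', 'eps_illustrative', and 'p_gg_manual' are
-# illustrative names only.
-f_illustrative <- 2.5
-eps_illustrative <- 0.9
-p_gg_manual <- pf(f_illustrative,
-                  df1 = 3 * eps_illustrative,
-                  df2 = 3183 * eps_illustrative,
-                  lower.tail = FALSE)
-print(p_gg_manual)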
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-# Effect size calculations (library already loaded)
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Calculate generalized eta squared for the aov model
-print("Effect Sizes from aov() model:")
-tryCatch({
-  # Extract generalized eta squared from the aov model (requesting partial
-  # and generalized at the same time is contradictory, so only generalized
-  # is asked for)
-  aov_effects <- eta_squared(mixed_anova_model, generalized = TRUE)
-  # round() would fail on the character columns, so round numeric columns only
-  aov_effects %>%
-    mutate(across(where(is.numeric), ~ round(.x, 5))) %>%
-    print()
-}, error = function(e) {
-  print(paste("Error calculating effect sizes from aov:", e$message))
-})
-
-# Calculate effect sizes for ezANOVA model
-print("\nEffect Sizes from ezANOVA model:")
-tryCatch({
-  # The detailed ezANOVA table already reports generalized eta squared in its
-  # 'ges' column, so no conversion is needed
-  ez_effects <- ez_corrected$ANOVA
-  ez_effects$ges <- round(ez_effects$ges, 5)
-  print("Generalized Eta Squared from ezANOVA:")
-  print(ez_effects[, c("Effect", "ges")])
-}, error = function(e) {
-  print(paste("Error extracting effect sizes from ezANOVA:", e$message))
-})
-
-# Extract effect sizes (generalized eta squared)
-# For aov() objects, we need to extract from the summary
-anova_summary <- summary(mixed_anova_model)
-
-# =============================================================================
-# NOTE: MIXED MODELS (LMER) NOT NEEDED
-# =============================================================================
-
-# For this balanced repeated measures design, Type III ANOVA with proper
-# sphericity corrections (implemented above) is the most appropriate approach.
-# Mixed models (lmer) are typically used for:
-# - Unbalanced designs
-# - Missing data patterns
-# - Nested random effects
-# - Large, complex datasets
-#
-# Your design is balanced and complete, making Type III ANOVA optimal.
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(mixed_anova_model, ~ TIME)
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN)
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001)
-print("\n=== TEMPORAL_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO:")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each TIME:")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# TIME × DOMAIN Interaction (Significant: p = 0.012)
-print("\n=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TIME:")
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-
-print("\nSimple Effects of TIME within each DOMAIN:")
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(time_domain_simple2)
-
-# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058)
-print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===")
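-# Before the pairwise breakdown below, a visual check of this marginal
-# interaction can help; emmip() from emmeans plots the estimated marginal
-# means (ggplot2 is loaded with tidyverse). A sketch added for illustration,
-# not part of the original output:
-print(emmip(mixed_anova_model, TEMPORAL_DO ~ DOMAIN, CIs = TRUE))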
-temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 
============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - diff --git a/.history/eohi1/mixed anova - domain means_20250916104746.r b/.history/eohi1/mixed anova - domain means_20250916104746.r deleted file mode 100644 index 851d5c4..0000000 --- a/.history/eohi1/mixed anova - domain means_20250916104746.r +++ /dev/null @@ -1,836 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = 
" x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing 
missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = 
TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO 
levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = "MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "DOMAIN")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# 
============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - # Get the sphericity corrections - sphericity_corr <- ez_corrected$`Sphericity Corrections` - - # Extract original degrees of freedom from ANOVA table - anova_table <- ez_corrected$ANOVA - - # Calculate corrected degrees of freedom - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Also show the corrected F-values and p-values with degrees of freedom - cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n") - for(i in 1:nrow(corrected_df)) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = 
MEAN_DIFFERENCE,
-                names_sep = "_")
-
-  # Create the repeated measures design
-  within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life",
-                   "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life")
-
-  # Check if all columns exist
-  missing_cols <- within_vars[!within_vars %in% colnames(wide_data)]
-  if(length(missing_cols) > 0) {
-    print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", ")))
-  } else {
-    # Create the repeated measures design
-    rm_design <- as.matrix(wide_data[, within_vars])
-
-    # car does not export an epsilon() helper; instead, fit a multivariate
-    # linear model and let car::Anova() compute the Greenhouse-Geisser and
-    # Huynh-Feldt epsilons. The idata rows must follow the column order of
-    # rm_design (Past then Future, four domains within each)
-    idata <- data.frame(
-      TIME = factor(rep(c("Past", "Future"), each = 4), levels = c("Past", "Future")),
-      DOMAIN = factor(rep(c("Preferences", "Personality", "Values", "Life"), 2),
-                      levels = c("Preferences", "Personality", "Values", "Life"))
-    )
-    mlm_fit <- lm(rm_design ~ TEMPORAL_DO, data = wide_data)
-    car_anova <- Anova(mlm_fit, idata = idata, idesign = ~ TIME * DOMAIN, type = 3)
-    car_summary <- summary(car_anova, multivariate = FALSE)
-
-    # pval.adjustments holds one row per multi-df within-subjects effect, with
-    # "GG eps" and "HF eps" columns (labels as in car >= 3.0; adjust the row
-    # and column names if your car version labels them differently)
-    eps_table <- car_summary$pval.adjustments
-    print("Epsilon Values from car package:")
-    print(eps_table)
-
-    epsilon_gg <- eps_table["DOMAIN", "GG eps"]
-    epsilon_hf <- eps_table["DOMAIN", "HF eps"]
-
-    print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4)))
-    print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4)))
-
-    # Interpretation
-    if(epsilon_gg < 0.75) {
-      print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)")
-    } else if(epsilon_hf > 0.75) {
-      print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)")
-    } else {
-      print("Recommendation: Use Greenhouse-Geisser correction (conservative)")
-    }
-
-    # =============================================================================
-    # MANUAL SPHERICITY CORRECTIONS
-    # =============================================================================
-
-    print("\n=== MANUAL SPHERICITY CORRECTIONS ===")
-
-    # Apply corrections to the original ANOVA results
-    print("Applying Greenhouse-Geisser corrections to DOMAIN effects:")
-
-    # DOMAIN main effect (DFn = 3, DFd = 3183)
-    domain_df_corrected_gg <- 3 * epsilon_gg
-    domain_df_corrected_hf <- 3 * epsilon_hf
-
-    print(paste("DOMAIN: Original df = 3, 3183"))
-    print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2)))
-    print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2)))
-
-    # TIME × DOMAIN interaction (DFn = 3, DFd = 3183)
-    interaction_df_corrected_gg <- 3 * epsilon_gg
-    interaction_df_corrected_hf <- 3 * epsilon_hf
-
-    print(paste("TIME × DOMAIN: Original df = 3, 3183"))
-    print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2)))
-    print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2)))
-
-    # Note: You would need to recalculate p-values with these corrected dfs
-    print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom")
-    print("The ezANOVA function should handle this automatically, but may not display the corrections")
-  }
-
-}, error = function(e) {
-  print(paste("Error in manual epsilon calculation:", e$message))
-})
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-# Effect size calculations (library already loaded)
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Calculate generalized eta squared for the aov model
-print("Effect Sizes from aov() model:")
-tryCatch({
-  # Extract generalized eta squared from the aov model (requesting partial
-  # and generalized at the same time is contradictory, so only generalized
-  # is asked for)
-  aov_effects <- eta_squared(mixed_anova_model, generalized = TRUE)
-  # round() would fail on the character columns, so round numeric columns only
-  aov_effects %>%
-    mutate(across(where(is.numeric), ~ round(.x, 5))) %>%
-    print()
-}, error = function(e) {
-  print(paste("Error calculating effect sizes from aov:", e$message))
-})
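-# Since ezANOVA was run with detailed = TRUE, its table also carries SSn and
-# SSd, so partial eta squared can be recovered by hand as SSn / (SSn + SSd);
-# a sketch for comparison with the generalized values printed below
-# ('pes_by_hand' is an illustrative name, not part of the original script):
-pes_by_hand <- ez_corrected$ANOVA %>%
-  mutate(pes = round(SSn / (SSn + SSd), 5)) %>%
-  select(Effect, pes)
-print(pes_by_hand)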
-# Calculate effect sizes for ezANOVA model
-print("\nEffect Sizes from ezANOVA model:")
-tryCatch({
-  # The detailed ezANOVA table already reports generalized eta squared in its
-  # 'ges' column, so no conversion is needed
-  ez_effects <- ez_corrected$ANOVA
-  ez_effects$ges <- round(ez_effects$ges, 5)
-  print("Generalized Eta Squared from ezANOVA:")
-  print(ez_effects[, c("Effect", "ges")])
-}, error = function(e) {
-  print(paste("Error extracting effect sizes from ezANOVA:", e$message))
-})
-
-# Extract effect sizes (generalized eta squared)
-# For aov() objects, we need to extract from the summary
-anova_summary <- summary(mixed_anova_model)
-
-# =============================================================================
-# NOTE: MIXED MODELS (LMER) NOT NEEDED
-# =============================================================================
-
-# For this balanced repeated measures design, Type III ANOVA with proper
-# sphericity corrections (implemented above) is the most appropriate approach.
-# Mixed models (lmer) are typically used for:
-# - Unbalanced designs
-# - Missing data patterns
-# - Nested random effects
-# - Large, complex datasets
-#
-# Your design is balanced and complete, making Type III ANOVA optimal.
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(mixed_anova_model, ~ TIME)
-print("Estimated Marginal Means:")
-print(time_emmeans)
-print("\nPairwise Contrasts:")
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN)
-print("Estimated Marginal Means:")
-print(domain_emmeans)
-print("\nPairwise Contrasts:")
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001)
-print("\n=== TEMPORAL_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO:")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each TIME:")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# TIME × DOMAIN Interaction (Significant: p = 0.012)
-print("\n=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TIME:")
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-
-print("\nSimple 
Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - 
cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - diff --git a/.history/eohi1/mixed anova - domain means_20250916104757.r b/.history/eohi1/mixed anova - domain means_20250916104757.r deleted file mode 100644 index 851d5c4..0000000 --- a/.history/eohi1/mixed anova - domain means_20250916104757.r +++ /dev/null @@ -1,836 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final 
columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ 
{ - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = 
"MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "DOMAIN")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - # Get the sphericity corrections - sphericity_corr <- ez_corrected$`Sphericity Corrections` - - # Extract original degrees of freedom from ANOVA table - anova_table <- ez_corrected$ANOVA - - # Calculate corrected degrees of freedom - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = 
anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Also show the corrected F-values and p-values with degrees of freedom - cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n") - for(i in 1:nrow(corrected_df)) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # 
============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. -# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. 
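For reference against the note above, a minimal sketch of the equivalent mixed-model specification, assuming the lme4/lmerTest packages were installed (they are not loaded in this script), left as comments so the script's flow is unchanged:

# Not run: mixed-model analogue of the aov() call above (assumes lmerTest).
# library(lmerTest)
# lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + (1 | pID),
#                    data = long_data_clean)
# anova(lmer_model)  # F-tests with Satterthwaite degrees of freedom

The earlier note that corrected p-values require recalculating F-tests with epsilon-scaled degrees of freedom can also be made concrete with base R's pf(): scale both the numerator and denominator df by epsilon and evaluate the upper tail of the F distribution. A minimal sketch, with placeholder values (demo_f and demo_eps are hypothetical, not results from this dataset; the real values come from the ANOVA table and epsilon estimates computed above):

# Hypothetical illustration of recomputing a Greenhouse-Geisser-corrected p-value
demo_f   <- 5.21   # placeholder F statistic
demo_dfn <- 3      # numerator df (e.g., DOMAIN main effect)
demo_dfd <- 3183   # denominator df
demo_eps <- 0.92   # placeholder GG epsilon
demo_p_gg <- pf(demo_f, demo_dfn * demo_eps, demo_dfd * demo_eps,
                lower.tail = FALSE)
print(round(demo_p_gg, 6))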
- -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) 
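Beyond the cell-by-cell pairwise breakdown printed below, emmeans can also produce interaction contrasts (differences of differences) from the same grid, which test the three-way structure directly. A brief optional sketch, left commented out so the original output order is preserved:

# Optional: interaction contrasts decompose the three-way term directly.
# contrast(three_way_emmeans, interaction = "pairwise")
# contrast(three_way_emmeans, interaction = "pairwise", by = "TEMPORAL_DO")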
-print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. 
TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") -
diff --git a/.history/eohi1/mixed anova - domain means_20250916104830.r b/.history/eohi1/mixed anova - domain means_20250916104830.r deleted file mode 100644 index 851d5c4..0000000 --- a/.history/eohi1/mixed anova - domain means_20250916104830.r +++ /dev/null
diff --git a/.history/eohi1/mixed anova - domain means_20250917120959.r b/.history/eohi1/mixed anova - domain means_20250917120959.r deleted file mode 100644 index 851d5c4..0000000 --- a/.history/eohi1/mixed anova - domain means_20250917120959.r +++ /dev/null
complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = "MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "DOMAIN")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== 
CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - # Get the sphericity corrections - sphericity_corr <- ez_corrected$`Sphericity Corrections` - - # Extract original degrees of freedom from ANOVA table - anova_table <- ez_corrected$ANOVA - - # Calculate corrected degrees of freedom - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Also show the corrected F-values and p-values with degrees of freedom - cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n") - for(i in 1:nrow(corrected_df)) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- 
c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # 
ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. -# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = 
"DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" 
p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - diff --git a/.history/eohi1/mixed anova - domain means_20251001163157.r b/.history/eohi1/mixed anova - domain means_20251001163157.r deleted file mode 100644 index 851d5c4..0000000 --- a/.history/eohi1/mixed anova - domain means_20251001163157.r +++ /dev/null @@ -1,836 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data 
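# A minimal guard (sketch, assuming the options() call above has run): Type III
# tests are only meaningful under sum-to-zero contrasts, so fail fast if the
# default contr.treatment is still active, under which Type III main-effect
# tests are not interpretable.
stopifnot(identical(getOption("contrasts")[[1]], "contr.sum"))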
-data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# 
Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # 
Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", 
length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = "MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "DOMAIN")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# 
============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - # Get the sphericity corrections - sphericity_corr <- ez_corrected$`Sphericity Corrections` - - # Extract original degrees of freedom from ANOVA table - anova_table <- ez_corrected$ANOVA - - # Calculate corrected degrees of freedom - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Also show the corrected F-values and p-values with degrees of freedom - cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n") - for(i in 1:nrow(corrected_df)) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% 
- select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>%
-    pivot_wider(names_from = c(TIME, DOMAIN),
-                values_from = MEAN_DIFFERENCE,
-                names_sep = "_")
-
-  # Create the repeated measures design
-  within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life",
-                   "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life")
-
-  # Check if all columns exist
-  missing_cols <- within_vars[!within_vars %in% colnames(wide_data)]
-  if(length(missing_cols) > 0) {
-    print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", ")))
-  } else {
-    # Create the repeated measures design
-    rm_design <- as.matrix(wide_data[, within_vars])
-
-    # Calculate epsilon values
-    # NOTE: car exports no epsilon() helper, so Box's epsilon is computed
-    # directly from the double-centred covariance matrix of the repeated
-    # measures (Greenhouse & Geisser, 1959)
-    gg_epsilon <- function(X) {
-      S <- stats::cov(X, use = "complete.obs")
-      k <- ncol(S)
-      Dc <- sweep(sweep(S, 1, rowMeans(S)), 2, colMeans(S)) + mean(S)
-      sum(diag(Dc))^2 / ((k - 1) * sum(Dc^2))
-    }
-    # One common form of the Huynh-Feldt adjustment of the GG estimate
-    # (g = number of between-subjects groups; TEMPORAL_DO has 2 levels)
-    hf_epsilon <- function(eps_gg, n_subj, k, g = 2) {
-      min(1, (n_subj * (k - 1) * eps_gg - 2) /
-            ((k - 1) * (n_subj - g - (k - 1) * eps_gg)))
-    }
-    print("Epsilon values (computed from the covariance matrix):")
-    epsilon_gg <- gg_epsilon(rm_design)
-    epsilon_hf <- hf_epsilon(epsilon_gg, n_subj = nrow(rm_design), k = ncol(rm_design))
-
-    print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4)))
-    print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4)))
-
-    # Interpretation
-    if(epsilon_gg < 0.75) {
-      print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)")
-    } else if(epsilon_hf > 0.75) {
-      print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)")
-    } else {
-      print("Recommendation: Use Greenhouse-Geisser correction (conservative)")
-    }
-
-    # =============================================================================
-    # MANUAL SPHERICITY CORRECTIONS
-    # =============================================================================
-
-    print("\n=== MANUAL SPHERICITY CORRECTIONS ===")
-
-    # Apply corrections to the original ANOVA results
-    print("Applying Greenhouse-Geisser corrections to DOMAIN effects:")
-
-    # DOMAIN main effect (DFn = 3, DFd = 3183)
-    domain_df_corrected_gg <- 3 * epsilon_gg
-    domain_df_corrected_hf <- 3 * epsilon_hf
-
-    print("DOMAIN: Original df = 3, 3183")
-    print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2)))
-    print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2)))
-
-    # TIME × DOMAIN interaction (DFn = 3, DFd = 3183)
-    interaction_df_corrected_gg <- 3 * epsilon_gg
-    interaction_df_corrected_hf <- 3 * epsilon_hf
-
-    print("TIME × DOMAIN: Original df = 3, 3183")
-    print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2)))
-    print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2)))
-
-    # Note: p-values must be recalculated against the corrected degrees of freedom
-    print("\nNote: To get corrected p-values, recalculate the F-tests with the corrected degrees of freedom")
-    print("ezANOVA reports the GG- and HF-corrected p-values directly when sphericity is violated")
-  }
-
-}, error = function(e) {
-  print(paste("Error in manual epsilon calculation:", e$message))
-})
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-# Effect size calculations (library already loaded)
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Calculate generalized eta squared for the aov model
-print("Effect Sizes from aov() model:")
-tryCatch({
-  # Extract effect sizes from aov model
-  aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE)
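  # A rough cross-check (sketch): generalized eta squared follows
  # Olejnik & Algina (2003): ges = SS_effect / (SS_effect + sum of all
  # subject-related error SS), which stays comparable across designs where
  # partial eta squared does not. afex stores the same statistic, assuming
  # mixed_anova_afex from above is available:
  print(mixed_anova_afex$anova_table[, "ges", drop = FALSE])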
-  print(aov_effects, digits = 5)  # round() errors on the character Parameter column
-}, error = function(e) {
-  print(paste("Error calculating effect sizes from aov:", e$message))
-})
-
-# Calculate effect sizes for ezANOVA model
-print("\nEffect Sizes from ezANOVA model:")
-tryCatch({
-  # No conversion needed: the ges column from ezANOVA is already generalized eta squared
-  ez_effects <- ez_corrected$ANOVA
-  ez_effects$ges <- round(ez_effects$ges, 5)
-  print("Generalized Eta Squared from ezANOVA:")
-  print(ez_effects[, c("Effect", "ges")])
-}, error = function(e) {
-  print(paste("Error extracting effect sizes from ezANOVA:", e$message))
-})
-
-# Extract effect sizes (generalized eta squared)
-# For aov() objects, we need to extract from the summary
-anova_summary <- summary(mixed_anova_model)
-
-# =============================================================================
-# NOTE: MIXED MODELS (LMER) NOT NEEDED
-# =============================================================================
-
-# For this balanced repeated measures design, Type III ANOVA with proper
-# sphericity corrections (implemented above) is the most appropriate approach.
-# Mixed models (lmer) are typically used for:
-# - Unbalanced designs
-# - Missing data patterns
-# - Nested random effects
-# - Large, complex datasets
-#
-# Your design is balanced and complete, making Type III ANOVA optimal.
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(mixed_anova_model, ~ TIME)
-print("Estimated Marginal Means:")
-print(time_emmeans)
-print("\nPairwise Contrasts:")
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN)
-print("Estimated Marginal Means:")
-print(domain_emmeans)
-print("\nPairwise Contrasts:")
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001)
-print("\n=== TEMPORAL_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO:")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each TIME:")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# TIME × DOMAIN Interaction (Significant: p = 0.012)
-print("\n=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each 
TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - 
} - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - diff --git a/.history/eohi1/mixed anova - domain means_20251001171405.r b/.history/eohi1/mixed anova - domain means_20251001171405.r deleted file mode 100644 index c290278..0000000 --- a/.history/eohi1/mixed anova - domain means_20251001171405.r +++ /dev/null @@ -1,800 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final 
columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ 
{ - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = 
temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - # Get the sphericity corrections - sphericity_corr <- ez_corrected$`Sphericity Corrections` - - # Extract original degrees of freedom from ANOVA table - anova_table <- ez_corrected$ANOVA - - # Calculate corrected degrees of freedom - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Also show the corrected F-values and p-values with degrees of freedom - cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n") - for(i in 1:nrow(corrected_df)) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, 
%.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA 
function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. -# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. 
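# Review note: emmeans() and effectsize::eta_squared() have no methods for the
# plain list that ezANOVA() returns, and mixed_anova_model is referenced below
# but never assigned in this version of the script. A minimal sketch of one way
# to build an emmeans-compatible model with afex::aov_ez() (afex is loaded
# above); the data frame and factor names are the ones constructed earlier:
mixed_anova_model <- aov_ez(
  id      = "pID",               # participant identifier
  dv      = "MEAN_DIFFERENCE",   # response variable
  data    = long_data_clean,
  between = "TEMPORAL_DO",       # between-subjects factor
  within  = c("TIME", "DOMAIN"), # within-subjects factors
  type    = 3                    # Type III sums of squares
)
# afex_aov objects are supported directly by emmeans() and by
# effectsize::eta_squared(); ezANOVA(..., return_aov = TRUE) and its $aov
# element would be an alternative route to the same post-hoc calls.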
- -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) 
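# With the 2 x 2 x 4 grid, the pairwise lists below grow quickly (6 DOMAIN
# pairs in each of the four TEMPORAL_DO x TIME cells, all sharing one
# Bonferroni family per cell). Interaction contrasts are a more compact way to
# test the three-way pattern itself; a sketch using the emmeans
# interaction-contrast interface:
three_way_interaction_contrasts <- contrast(three_way_emmeans, interaction = "pairwise")
print(three_way_interaction_contrasts)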
-print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. 
TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - diff --git a/.history/eohi1/mixed anova - domain means_20251001171456.r b/.history/eohi1/mixed anova - domain means_20251001171456.r deleted file mode 100644 index 55b5c9f..0000000 --- a/.history/eohi1/mixed anova - domain means_20251001171456.r +++ /dev/null @@ -1,645 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", 
"Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- 
function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# 
============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -print(mixed_anova_model$ANOVA) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in 1:nrow(corrected_df)) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes 
from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. -# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- 
pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", 
cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - diff --git a/.history/eohi1/mixed anova - domain means_20251001171513.r b/.history/eohi1/mixed anova - domain means_20251001171513.r deleted file mode 100644 index 9cd02f6..0000000 --- a/.history/eohi1/mixed anova - domain means_20251001171513.r +++ /dev/null @@ -1,610 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - 
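# Type III tests are only interpretable with sum-to-zero contrasts; under R's
# default contr.treatment coding, type = 3 results from ezANOVA would be
# misleading once cells are unbalanced. A quick sanity check (a sketch) that
# the option above took effect before any model is fit:
stopifnot(getOption("contrasts")[1] == "contr.sum")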
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# 
============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - 
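-# Worked mini-example (sketch): the helper above collapses a set of cell
-# variances to a single ratio; for variances of 0.8, 1.2 and 2.0 the
-# statistic is 2.0 / 0.8 = 2.5. Values near 1 indicate homogeneous variances.
-calculate_hartley_ratio(c(0.8, 1.2, 2.0))  # returns 2.5
-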
-observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - 
length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -print(mixed_anova_model$ANOVA) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in 1:nrow(corrected_df)) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# 
=============================================================================
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Extract generalized eta squared from ezANOVA (already calculated)
-effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")]
-effect_sizes$ges <- round(effect_sizes$ges, 5)  # round only the numeric column (Effect is character)
-print("Generalized Eta Squared:")
-print(effect_sizes)
-
-# Note: Type III ANOVA with sphericity corrections is appropriate for this balanced design.
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Create aov model for emmeans (emmeans requires an aov object, not ezANOVA's list output)
-aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                 data = long_data_clean)
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print("Estimated Marginal Means:")
-print(time_emmeans)
-print("\nPairwise Contrasts:")
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
-print("Estimated Marginal Means:")
-print(domain_emmeans)
-print("\nPairwise Contrasts:")
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001)
-print("\n=== TEMPORAL_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO:")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each TIME:")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# TIME × DOMAIN Interaction (Significant: p = 0.012)
-print("\n=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TIME:")
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-
-print("\nSimple Effects of TIME within each DOMAIN:")
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(time_domain_simple2)
-
-# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058)
-print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===")
-temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN)
-print("Estimated Marginal Means:")
-print(temporal_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:")
-temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:")
-temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(temporal_domain_simple2)
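-
-# Side note (sketch): Bonferroni is the most conservative adjustment used
-# above; emmeans also accepts adjust = "holm", which controls the same
-# family-wise error rate with at least as much power, e.g.:
-temporal_domain_holm <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "holm")
-print(temporal_domain_holm)
-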
-# =============================================================================
-# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS
-# =============================================================================
-
-# All pairwise comparisons for the three-way interaction
-print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===")
-three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-print("Estimated Marginal Means for all combinations:")
-print(three_way_emmeans)
-
-# Pairwise comparisons within each TEMPORAL_DO × TIME combination
-print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:")
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni")
-print(three_way_contrasts)
-
-# =============================================================================
-# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS
-# =============================================================================
-
-# Cohen's d calculations (effsize already loaded above)
-
-print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===")
-
-# Function to calculate Cohen's d for pairwise comparisons
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if(nrow(significant_pairs) > 0) {
-    cat("Significant pairwise comparisons (p < 0.05):\n")
-    print(significant_pairs)
-
-    cat("\nCohen's d calculated from raw data:\n")
-
-    for(i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- as.character(comparison$contrast)
-
-      # Parse the contrast
-      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-      if(length(contrast_parts) == 2) {
-        level1 <- trimws(contrast_parts[1])
-        level2 <- trimws(contrast_parts[2])
-
-        # Get raw data for both conditions
-        if(group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][
-            data[[group1_var]] == level1 &
-            data[[group2_var]] == group2_level]
-
-          data2 <- data[[response_var]][
-            data[[group1_var]] == level2 &
-            data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if(length(data1) > 0 && length(data2) > 0) {
-          # Calculate Cohen's d using effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          cat(sprintf("Comparison: %s", contrast_name))
-          if(group2_var %in% colnames(comparison)) {
-            cat(sprintf(" | %s", group2_level))
-          }
-          cat(sprintf("\n  n1 = %d, n2 = %d\n", length(data1), length(data2)))
-          cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-          cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-          cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-          cat("\n")
-        }
-      }
-    }
-  } else {
-    cat("No significant pairwise comparisons found.\n")
-  }
-}
-
-# =============================================================================
-# 1. 
TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - diff --git a/.history/eohi1/mixed anova - domain means_20251001171529.r b/.history/eohi1/mixed anova - domain means_20251001171529.r deleted file mode 100644 index ab9719e..0000000 --- a/.history/eohi1/mixed anova - domain means_20251001171529.r +++ /dev/null @@ -1,614 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# 
Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. 
Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = 
TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, 
                         long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance %in% c(0, 1))) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN combinations")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = MEAN_DIFFERENCE,
-                             wid = pID,
-                             between = TEMPORAL_DO,
-                             within = .(TIME, DOMAIN),
-                             type = 3,
-                             detailed = TRUE)
-
-print("ANOVA Results:")
-print(mixed_anova_model$ANOVA)
-
-# Show Mauchly's test for sphericity
-print("\nMauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  cat("\n=== CORRECTED F-TESTS ===\n")
-  for(i in 1:nrow(corrected_df)) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original:     F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  print("\nNote: Sphericity corrections not needed (sphericity assumption met)")
-}
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Extract generalized eta squared from ezANOVA (already calculated)
-effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")]
-effect_sizes$ges <- round(effect_sizes$ges, 5)  # round only the numeric column (Effect is character)
-print("Generalized Eta Squared:")
-print(effect_sizes)
-
-# Note: Type III ANOVA with sphericity corrections is appropriate for this balanced design.
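-
-# Optional cross-check (sketch): afex is loaded above for aov_ez, which fits
-# the same mixed design, reports sphericity-corrected tests by default, and
-# returns an object that emmeans() accepts directly:
-afex_model <- aov_ez(id = "pID", dv = "MEAN_DIFFERENCE", data = long_data_clean,
-                     between = "TEMPORAL_DO", within = c("TIME", "DOMAIN"),
-                     type = 3)
-print(afex_model)
-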
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Create aov model for emmeans (emmeans requires an aov object, not ezANOVA's list output)
-aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                 data = long_data_clean)
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print("Estimated Marginal Means:")
-print(time_emmeans)
-print("\nPairwise Contrasts:")
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
-print("Estimated Marginal Means:")
-print(domain_emmeans)
-print("\nPairwise Contrasts:")
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001)
-print("\n=== TEMPORAL_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO:")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each TIME:")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# TIME × DOMAIN Interaction (Significant: p = 0.012)
-print("\n=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TIME:")
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-
-print("\nSimple Effects of TIME within each DOMAIN:")
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(time_domain_simple2)
-
-# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058)
-print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===")
-temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN)
-print("Estimated Marginal Means:")
-print(temporal_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:")
-temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:")
-temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(temporal_domain_simple2)
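-
-# Sketch: emmeans::eff_size() can express the simple-effect contrasts above
-# as standardized (Cohen's d style) effects. The SD and df below are
-# PLACEHOLDERS (assumptions, not taken from this analysis) -- substitute the
-# residual SD and df of the appropriate error stratum from the ANOVA table:
-sd_placeholder <- 1.0   # assumed residual SD of the relevant error stratum
-df_placeholder <- 100   # assumed residual degrees of freedom
-print(eff_size(temporal_time_emmeans, sigma = sd_placeholder, edf = df_placeholder))
-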
-# =============================================================================
-# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS
-# =============================================================================
-
-# All pairwise comparisons for the three-way interaction
-print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===")
-three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-print("Estimated Marginal Means for all combinations:")
-print(three_way_emmeans)
-
-# Pairwise comparisons within each TEMPORAL_DO × TIME combination
-print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:")
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni")
-print(three_way_contrasts)
-
-# =============================================================================
-# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS
-# =============================================================================
-
-# Cohen's d calculations (effsize already loaded above)
-
-print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===")
-
-# Function to calculate Cohen's d for pairwise comparisons
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if(nrow(significant_pairs) > 0) {
-    cat("Significant pairwise comparisons (p < 0.05):\n")
-    print(significant_pairs)
-
-    cat("\nCohen's d calculated from raw data:\n")
-
-    for(i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- as.character(comparison$contrast)
-
-      # Parse the contrast
-      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-      if(length(contrast_parts) == 2) {
-        level1 <- trimws(contrast_parts[1])
-        level2 <- trimws(contrast_parts[2])
-
-        # Get raw data for both conditions
-        if(group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][
-            data[[group1_var]] == level1 &
-            data[[group2_var]] == group2_level]
-
-          data2 <- data[[response_var]][
-            data[[group1_var]] == level2 &
-            data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if(length(data1) > 0 && length(data2) > 0) {
-          # Calculate Cohen's d using effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          cat(sprintf("Comparison: %s", contrast_name))
-          if(group2_var %in% colnames(comparison)) {
-            cat(sprintf(" | %s", group2_level))
-          }
-          cat(sprintf("\n  n1 = %d, n2 = %d\n", length(data1), length(data2)))
-          cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-          cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-          cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-          cat("\n")
-        }
-      }
-    }
-  } else {
-    cat("No significant pairwise comparisons found.\n")
-  }
-}
-
-# =============================================================================
-# 1. 
TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - diff --git a/.history/eohi1/mixed anova - domain means_20251001171548.r b/.history/eohi1/mixed anova - domain means_20251001171548.r deleted file mode 100644 index b2c4c25..0000000 --- a/.history/eohi1/mixed anova - domain means_20251001171548.r +++ /dev/null @@ -1,614 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# 
Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. 
Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = 
TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, 
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check data dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$pID))))
-print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (",
-            length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (",
-            length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = ""))
-
-# Check for complete cases
-complete_cases <- sum(complete.cases(long_data_clean))
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean)))
-
-# Check if design is balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance %in% c(0, 1))) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN combinations")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = MEAN_DIFFERENCE,
-                             wid = pID,
-                             between = TEMPORAL_DO,
-                             within = .(TIME, DOMAIN),
-                             type = 3,
-                             detailed = TRUE)
-
-print("ANOVA Results:")
-print(mixed_anova_model$ANOVA)
-
-# Show Mauchly's test for sphericity
-print("\nMauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  cat("\n=== CORRECTED F-TESTS ===\n")
-  for(i in 1:nrow(corrected_df)) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original:     F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  print("\nNote: Sphericity corrections not needed (sphericity assumption met)")
-}
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Extract generalized eta squared from ezANOVA (already calculated)
-effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")]
-print("Generalized Eta Squared:")
-# round() on the whole data frame would error on the character Effect column,
-# so round only the numeric ges column
-effect_sizes$ges <- round(effect_sizes$ges, 5)
-print(effect_sizes)
-
-# Note: Type III ANOVA with sphericity corrections is appropriate for this balanced design.
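The corrected p-values printed above can be reproduced by hand: a Greenhouse-Geisser correction leaves F unchanged and multiplies both degrees of freedom by epsilon. A sketch with illustrative numbers only (substitute real values from the tables above):

f_value <- 4.20; DFn <- 3; DFd <- 570; GGe <- 0.92  # illustrative numbers, not results
pf(f_value, df1 = DFn * GGe, df2 = DFd * GGe, lower.tail = FALSE)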
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output)
-aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                 data = long_data_clean)
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print("Estimated Marginal Means:")
-print(time_emmeans)
-print("\nPairwise Contrasts:")
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
-print("Estimated Marginal Means:")
-print(domain_emmeans)
-print("\nPairwise Contrasts:")
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TEMPORAL_DO × TIME Interaction
-print("\n=== TEMPORAL_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO:")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each TIME:")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# TIME × DOMAIN Interaction
-print("\n=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TIME:")
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-
-print("\nSimple Effects of TIME within each DOMAIN:")
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(time_domain_simple2)
-
-# TEMPORAL_DO × DOMAIN Interaction
-print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===")
-temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN)
-print("Estimated Marginal Means:")
-print(temporal_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:")
-temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:")
-temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(temporal_domain_simple2)
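Beyond simple effects, emmeans can also test interaction contrasts (differences of differences) directly; a sketch using one of the grids defined above:

td_interaction <- contrast(time_domain_emmeans, interaction = "pairwise", adjust = "bonferroni")
print(td_interaction)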
-
-# =============================================================================
-# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS
-# =============================================================================
-
-# All pairwise comparisons for the three-way interaction
-print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===")
-three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-print("Estimated Marginal Means for all combinations:")
-print(three_way_emmeans)
-
-# Pairwise comparisons within each TEMPORAL_DO × TIME combination
-print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:")
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni")
-print(three_way_contrasts)
-
-# =============================================================================
-# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS
-# =============================================================================
-
-# Cohen's d calculations (library already loaded)
-
-print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===")
-
-# Function to calculate Cohen's d for pairwise comparisons
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if(nrow(significant_pairs) > 0) {
-    cat("Significant pairwise comparisons (p < 0.05):\n")
-    print(significant_pairs)
-
-    cat("\nCohen's d calculated from raw data:\n")
-
-    for(i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- as.character(comparison$contrast)
-
-      # Parse the contrast
-      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-      if(length(contrast_parts) == 2) {
-        level1 <- trimws(contrast_parts[1])
-        level2 <- trimws(contrast_parts[2])
-
-        # Get raw data for both conditions
-        if(group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][
-            data[[group1_var]] == level1 &
-            data[[group2_var]] == group2_level]
-
-          data2 <- data[[response_var]][
-            data[[group1_var]] == level2 &
-            data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if(length(data1) > 0 && length(data2) > 0) {
-          # Calculate Cohen's d using effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          cat(sprintf("Comparison: %s", contrast_name))
-          if(group2_var %in% colnames(comparison)) {
-            cat(sprintf(" | %s", group2_level))
-          }
-          cat(sprintf("\n  n1 = %d, n2 = %d\n", length(data1), length(data2)))
-          cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-          cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-          cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-          cat("\n")
-        }
-      }
-    }
-  } else {
-    cat("No significant pairwise comparisons found.\n")
-  }
-}
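For reference, cohen.d() as used in the function above returns the classic pooled-SD d; a worked check on invented toy vectors:

x <- c(1.2, 0.8, 1.5, 1.1, 0.9)
y <- c(0.4, 0.7, 0.2, 0.6, 0.5)
sp <- sqrt(((length(x) - 1) * var(x) + (length(y) - 1) * var(y)) /
             (length(x) + length(y) - 2))  # pooled SD
(mean(x) - mean(y)) / sp    # manual pooled-SD d
cohen.d(x, y)$estimate      # effsize's estimate; should agree closely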
-
-# =============================================================================
-# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===")
-
-# Get simple effects of TIME within each TEMPORAL_DO
-temporal_time_simple_df <- as.data.frame(temporal_time_simple)
-calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
-# Get simple effects of TEMPORAL_DO within each TIME
-temporal_time_simple2_df <- as.data.frame(temporal_time_simple2)
-calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===")
-
-# Get simple effects of TIME within each DOMAIN
-time_domain_simple2_df <- as.data.frame(time_domain_simple2)
-calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE")
-
-# Get simple effects of DOMAIN within each TIME
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===")
-
-# Get simple effects of TEMPORAL_DO within each DOMAIN
-temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2)
-calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE")
-
-# Get simple effects of DOMAIN within each TEMPORAL_DO
-temporal_domain_simple_df <- as.data.frame(temporal_domain_simple)
-calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
diff --git a/.history/eohi1/mixed anova - domain means_20251001171605.r b/.history/eohi1/mixed anova - domain means_20251001171605.r
deleted file mode 100644
index 4512c36..0000000
--- a/.history/eohi1/mixed anova - domain means_20251001171605.r
+++ /dev/null
@@ -1,614 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex)        # For aov_ez (cleaner ANOVA output)
-library(nortest)     # For normality tests
-library(emmeans)     # For post-hoc comparisons
-library(purrr)       # For map functions
-library(effsize)     # For Cohen's d calculations
-library(effectsize)  # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x")))
-print(paste("Number of participants:", length(unique(data$pID))))
-
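The contr.sum option set above matters for the Type III tests used later: under the default treatment coding, Type III main-effect tests in unbalanced factorial designs do not test the usual hypotheses. A minimal illustration on invented, deliberately unbalanced data:

set.seed(1)
toy <- expand.grid(a = factor(1:2), b = factor(1:2))
toy <- toy[rep(1:4, times = c(5, 10, 10, 5)), ]    # unbalanced on purpose
toy$y <- rnorm(nrow(toy))
options(contrasts = c("contr.treatment", "contr.poly"))
car::Anova(lm(y ~ a * b, data = toy), type = 3)    # main-effect tests depend on coding
options(contrasts = c("contr.sum", "contr.poly"))  # restore the setting used in this script
car::Anova(lm(y ~ a * b, data = toy), type = 3)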
-# Verify the specific variables we need
-required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required domain mean variables found!")
-}
-
-# Define domain mapping
-domain_mapping <- data.frame(
-  variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-               "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
-  time = c(rep("Past", 4), rep("Future", 4)),
-  domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
-  stringsAsFactors = FALSE
-)
-
-# Domain mapping created
-
-# Efficient data pivoting using pivot_longer
-long_data <- data %>%
-  select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>%
-  pivot_longer(
-    cols = all_of(required_vars),
-    names_to = "variable",
-    values_to = "MEAN_DIFFERENCE"
-  ) %>%
-  left_join(domain_mapping, by = "variable") %>%
-  # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping)
-  mutate(
-    TIME = factor(time, levels = c("Past", "Future")),
-    DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")),
-    pID = as.factor(pID),
-    TEMPORAL_DO = as.factor(TEMPORAL_DO)
-  ) %>%
-  # Select final columns and remove any rows with missing values
-  select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>%
-  filter(!is.na(MEAN_DIFFERENCE))
-
-print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x")))
-print(paste("Number of participants:", length(unique(long_data$pID))))
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME and DOMAIN
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5),
-    min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME and DOMAIN:")
-print(desc_stats)
-
-# Descriptive statistics by between-subjects factors
-desc_stats_by_temporal <- long_data %>%
-  group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:")
-print(desc_stats_by_temporal)
-
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing
-long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ]
-print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x")))
-
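A two-row toy version of the wide-to-long reshape performed above (values invented) may help make the variable-to-factor mapping concrete:

toy_wide <- data.frame(pID = 1:2,
                       NPast_mean_pref = c(0.5, 0.3),
                       NFut_mean_pref  = c(0.7, 0.4))
toy_wide %>%
  pivot_longer(-pID, names_to = "variable", values_to = "MEAN_DIFFERENCE") %>%
  left_join(domain_mapping, by = "variable")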
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(MEAN_DIFFERENCE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME and DOMAIN:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test (streamlined)
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic,
-    ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# Round only the numeric columns
-normality_results_rounded <- normality_results %>%
-  mutate(across(where(is.numeric), ~ round(.x, 5)))
-print(normality_results_rounded)
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-# First, let's check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # Calculate F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# More efficient bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!rlang::sym(group_var)) %>%
-    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once
-  group_data_list <- map(groups, ~ {
-    group_data <- data[data[[group_var]] == .x, response_var]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-print("\n=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123)  # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check data dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$pID))))
-print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (",
-            length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (",
-            length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = ""))
-
-# Check for complete cases
-complete_cases <- sum(complete.cases(long_data_clean))
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean)))
-
-# Check if design is balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance %in% c(0, 1))) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN combinations")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = MEAN_DIFFERENCE,
-                             wid = pID,
-                             between = TEMPORAL_DO,
-                             within = .(TIME, DOMAIN),
-                             type = 3,
-                             detailed = TRUE)
-
-print("ANOVA Results:")
-print(mixed_anova_model$ANOVA)
-
-# Show Mauchly's test for sphericity
-print("\nMauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  cat("\n=== CORRECTED F-TESTS ===\n")
-  for(i in 1:nrow(corrected_df)) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original:     F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  print("\nNote: Sphericity corrections not needed (sphericity assumption met)")
-}
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Extract generalized eta squared from ezANOVA (already calculated)
-effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")]
-print("Generalized Eta Squared:")
-# round() on the whole data frame would error on the character Effect column,
-# so round only the numeric ges column
-effect_sizes$ges <- round(effect_sizes$ges, 5)
-print(effect_sizes)
-
-# Note: Type III ANOVA with sphericity corrections is appropriate for this balanced design.
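If partial eta squared is wanted alongside the generalized values above, it can be recovered from an F statistic and its degrees of freedom via eta_p^2 = F * DFn / (F * DFn + DFd); a sketch with illustrative numbers only:

F_val <- 6.5; DFn <- 1; DFd <- 190  # substitute values from the ANOVA table above
(F_val * DFn) / (F_val * DFn + DFd)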
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output)
-aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                 data = long_data_clean)
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print("Estimated Marginal Means:")
-print(time_emmeans)
-print("\nPairwise Contrasts:")
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
-print("Estimated Marginal Means:")
-print(domain_emmeans)
-print("\nPairwise Contrasts:")
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TEMPORAL_DO × TIME Interaction
-print("\n=== TEMPORAL_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO:")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each TIME:")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# TIME × DOMAIN Interaction
-print("\n=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TIME:")
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-
-print("\nSimple Effects of TIME within each DOMAIN:")
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(time_domain_simple2)
-
-# TEMPORAL_DO × DOMAIN Interaction
-print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===")
-temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN)
-print("Estimated Marginal Means:")
-print(temporal_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:")
-temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:")
-temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(temporal_domain_simple2)
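Any of the contrast tables above can be re-displayed with confidence intervals by asking summary() for inference; a sketch using one of the objects just defined:

summary(temporal_domain_simple, infer = TRUE)  # adds CIs alongside the tests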
-
-# =============================================================================
-# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS
-# =============================================================================
-
-# All pairwise comparisons for the three-way interaction
-print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===")
-three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-print("Estimated Marginal Means for all combinations:")
-print(three_way_emmeans)
-
-# Pairwise comparisons within each TEMPORAL_DO × TIME combination
-print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:")
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni")
-print(three_way_contrasts)
-
-# =============================================================================
-# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS
-# =============================================================================
-
-# Cohen's d calculations (library already loaded)
-
-print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===")
-
-# Function to calculate Cohen's d for pairwise comparisons
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if(nrow(significant_pairs) > 0) {
-    cat("Significant pairwise comparisons (p < 0.05):\n")
-    print(significant_pairs)
-
-    cat("\nCohen's d calculated from raw data:\n")
-
-    for(i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- as.character(comparison$contrast)
-
-      # Parse the contrast
-      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-      if(length(contrast_parts) == 2) {
-        level1 <- trimws(contrast_parts[1])
-        level2 <- trimws(contrast_parts[2])
-
-        # Get raw data for both conditions
-        if(group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][
-            data[[group1_var]] == level1 &
-            data[[group2_var]] == group2_level]
-
-          data2 <- data[[response_var]][
-            data[[group1_var]] == level2 &
-            data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if(length(data1) > 0 && length(data2) > 0) {
-          # Calculate Cohen's d using effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          cat(sprintf("Comparison: %s", contrast_name))
-          if(group2_var %in% colnames(comparison)) {
-            cat(sprintf(" | %s", group2_level))
-          }
-          cat(sprintf("\n  n1 = %d, n2 = %d\n", length(data1), length(data2)))
-          cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-          cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-          cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-          cat("\n")
-        }
-      }
-    }
-  } else {
-    cat("No significant pairwise comparisons found.\n")
-  }
-}
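One caveat for the within-subject comparisons below: a between-style d ignores the pairing over TIME. A complementary paired d (d_z) on participant-level difference scores, sketched for one domain:

paired_wide <- long_data_clean %>%
  filter(DOMAIN == "Preferences") %>%
  select(pID, TIME, MEAN_DIFFERENCE) %>%
  pivot_wider(names_from = TIME, values_from = MEAN_DIFFERENCE)
with(paired_wide, mean(Past - Future, na.rm = TRUE) / sd(Past - Future, na.rm = TRUE))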
-
-# =============================================================================
-# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===")
-
-# Get simple effects of TIME within each TEMPORAL_DO
-temporal_time_simple_df <- as.data.frame(temporal_time_simple)
-calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
-# Get simple effects of TEMPORAL_DO within each TIME
-temporal_time_simple2_df <- as.data.frame(temporal_time_simple2)
-calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===")
-
-# Get simple effects of TIME within each DOMAIN
-time_domain_simple2_df <- as.data.frame(time_domain_simple2)
-calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE")
-
-# Get simple effects of DOMAIN within each TIME
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===")
-
-# Get simple effects of TEMPORAL_DO within each DOMAIN
-temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2)
-calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE")
-
-# Get simple effects of DOMAIN within each TEMPORAL_DO
-temporal_domain_simple_df <- as.data.frame(temporal_domain_simple)
-calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
diff --git a/.history/eohi1/mixed anova - domain means_20251001171616.r b/.history/eohi1/mixed anova - domain means_20251001171616.r
deleted file mode 100644
index ce86031..0000000
--- a/.history/eohi1/mixed anova - domain means_20251001171616.r
+++ /dev/null
@@ -1,614 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex)        # For aov_ez (cleaner ANOVA output)
-library(nortest)     # For normality tests
-library(emmeans)     # For post-hoc comparisons
-library(purrr)       # For map functions
-library(effsize)     # For Cohen's d calculations
-library(effectsize)  # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x")))
-print(paste("Number of participants:", length(unique(data$pID))))
-
-# Verify the specific variables we need
-required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required domain mean variables found!")
-}
-
-# Define domain mapping
-domain_mapping <- data.frame(
-  variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-               "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
-  time = c(rep("Past", 4), rep("Future", 4)),
-  domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
-  stringsAsFactors = FALSE
-)
-
-# Domain mapping created
-
-# Efficient data pivoting using pivot_longer
-long_data <- data %>%
-  select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>%
-  pivot_longer(
-    cols = all_of(required_vars),
-    names_to = "variable",
-    values_to = "MEAN_DIFFERENCE"
-  ) %>%
-  left_join(domain_mapping, by = "variable") %>%
-  # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping)
-  mutate(
-    TIME = factor(time, levels = c("Past", "Future")),
-    DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")),
-    pID = as.factor(pID),
-    TEMPORAL_DO = as.factor(TEMPORAL_DO)
-  ) %>%
-  # Select final columns and remove any rows with missing values
-  select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>%
-  filter(!is.na(MEAN_DIFFERENCE))
-
-print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x")))
-print(paste("Number of participants:", length(unique(long_data$pID))))
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME and DOMAIN
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5),
-    min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME and DOMAIN:")
-print(desc_stats)
-
-# Descriptive statistics by between-subjects factors
-desc_stats_by_temporal <- long_data %>%
-  group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:")
-print(desc_stats_by_temporal)
-
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing
-long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ]
-print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x")))
-
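For intuition about the Anderson-Darling test used in the checks below, its behavior on simulated data (invented, not part of the analysis):

set.seed(7)
ad.test(rnorm(200))$p.value  # typically large: consistent with normality
ad.test(rexp(200))$p.value   # typically tiny: clear departure from normality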
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(MEAN_DIFFERENCE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME and DOMAIN:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test (streamlined)
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic,
-    ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# Round only the numeric columns
-normality_results_rounded <- normality_results %>%
-  mutate(across(where(is.numeric), ~ round(.x, 5)))
-print(normality_results_rounded)
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-# First, let's check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # Calculate F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# More efficient bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!rlang::sym(group_var)) %>%
-    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once
-  group_data_list <- map(groups, ~ {
-    group_data <- data[data[[group_var]] == .x, response_var]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-print("\n=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123)  # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check data dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$pID))))
-print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (",
-            length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (",
-            length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = ""))
-
-# Check for complete cases
-complete_cases <- sum(complete.cases(long_data_clean))
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean)))
-
-# Check if design is balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance %in% c(0, 1))) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN combinations")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = MEAN_DIFFERENCE,
-                             wid = pID,
-                             between = TEMPORAL_DO,
-                             within = .(TIME, DOMAIN),
-                             type = 3,
-                             detailed = TRUE)
-
-print("ANOVA Results:")
-print(mixed_anova_model$ANOVA)
-
-# Show Mauchly's test for sphericity
-print("\nMauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  cat("\n=== CORRECTED F-TESTS ===\n")
-  for(i in seq_len(nrow(corrected_df))) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original:     F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  print("\nNote: Sphericity corrections not needed (sphericity assumption met)")
-}
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Extract generalized eta squared from ezANOVA (already calculated)
-effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")]
-print("Generalized Eta Squared:")
-# round() on the whole data frame would error on the character Effect column,
-# so round only the numeric ges column
-effect_sizes$ges <- round(effect_sizes$ges, 5)
-print(effect_sizes)
-
-# Note: Type III ANOVA with sphericity corrections is appropriate for this balanced design.
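The same model could be cross-checked with afex (loaded above), which applies Greenhouse-Geisser corrections and reports generalized eta squared by default; a sketch:

afex_model <- aov_ez(id = "pID", dv = "MEAN_DIFFERENCE",
                     data = long_data_clean,
                     between = "TEMPORAL_DO",
                     within = c("TIME", "DOMAIN"),
                     type = 3)
print(afex_model)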
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output)
-aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                 data = long_data_clean)
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print("Estimated Marginal Means:")
-print(time_emmeans)
-print("\nPairwise Contrasts:")
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
-print("Estimated Marginal Means:")
-print(domain_emmeans)
-print("\nPairwise Contrasts:")
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TEMPORAL_DO × TIME Interaction
-print("\n=== TEMPORAL_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO:")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each TIME:")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# TIME × DOMAIN Interaction
-print("\n=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TIME:")
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-
-print("\nSimple Effects of TIME within each DOMAIN:")
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(time_domain_simple2)
-
-# TEMPORAL_DO × DOMAIN Interaction
-print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===")
-temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN)
-print("Estimated Marginal Means:")
-print(temporal_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:")
-temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:")
-temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(temporal_domain_simple2)
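A quick visual complement to the simple-effects tables above is emmeans' built-in interaction plot; a sketch for the TIME × DOMAIN pattern, assuming the aov_model defined above:

emmip(aov_model, TIME ~ DOMAIN, CIs = TRUE)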
-
-# =============================================================================
-# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS
-# =============================================================================
-
-# All pairwise comparisons for the three-way interaction
-print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===")
-three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-print("Estimated Marginal Means for all combinations:")
-print(three_way_emmeans)
-
-# Pairwise comparisons within each TEMPORAL_DO × TIME combination
-print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:")
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni")
-print(three_way_contrasts)
-
-# =============================================================================
-# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS
-# =============================================================================
-
-# Cohen's d calculations (library already loaded)
-
-print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===")
-
-# Function to calculate Cohen's d for pairwise comparisons
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if(nrow(significant_pairs) > 0) {
-    cat("Significant pairwise comparisons (p < 0.05):\n")
-    print(significant_pairs)
-
-    cat("\nCohen's d calculated from raw data:\n")
-
-    for(i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- as.character(comparison$contrast)
-
-      # Parse the contrast
-      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-      if(length(contrast_parts) == 2) {
-        level1 <- trimws(contrast_parts[1])
-        level2 <- trimws(contrast_parts[2])
-
-        # Get raw data for both conditions
-        if(group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][
-            data[[group1_var]] == level1 &
-            data[[group2_var]] == group2_level]
-
-          data2 <- data[[response_var]][
-            data[[group1_var]] == level2 &
-            data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if(length(data1) > 0 && length(data2) > 0) {
-          # Calculate Cohen's d using effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          cat(sprintf("Comparison: %s", contrast_name))
-          if(group2_var %in% colnames(comparison)) {
-            cat(sprintf(" | %s", group2_level))
-          }
-          cat(sprintf("\n  n1 = %d, n2 = %d\n", length(data1), length(data2)))
-          cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-          cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-          cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-          cat("\n")
-        }
-      }
-    }
-  } else {
-    cat("No significant pairwise comparisons found.\n")
-  }
-}
TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - diff --git a/.history/eohi1/mixed anova - domain means_20251001171654.r b/.history/eohi1/mixed anova - domain means_20251001171654.r deleted file mode 100644 index ce86031..0000000 --- a/.history/eohi1/mixed anova - domain means_20251001171654.r +++ /dev/null @@ -1,614 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# 
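# [Editor's note] The options(contrasts = c("contr.sum", "contr.poly")) call
# above is load-bearing for the Type III tests later in this file: with R's
# default treatment contrasts, Type III sums of squares for effects involved
# in interactions are evaluated at the reference level rather than at the
# unweighted grand mean. A one-line guard (a sketch, not part of the
# original script) makes the requirement explicit and fails fast if the
# session is misconfigured:
stopifnot(identical(getOption("contrasts"), c("contr.sum", "contr.poly")))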
Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. 
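# [Editor's note] ezANOVA() further down requires complete within-subject
# data, so it is worth flagging participants who are missing any of the
# 8 TIME x DOMAIN cells before modelling. A minimal sketch using the
# long_data_clean object built above (8 = 2 TIME levels x 4 DOMAIN levels):
incomplete_ids <- long_data_clean %>%
  dplyr::count(pID, name = "n_cells") %>%
  dplyr::filter(n_cells != 8)
if (nrow(incomplete_ids) > 0) {
  print("Participants missing TIME x DOMAIN cells:")
  print(incomplete_ids)
} else {
  print("All participants have complete TIME x DOMAIN data.")
}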
Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = 
TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, 
long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -print(mixed_anova_model$ANOVA) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -print("Generalized Eta Squared:") -print(round(effect_sizes, 5)) - -# 
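# [Editor's note] afex is loaded at the top of this script "for aov_ez" but
# never actually called. A minimal sketch of the equivalent afex fit, which
# returns the sphericity-corrected tests and generalized eta squared in a
# single table and can serve as a cross-check on the ezANOVA output above
# (treat the exact call as illustrative rather than the script's method):
afex_model <- afex::aov_ez(
  id      = "pID",
  dv      = "MEAN_DIFFERENCE",
  data    = long_data_clean,
  between = "TEMPORAL_DO",
  within  = c("TIME", "DOMAIN"),
  anova_table = list(es = "ges", correction = "GG")
)
print(afex_model)
# emmeans() also accepts this afex_aov object directly, which would be an
# alternative route into the post-hoc section below.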
Note: Type III ANOVA with sphericity corrections is appropriate for this balanced design. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the 
three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. 
TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - diff --git a/.history/eohi1/mixed anova - domain means_20251001171824.r b/.history/eohi1/mixed anova - domain means_20251001171824.r deleted file mode 100644 index ac148bb..0000000 --- a/.history/eohi1/mixed anova - domain means_20251001171824.r +++ /dev/null @@ -1,615 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# 
Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. 
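# [Editor's note] One caveat on the bootstrap_hartley_critical() helper
# defined further down this file: it pulls each group's values with
# data[data[[group_var]] == .x, response_var]. The helper is called on a
# tibble (via pick()), and single-bracket column indexing on a tibble keeps
# a one-column tibble rather than dropping to a vector, so the subsequent
# is.na()/sample() steps lean on edge-case indexing behaviour. The
# unambiguous idiom would be double-bracket extraction first, e.g.:
#   group_data <- data[[response_var]][data[[group_var]] == .x]
# (With a plain base-R data.frame, single-bracket indexing drops to a
# vector, which is likely why this pattern appears to work here.)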
Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = 
TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, 
long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -print(mixed_anova_model$ANOVA) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized 
Eta Squared:") -print(effect_sizes) - -# Note: Type III ANOVA with sphericity corrections is appropriate for this balanced design. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - 
-# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. 
TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - diff --git a/.history/eohi1/mixed anova - domain means_20251001171840.r b/.history/eohi1/mixed anova - domain means_20251001171840.r deleted file mode 100644 index ac148bb..0000000 --- a/.history/eohi1/mixed anova - domain means_20251001171840.r +++ /dev/null @@ -1,615 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# 
Verify the specific variables we need
-required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required domain mean variables found!")
-}
-
-# Define domain mapping
-domain_mapping <- data.frame(
-  variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-               "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
-  time = c(rep("Past", 4), rep("Future", 4)),
-  domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
-  stringsAsFactors = FALSE
-)
-
-# Domain mapping created
-
-# Efficient data pivoting using pivot_longer
-long_data <- data %>%
-  select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>%
-  pivot_longer(
-    cols = all_of(required_vars),
-    names_to = "variable",
-    values_to = "MEAN_DIFFERENCE"
-  ) %>%
-  left_join(domain_mapping, by = "variable") %>%
-  # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping)
-  mutate(
-    TIME = factor(time, levels = c("Past", "Future")),
-    DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")),
-    pID = as.factor(pID),
-    TEMPORAL_DO = as.factor(TEMPORAL_DO)
-  ) %>%
-  # Select final columns and remove any rows with missing values
-  select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>%
-  filter(!is.na(MEAN_DIFFERENCE))
-
-print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x ")))
-print(paste("Number of participants:", length(unique(long_data$pID))))
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME and DOMAIN
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5),
-    min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME and DOMAIN:")
-print(desc_stats)
-
-# Descriptive statistics by between-subjects factors
-desc_stats_by_temporal <- long_data %>%
-  group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:")
-print(desc_stats_by_temporal)
-
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing
-long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ]
-print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x ")))
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(MEAN_DIFFERENCE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME and DOMAIN:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test (streamlined)
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic,
-    ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# Round only the numeric columns
-normality_results_rounded <- normality_results %>%
-  mutate(across(where(is.numeric), ~ round(.x, 5)))
-print(normality_results_rounded)
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Helper to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: Test homogeneity across the BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-# First, check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
-cat("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===\n")
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # Calculate F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# Bootstrap function for Hartley's F-max test (resamples within each group)
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get the unique groups
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!rlang::sym(group_var)) %>%
-    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Extract each group's response values once as a plain vector (robust for tibbles)
-  group_data_list <- map(groups, ~ {
-    group_data <- data[[response_var]][data[[group_var]] == .x]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-cat("\n=== HARTLEY'S F-MAX TEST RESULTS ===\n")
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check data dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$pID))))
-print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (",
-            length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (",
-            length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = ""))
-
-# Check for complete cases
-complete_cases <- sum(complete.cases(long_data_clean))
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean)))
-
-# Check if design is balanced
-design_balance <- table(long_data_clean$pID,
-                        long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance %in% c(0, 1))) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN combinations")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-cat("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===\n")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = MEAN_DIFFERENCE,
-                             wid = pID,
-                             between = TEMPORAL_DO,
-                             within = .(TIME, DOMAIN),
-                             type = 3,
-                             detailed = TRUE)
-
-print("ANOVA Results:")
-print(mixed_anova_model$ANOVA)
-
-# Show Mauchly's test for sphericity
-cat("\nMauchly's Test of Sphericity:\n")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  cat("\nGreenhouse-Geisser and Huynh-Feldt Corrections:\n")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  # Match each corrected effect to its row in the ANOVA table once
-  idx <- match(sphericity_corr$Effect, anova_table$Effect)
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[idx],
-    Original_DFd = anova_table$DFd[idx],
-    GG_DFn = anova_table$DFn[idx] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[idx] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[idx] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[idx] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  cat("\n=== CORRECTED F-TESTS ===\n")
-  for(i in seq_len(nrow(corrected_df))) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original:     F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  cat("\nNote: Sphericity corrections not needed (sphericity assumption met)\n")
-}
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-cat("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===\n")
-
-# Extract generalized eta squared from ezANOVA (already calculated)
-effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")]
-effect_sizes$ges <- round(effect_sizes$ges, 5)
-print("Generalized Eta Squared:")
-print(effect_sizes)
-
-# Note: Type III ANOVA with sphericity corrections is appropriate for this balanced design.
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-cat("\n=== POST-HOC COMPARISONS ===\n")
-
-# Create an aov model for emmeans (emmeans needs a fitted model object such as aov;
-# ezANOVA returns only summary tables)
-aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                 data = long_data_clean)
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print("Estimated Marginal Means:")
-print(time_emmeans)
-cat("\nPairwise Contrasts:\n")
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-cat("\nMain Effect of DOMAIN:\n")
-domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
-print("Estimated Marginal Means:")
-print(domain_emmeans)
-cat("\nPairwise Contrasts:\n")
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-cat("\nMain Effect of TEMPORAL_DO:\n")
-temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TEMPORAL_DO × TIME Interaction
-cat("\n=== TEMPORAL_DO × TIME INTERACTION ===\n")
-temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-cat("\nSimple Effects of TIME within each TEMPORAL_DO:\n")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-
-cat("\nSimple Effects of TEMPORAL_DO within each TIME:\n")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# TIME × DOMAIN Interaction
-cat("\n=== TIME × DOMAIN INTERACTION ===\n")
-time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-cat("\nSimple Effects of DOMAIN within each TIME:\n")
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-
-cat("\nSimple Effects of TIME within each DOMAIN:\n")
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(time_domain_simple2)
-
-# TEMPORAL_DO × DOMAIN Interaction
-cat("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===\n")
-temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN)
-print("Estimated Marginal Means:")
-print(temporal_domain_emmeans)
-
-cat("\nSimple Effects of DOMAIN within each TEMPORAL_DO:\n")
-temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_simple)
-
-cat("\nSimple Effects of TEMPORAL_DO within each DOMAIN:\n")
-temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(temporal_domain_simple2)
-
-# =============================================================================
-# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS
-# =============================================================================
-
-# All pairwise comparisons for the three-way interaction
-cat("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===\n")
-three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-print("Estimated Marginal Means for all combinations:")
-print(three_way_emmeans)
-
-# Pairwise comparisons within each TEMPORAL_DO × TIME combination
-cat("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:\n")
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni")
-print(three_way_contrasts)
-
-# =============================================================================
-# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS
-# =============================================================================
-
-# Cohen's d calculations (effsize library already loaded)
-
-cat("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===\n")
-
-# Function to calculate Cohen's d for pairwise comparisons
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if(nrow(significant_pairs) > 0) {
-    cat("Significant pairwise comparisons (p < 0.05):\n")
-    print(significant_pairs)
-
-    cat("\nCohen's d calculated from raw data:\n")
-
-    for(i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- as.character(comparison$contrast)
-
-      # Parse the contrast label of the form "level1 - level2"
-      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-      if(length(contrast_parts) == 2) {
-        level1 <- trimws(contrast_parts[1])
-        level2 <- trimws(contrast_parts[2])
-
-        # Get raw data for both conditions
-        if(group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][
-            data[[group1_var]] == level1 &
-            data[[group2_var]] == group2_level]
-
-          data2 <- data[[response_var]][
-            data[[group1_var]] == level2 &
-            data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if(length(data1) > 0 && length(data2) > 0) {
-          # Calculate Cohen's d using effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          cat(sprintf("Comparison: %s", contrast_name))
-          if(group2_var %in% colnames(comparison)) {
-            cat(sprintf(" | %s", group2_level))
-          }
-          cat(sprintf("\n  n1 = %d, n2 = %d\n", length(data1), length(data2)))
-          cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-          cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-          cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-          cat("\n")
-        }
-      }
-    }
-  } else {
-    cat("No significant pairwise comparisons found.\n")
-  }
-}
-
-# =============================================================================
-# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001)
-# =============================================================================
-
-cat("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===\n")
-
-# Get simple effects of TIME within each TEMPORAL_DO
-temporal_time_simple_df <- as.data.frame(temporal_time_simple)
-calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
-# Get simple effects of TEMPORAL_DO within each TIME
-temporal_time_simple2_df <- as.data.frame(temporal_time_simple2)
-calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012)
-# =============================================================================
-
-cat("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===\n")
-
-# Get simple effects of TIME within each DOMAIN
-time_domain_simple2_df <- as.data.frame(time_domain_simple2)
-calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE")
-
-# Get simple effects of DOMAIN within each TIME
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058)
-# =============================================================================
-
-cat("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===\n")
-
-# Get simple effects of TEMPORAL_DO within each DOMAIN
-temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2)
-calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE")
-
-# Get simple effects of DOMAIN within each TEMPORAL_DO
-temporal_domain_simple_df <- as.data.frame(temporal_domain_simple)
-calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
diff --git a/.history/eohi1/mixed anova - domain means_20251001172039.r b/.history/eohi1/mixed anova - domain means_20251001172039.r
deleted file mode 100644
index ac148bb..0000000
--- a/.history/eohi1/mixed anova - domain means_20251001172039.r
+++ /dev/null
@@ -1,615 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex)       # For aov_ez (cleaner ANOVA output)
-library(nortest)    # For normality tests
-library(emmeans)    # For post-hoc comparisons
-library(purrr)      # For map functions
-library(effsize)    # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x ")))
-print(paste("Number of participants:", length(unique(data$pID))))
-
-# 
Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. 
Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = 
TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, 
long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -print(mixed_anova_model$ANOVA) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized 
Eta Squared:") -print(effect_sizes) - -# Note: Type III ANOVA with sphericity corrections is appropriate for this balanced design. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - 
-# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. 
TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - diff --git a/.history/eohi1/mixed anova - domain means_20251001174736.r b/.history/eohi1/mixed anova - domain means_20251001174736.r deleted file mode 100644 index 4c4b463..0000000 --- a/.history/eohi1/mixed anova - domain means_20251001174736.r +++ /dev/null @@ -1,615 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# 
Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. 
Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = 
TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, 
long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -print(mixed_anova_model$ANOVA) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized 
Eta Squared:") -print(effect_sizes) - -# Note: Type III ANOVA with sphericity corrections is appropriate for this balanced design. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - 
-# =============================================================================
-# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS
-# =============================================================================
-
-# All pairwise comparisons for the three-way interaction
-print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===")
-three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-print("Estimated Marginal Means for all combinations:")
-print(three_way_emmeans)
-
-# Pairwise comparisons of TIME (Past vs Future) within each TEMPORAL_DO × DOMAIN combination
-print("\nPairwise comparisons of TIME (Past - Future) within each TEMPORAL_DO × DOMAIN combination:")
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni")
-print(three_way_contrasts)
-
-# =============================================================================
-# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS
-# =============================================================================
-
-# Cohen's d calculations (library already loaded)
-
-print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===")
-
-# Function to calculate Cohen's d for pairwise comparisons
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if(nrow(significant_pairs) > 0) {
-    cat("Significant pairwise comparisons (p < 0.05):\n")
-    print(significant_pairs)
-
-    cat("\nCohen's d calculated from raw data:\n")
-
-    for(i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- as.character(comparison$contrast)
-
-      # Parse the contrast
-      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-      if(length(contrast_parts) == 2) {
-        level1 <- trimws(contrast_parts[1])
-        level2 <- trimws(contrast_parts[2])
-
-        # Get raw data for both conditions
-        if(group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][
-            data[[group1_var]] == level1 &
-            data[[group2_var]] == group2_level]
-
-          data2 <- data[[response_var]][
-            data[[group1_var]] == level2 &
-            data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if(length(data1) > 0 && length(data2) > 0) {
-          # Calculate Cohen's d using effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          cat(sprintf("Comparison: %s", contrast_name))
-          if(group2_var %in% colnames(comparison)) {
-            cat(sprintf(" | %s", group2_level))
-          }
-          cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2)))
-          cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate))
-          cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude))
-          cat(sprintf(" p-value: %.5f\n", comparison$p.value))
-          cat("\n")
-        }
-      }
-    }
-  } else {
-    cat("No significant pairwise comparisons found.\n")
-  }
-}
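(Aside: cohen.d(data1, data2) in the helper above treats the two condition vectors as independent samples, which ignores the participant pairing for the within-subject TIME contrasts. A paired d is the more conventional estimate there; a sketch, assuming the two vectors can be aligned by participant. The ids1/ids2 vectors are hypothetical: the helper does not currently extract participant IDs.)

# Hypothetical paired variant for a within-subject contrast:
# align both score vectors by participant ID, then request paired = TRUE.
scores1 <- data1[order(ids1)]  # ids1/ids2: participant IDs matching data1/data2
scores2 <- data2[order(ids2)]  # (illustrative; not produced by the helper above)
cohen.d(scores1, scores2, paired = TRUE)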
-
-# =============================================================================
-# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===")
-
-# Get simple effects of TIME within each TEMPORAL_DO
-temporal_time_simple_df <- as.data.frame(temporal_time_simple)
-calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
-# Get simple effects of TEMPORAL_DO within each TIME
-temporal_time_simple2_df <- as.data.frame(temporal_time_simple2)
-calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===")
-
-# Get simple effects of TIME within each DOMAIN
-time_domain_simple2_df <- as.data.frame(time_domain_simple2)
-calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE")
-
-# Get simple effects of DOMAIN within each TIME
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===")
-
-# Get simple effects of TEMPORAL_DO within each DOMAIN
-temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2)
-calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE")
-
-# Get simple effects of DOMAIN within each TEMPORAL_DO
-temporal_domain_simple_df <- as.data.frame(temporal_domain_simple)
-calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
diff --git a/.history/eohi1/mixed anova - domain means_20251001174748.r b/.history/eohi1/mixed anova - domain means_20251001174748.r
deleted file mode 100644
index 4c4b463..0000000
--- a/.history/eohi1/mixed anova - domain means_20251001174748.r
+++ /dev/null
@@ -1,615 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex) # For aov_ez (cleaner ANOVA output)
-library(nortest) # For normality tests
-library(emmeans) # For post-hoc comparisons
-library(purrr) # For map functions
-library(effsize) # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x ")))
-print(paste("Number of participants:", length(unique(data$pID))))
-
-# Verify the specific variables we need
-required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required domain mean variables found!")
-}
-
-# Define domain mapping
-domain_mapping <- data.frame(
-  variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-               "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
-  time = c(rep("Past", 4), rep("Future", 4)),
-  domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
-  stringsAsFactors = FALSE
-)
-
-# Domain mapping created
-
-# Efficient data pivoting using pivot_longer
-long_data <- data %>%
-  select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>%
-  pivot_longer(
-    cols = all_of(required_vars),
-    names_to = "variable",
-    values_to = "MEAN_DIFFERENCE"
-  ) %>%
-  left_join(domain_mapping, by = "variable") %>%
-  # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping)
-  mutate(
-    TIME = factor(time, levels = c("Past", "Future")),
-    DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")),
-    pID = as.factor(pID),
-    TEMPORAL_DO = as.factor(TEMPORAL_DO)
-  ) %>%
-  # Select final columns and remove any rows with missing values
-  select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>%
-  filter(!is.na(MEAN_DIFFERENCE))
-
-print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x ")))
-print(paste("Number of participants:", length(unique(long_data$pID))))
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME and DOMAIN
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5),
-    min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME and DOMAIN:")
-print(desc_stats)
-
-# Descriptive statistics by between-subjects factors
-desc_stats_by_temporal <- long_data %>%
-  group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:")
-print(desc_stats_by_temporal)
-
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing
-long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ]
-print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x ")))
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(MEAN_DIFFERENCE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME and DOMAIN:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test (streamlined)
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic,
-    ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# Round only the numeric columns
-normality_results_rounded <- normality_results %>%
-  mutate(across(where(is.numeric), ~ round(.x, 5)))
-print(normality_results_rounded)
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-# (Defined for reference; the summaries below compute the ratio inline.)
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-# First, let's check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # Calculate F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# More efficient bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!rlang::sym(group_var)) %>%
-    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once (extract the response column as a plain vector, so that
-  # sample() below resamples observations even when 'data' arrives as a tibble)
-  group_data_list <- map(groups, ~ {
-    group_data <- data[[response_var]][data[[group_var]] == .x]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-print("\n=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
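(Aside: a quick cross-check against the common rule of thumb that variance ratios under roughly 3 are unproblematic when group sizes are near-equal; heuristic only, the bootstrap critical values above remain the primary check.)

# Flag any TIME × DOMAIN cell whose observed F-max ratio exceeds 3
subset(hartley_temporal_results, observed_ratio > 3)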
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check data dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$pID))))
-print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (",
-            length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (",
-            length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = ""))
-
-# Check for complete cases
-complete_cases <- sum(complete.cases(long_data_clean))
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean)))
-
-# Check if design is balanced
-# Every participant should contribute exactly one observation per TIME × DOMAIN cell;
-# a 0 in any cell means incomplete data, so test against 1 rather than c(0, 1)
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance == 1)) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN combinations")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = MEAN_DIFFERENCE,
-                             wid = pID,
-                             between = TEMPORAL_DO,
-                             within = .(TIME, DOMAIN),
-                             type = 3,
-                             detailed = TRUE)
-
-print("ANOVA Results:")
-print(mixed_anova_model$ANOVA)
-
-# Show Mauchly's test for sphericity
-print("\nMauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  cat("\n=== CORRECTED F-TESTS ===\n")
-  for(i in seq_len(nrow(corrected_df))) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf(" Original: F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  print("\nNote: Sphericity corrections not needed (sphericity assumption met)")
-}
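(Aside: the script loads afex for aov_ez but fits the model with ezANOVA. For reference, a roughly equivalent afex call, with the same Type III sums of squares and Greenhouse-Geisser-corrected p-values reported by default, might look like this; a sketch, not part of the deleted file.)

# Sketch of the afex equivalent; column names as in long_data_clean above.
afex_model <- aov_ez(
  id = "pID",
  dv = "MEAN_DIFFERENCE",
  data = long_data_clean,
  between = "TEMPORAL_DO",
  within = c("TIME", "DOMAIN"),
  type = 3
)
print(afex_model)
# afex models also work directly with emmeans, e.g. emmeans(afex_model, ~ TIME)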
Eta Squared:") -print(effect_sizes) - -# Note: Type III ANOVA with sphericity corrections is appropriate for this balanced design. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - 
-# =============================================================================
-# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS
-# =============================================================================
-
-# All pairwise comparisons for the three-way interaction
-print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===")
-three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-print("Estimated Marginal Means for all combinations:")
-print(three_way_emmeans)
-
-# Pairwise comparisons of TIME (Past vs Future) within each TEMPORAL_DO × DOMAIN combination
-print("\nPairwise comparisons of TIME (Past - Future) within each TEMPORAL_DO × DOMAIN combination:")
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni")
-print(three_way_contrasts)
-
-# =============================================================================
-# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS
-# =============================================================================
-
-# Cohen's d calculations (library already loaded)
-
-print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===")
-
-# Function to calculate Cohen's d for pairwise comparisons
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if(nrow(significant_pairs) > 0) {
-    cat("Significant pairwise comparisons (p < 0.05):\n")
-    print(significant_pairs)
-
-    cat("\nCohen's d calculated from raw data:\n")
-
-    for(i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- as.character(comparison$contrast)
-
-      # Parse the contrast
-      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-      if(length(contrast_parts) == 2) {
-        level1 <- trimws(contrast_parts[1])
-        level2 <- trimws(contrast_parts[2])
-
-        # Get raw data for both conditions
-        if(group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][
-            data[[group1_var]] == level1 &
-            data[[group2_var]] == group2_level]
-
-          data2 <- data[[response_var]][
-            data[[group1_var]] == level2 &
-            data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if(length(data1) > 0 && length(data2) > 0) {
-          # Calculate Cohen's d using effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          cat(sprintf("Comparison: %s", contrast_name))
-          if(group2_var %in% colnames(comparison)) {
-            cat(sprintf(" | %s", group2_level))
-          }
-          cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2)))
-          cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate))
-          cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude))
-          cat(sprintf(" p-value: %.5f\n", comparison$p.value))
-          cat("\n")
-        }
-      }
-    }
-  } else {
-    cat("No significant pairwise comparisons found.\n")
-  }
-}
-
-# =============================================================================
-# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===")
-
-# Get simple effects of TIME within each TEMPORAL_DO
-temporal_time_simple_df <- as.data.frame(temporal_time_simple)
-calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
-# Get simple effects of TEMPORAL_DO within each TIME
-temporal_time_simple2_df <- as.data.frame(temporal_time_simple2)
-calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===")
-
-# Get simple effects of TIME within each DOMAIN
-time_domain_simple2_df <- as.data.frame(time_domain_simple2)
-calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE")
-
-# Get simple effects of DOMAIN within each TIME
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===")
-
-# Get simple effects of TEMPORAL_DO within each DOMAIN
-temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2)
-calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE")
-
-# Get simple effects of DOMAIN within each TEMPORAL_DO
-temporal_domain_simple_df <- as.data.frame(temporal_domain_simple)
-calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
diff --git a/.history/eohi1/mixed anova - domain means_20251001174749.r b/.history/eohi1/mixed anova - domain means_20251001174749.r
deleted file mode 100644
index 4c4b463..0000000
--- a/.history/eohi1/mixed anova - domain means_20251001174749.r
+++ /dev/null
diff --git a/.history/eohi1/mixed anova - domain means_20251003122510.r b/.history/eohi1/mixed anova - domain means_20251003122510.r
deleted file mode 100644
index 07396db..0000000
--- a/.history/eohi1/mixed anova - domain means_20251003122510.r
+++ /dev/null
long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- 
mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# Note: Type III ANOVA with sphericity corrections is appropriate for this balanced design. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE 
THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons of TIME (Past vs Future) within each TEMPORAL_DO × DOMAIN combination -print("\nPairwise comparisons of TIME (Past - Future) within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. 
TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - diff --git a/.history/eohi1/mixed anova - domain means_20251003122522.r b/.history/eohi1/mixed anova - domain means_20251003122522.r deleted file mode 100644 index 613ce86..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003122522.r +++ /dev/null @@ -1,638 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# 
Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. 
Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = 
TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique group levels - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once; extract with [[ ]] so this works for both data.frames - # and tibbles (a tibble's [rows, col] would return a one-column tibble, not a vector) - group_data_list <- map(groups, ~ { - group_data <- data[[response_var]][data[[group_var]] == .x] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -cat("\n=== HARTLEY'S F-MAX TEST RESULTS ===\n") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean)))
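# --- Editor's note: a hedged usage sketch, not part of the original analysis.
# It shows the call pattern and return shape of bootstrap_hartley_critical()
# on simulated two-group data; `toy`, its columns (grp, y), and every number
# below are invented for illustration and are NOT taken from exp1.csv.
set.seed(42)
toy <- data.frame(
  grp = rep(c("A", "B"), each = 50),
  y   = c(rnorm(50, sd = 1), rnorm(50, sd = 1.5))
)
toy_result <- bootstrap_hartley_critical(toy, "grp", "y", n_iter = 500)
toy_result$observed_ratio   # observed variance ratio (population ratio here is 2.25)
toy_result$critical_95      # bootstrap 95th-percentile critical value for that ratio
-# Check if design is balanced -design_balance <- table(long_data_clean$pID, 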
long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- 
mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# Note: Type III ANOVA with sphericity corrections is appropriate for this balanced design. - -# ============================================================================= -# SUMMARY OF KEY FINDINGS -# ============================================================================= - -print("\n=== SUMMARY OF KEY FINDINGS ===") -cat("SIGNIFICANT EFFECTS (p < 0.05):\n") -cat("• TIME main effect: F(1,1061) = 78.80, p < 0.001, η²G = 0.008\n") -cat("• DOMAIN main effect: F(3,3183) = 206.90, p < 0.001, η²G = 0.059\n") -cat("• TEMPORAL_DO × TIME interaction: F(1,1061) = 10.97, p = 0.001, η²G = 0.001\n") -cat("• TIME × DOMAIN interaction: F(3,3183) = 3.65, p = 0.012, η²G = 0.001\n") -cat("\nMARGINAL EFFECTS (p < 0.10):\n") -cat("• TEMPORAL_DO × DOMAIN interaction: F(3,3183) = 2.50, p = 0.058, η²G = 0.001\n") -cat("\nNON-SIGNIFICANT EFFECTS:\n") -cat("• TEMPORAL_DO main effect: F(1,1061) = 0.41, p = 0.524, η²G < 0.001\n") -cat("• Three-way interaction: F(3,3183) = 0.77, p = 0.511, η²G < 0.001\n") -cat("\nEFFECT SIZE INTERPRETATION (η²G):\n") -cat("• Large: η²G > 0.14\n") -cat("• Medium: η²G > 0.06\n") -cat("• Small: η²G > 0.01\n") -cat("• Negligible: η²G ≤ 0.01\n") - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") 
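# --- Editor's note (hedged): the EMMs printed below average over the factor
# not shown here (TEMPORAL_DO) with equal cell weights, emmeans' default.
# Note also that the Bonferroni adjustment in the pairs() calls that follow
# is applied separately within each `by` group, not jointly across all of
# the simple-effects tests.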
-print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons of TIME (Past vs Future) within each TEMPORAL_DO × DOMAIN combination -print("\nPairwise comparisons of TIME (Past - Future) within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) 
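          # --- Editor's note (hedged): effsize::cohen.d(data1, data2) above treats
          # the two cells as independent samples (pooled-SD d). For contrasts on the
          # within-subject factors (TIME, DOMAIN) this ignores the repeated-measures
          # pairing, so these d values are best read as descriptive effect sizes;
          # a paired d would require matching the two observations by pID first.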
- if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - diff --git a/.history/eohi1/mixed anova - domain means_20251003122534.r b/.history/eohi1/mixed anova - domain means_20251003122534.r deleted file mode 100644 index 613ce86..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003122534.r +++ /dev/null @@ -1,638 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final 
columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- 
map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -cat("\n=== HARTLEY'S F-MAX TEST RESULTS ===\n") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced; all cells must equal exactly 1, since a mix of -# 0s and 1s would mean missing cells rather than a balanced design -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance == 1)) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced (missing cells or duplicate observations)") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -cat("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===\n") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -cat("\nMauchly's Test of Sphericity:\n") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) 
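# --- Editor's note (hedged): epsilon (GGe / HFe below) indexes the departure
# from sphericity: 1 means the assumption holds exactly, and the lower bound
# is 1/(k - 1), i.e. 1/3 for the 4-level DOMAIN factor. A common rule of
# thumb is to report Greenhouse-Geisser when epsilon < 0.75 and Huynh-Feldt
# otherwise. Two-level factors such as TIME satisfy sphericity trivially, so
# only effects involving DOMAIN need correction here.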
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# Note: Type III ANOVA with sphericity corrections is appropriate for this balanced design. 
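afex is attached above "for aov_ez" but never called. If a cross-check of the ezANOVA table is wanted, aov_ez reports generalized eta squared and sphericity-corrected tests in a single call; a sketch, assuming afex's documented aov_ez()/anova() interface and the same factors as above:

# Cross-check of the ezANOVA results with afex (sketch)
fit_afex <- aov_ez(id = "pID", dv = "MEAN_DIFFERENCE", data = long_data_clean,
                   between = "TEMPORAL_DO", within = c("TIME", "DOMAIN"),
                   type = 3)
anova(fit_afex, es = "ges", correction = "GG")  # GG-corrected F tests plus ges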
- -# ============================================================================= -# SUMMARY OF KEY FINDINGS -# ============================================================================= - -print("\n=== SUMMARY OF KEY FINDINGS ===") -cat("SIGNIFICANT EFFECTS (p < 0.05):\n") -cat("• TIME main effect: F(1,1061) = 78.80, p < 0.001, η²G = 0.008\n") -cat("• DOMAIN main effect: F(3,3183) = 206.90, p < 0.001, η²G = 0.059\n") -cat("• TEMPORAL_DO × TIME interaction: F(1,1061) = 10.97, p = 0.001, η²G = 0.001\n") -cat("• TIME × DOMAIN interaction: F(3,3183) = 3.65, p = 0.012, η²G = 0.001\n") -cat("\nMARGINAL EFFECTS (p < 0.10):\n") -cat("• TEMPORAL_DO × DOMAIN interaction: F(3,3183) = 2.50, p = 0.058, η²G = 0.001\n") -cat("\nNON-SIGNIFICANT EFFECTS:\n") -cat("• TEMPORAL_DO main effect: F(1,1061) = 0.41, p = 0.524, η²G < 0.001\n") -cat("• Three-way interaction: F(3,3183) = 0.77, p = 0.511, η²G < 0.001\n") -cat("\nEFFECT SIZE INTERPRETATION (η²G):\n") -cat("• Large: η²G > 0.14\n") -cat("• Medium: η²G > 0.06\n") -cat("• Small: η²G > 0.01\n") -cat("• Negligible: η²G ≤ 0.01\n") - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each 
DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons of TIME (Past vs Future) within each TEMPORAL_DO × DOMAIN combination -print("\nPairwise comparisons of TIME (Past - Future) within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect 
size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - diff --git a/.history/eohi1/mixed anova - domain means_20251003122601.r b/.history/eohi1/mixed anova - domain means_20251003122601.r deleted file mode 100644 index 58552be..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003122601.r +++ /dev/null @@ -1,638 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final 
columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- 
map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) 
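The epsilon estimates printed below are taken from ezANOVA, but for intuition the Greenhouse-Geisser estimate is easy to reproduce in the one-way repeated-measures case: project the within-subject covariance matrix onto an orthonormal contrast basis and compare its trace with its squared entries. A minimal sketch, assuming Y is an n x k wide-format matrix holding each participant's k within-subject cell scores:

# Greenhouse-Geisser epsilon for a single within-subject factor (sketch)
gg_epsilon <- function(Y) {
  k <- ncol(Y)
  C <- qr.Q(qr(contr.helmert(k)))         # orthonormal contrast basis, k x (k - 1)
  D <- t(C) %*% cov(Y) %*% C              # covariance projected into contrast space
  sum(diag(D))^2 / ((k - 1) * sum(D^2))   # tr(D)^2 / ((k - 1) * tr(D %*% D))
}
# Values near 1 indicate sphericity roughly holds; the lower bound is 1 / (k - 1).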
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# Note: Type III ANOVA with sphericity corrections is appropriate for this balanced design. 
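effectsize is attached above "for effect size calculations" but never used. If an independent check on the ges column is wanted, its eta_squared() function can be pointed at the aov model created in the POST-HOC COMPARISONS section further down; a sketch, under the assumption that the generalized argument behaves as documented for aov/aovlist objects:

# Hypothetical cross-check of generalized eta squared (assumed effectsize API;
# aov_model is fitted later, in the post-hoc section)
eta_squared(aov_model, generalized = TRUE)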
- -# ============================================================================= -# SUMMARY OF KEY FINDINGS -# ============================================================================= - -print("\n=== SUMMARY OF KEY FINDINGS ===") -cat("SIGNIFICANT EFFECTS (p < 0.05):\n") -cat("• TIME main effect: F(1,1061) = 78.80, p < 0.001, η²G = 0.008\n") -cat("• DOMAIN main effect: F(3,3183) = 206.90, p < 0.001, η²G = 0.059\n") -cat("• TEMPORAL_DO × TIME interaction: F(1,1061) = 10.97, p = 0.001, η²G = 0.001\n") -cat("• TIME × DOMAIN interaction: F(3,3183) = 3.65, p = 0.012, η²G = 0.001\n") -cat("\nMARGINAL EFFECTS (p < 0.10):\n") -cat("• TEMPORAL_DO × DOMAIN interaction: F(3,3183) = 2.50, p = 0.058, η²G = 0.001\n") -cat("\nNON-SIGNIFICANT EFFECTS:\n") -cat("• TEMPORAL_DO main effect: F(1,1061) = 0.41, p = 0.524, η²G = 0.00015\n") -cat("• Three-way interaction: F(3,3183) = 0.77, p = 0.511, η²G = 0.00013\n") -cat("\nEFFECT SIZE INTERPRETATION (η²G):\n") -cat("• Large: η²G > 0.14\n") -cat("• Medium: η²G > 0.06\n") -cat("• Small: η²G > 0.01\n") -cat("• Negligible: η²G ≤ 0.01\n") - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each 
DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons of TIME (Past vs Future) within each TEMPORAL_DO × DOMAIN combination -print("\nPairwise comparisons of TIME (Past - Future) within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect 
size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - diff --git a/.history/eohi1/mixed anova - domain means_20251003122606.r b/.history/eohi1/mixed anova - domain means_20251003122606.r deleted file mode 100644 index 58552be..0000000 diff --git a/.history/eohi1/mixed anova - domain means_20251003122621.r b/.history/eohi1/mixed anova - domain means_20251003122621.r deleted file mode 100644 index b76a792..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003122621.r +++ /dev/null @@ -1,648 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final 
columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- 
map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) 
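-# (Added note, not in the original script: both corrections scale the
-# within-subjects degrees of freedom by an epsilon estimated from the
-# covariance structure of the repeated measures; Greenhouse-Geisser epsilon
-# is never larger than Huynh-Feldt, so GG is the more conservative default
-# when the two disagree.)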
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# Note: Type III ANOVA with sphericity corrections is appropriate for this balanced design. 
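-
-# Hedged cross-check (added for illustration; not part of the original
-# analysis): the effectsize package loaded above can recompute generalized
-# eta squared from an aov() refit of the same design. ges_check_model is an
-# assumed name; the values should closely match ezANOVA's ges column.
-ges_check_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN +
-                         Error(pID/(TIME * DOMAIN)),
-                       data = long_data_clean)
-print(effectsize::eta_squared(ges_check_model, generalized = TRUE))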
- -# ============================================================================= -# SUMMARY OF KEY FINDINGS -# ============================================================================= - -print("\n=== SUMMARY OF KEY FINDINGS ===") -cat("SIGNIFICANT EFFECTS (p < 0.05):\n") -cat("• TIME main effect: F(1,1061) = 78.80, p < 0.001, η²G = 0.008\n") -cat("• DOMAIN main effect: F(3,3183) = 206.90, p < 0.001, η²G = 0.059\n") -cat("• TEMPORAL_DO × TIME interaction: F(1,1061) = 10.97, p = 0.001, η²G = 0.001\n") -cat("• TIME × DOMAIN interaction: F(3,3183) = 3.65, p = 0.012, η²G = 0.001\n") -cat("\nMARGINAL EFFECTS (p < 0.10):\n") -cat("• TEMPORAL_DO × DOMAIN interaction: F(3,3183) = 2.50, p = 0.058, η²G = 0.001\n") -cat("\nNON-SIGNIFICANT EFFECTS:\n") -cat("• TEMPORAL_DO main effect: F(1,1061) = 0.41, p = 0.524, η²G = 0.00015\n") -cat("• Three-way interaction: F(3,3183) = 0.77, p = 0.511, η²G = 0.00013\n") -cat("\nEFFECT SIZE INTERPRETATION (η²G):\n") -cat("• Large: η²G > 0.14\n") -cat("• Medium: η²G > 0.06\n") -cat("• Small: η²G > 0.01\n") -cat("• Negligible: η²G ≤ 0.01\n") - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each 
DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons of TIME (Past vs Future) within each TEMPORAL_DO × DOMAIN combination -print("\nPairwise comparisons of TIME (Past - Future) within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect 
size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -calculate_cohens_d_for_pairs(three_way_contrasts_df, long_data_clean, "TIME", c("TEMPORAL_DO", "DOMAIN"), "MEAN_DIFFERENCE") - diff --git a/.history/eohi1/mixed anova - domain means_20251003122630.r b/.history/eohi1/mixed anova - domain means_20251003122630.r deleted file mode 100644 index b76a792..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003122630.r +++ /dev/null @@ -1,648 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", 
length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- 
map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) 
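-# (Added worked example with hypothetical numbers: for a within-subjects
-# effect with DFn = 3, DFd = 3183 and an estimated GG epsilon of 0.95, the
-# corrected test is read off F(3 * 0.95, 3183 * 0.95) = F(2.85, 3023.85);
-# the F statistic itself is unchanged, only the reference distribution moves.)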
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# Note: Type III ANOVA with sphericity corrections is appropriate for this balanced design. 
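-
-# Hedged aside (added; not in the original script): emmeans::eff_size()
-# offers a model-based route to Cohen's d, as an alternative to the raw-data
-# helper defined further below. This sketch uses a plain fixed-effects refit,
-# since the Error() model has multiple error strata and no single residual
-# sigma; flat_fit and flat_emm are assumed names, illustrative only.
-flat_fit <- lm(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN,
-               data = long_data_clean)
-flat_emm <- emmeans(flat_fit, ~ TIME)
-print(eff_size(flat_emm, sigma = sigma(flat_fit), edf = df.residual(flat_fit)))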
- -# ============================================================================= -# SUMMARY OF KEY FINDINGS -# ============================================================================= - -print("\n=== SUMMARY OF KEY FINDINGS ===") -cat("SIGNIFICANT EFFECTS (p < 0.05):\n") -cat("• TIME main effect: F(1,1061) = 78.80, p < 0.001, η²G = 0.008\n") -cat("• DOMAIN main effect: F(3,3183) = 206.90, p < 0.001, η²G = 0.059\n") -cat("• TEMPORAL_DO × TIME interaction: F(1,1061) = 10.97, p = 0.001, η²G = 0.001\n") -cat("• TIME × DOMAIN interaction: F(3,3183) = 3.65, p = 0.012, η²G = 0.001\n") -cat("\nMARGINAL EFFECTS (p < 0.10):\n") -cat("• TEMPORAL_DO × DOMAIN interaction: F(3,3183) = 2.50, p = 0.058, η²G = 0.001\n") -cat("\nNON-SIGNIFICANT EFFECTS:\n") -cat("• TEMPORAL_DO main effect: F(1,1061) = 0.41, p = 0.524, η²G = 0.00015\n") -cat("• Three-way interaction: F(3,3183) = 0.77, p = 0.511, η²G = 0.00013\n") -cat("\nEFFECT SIZE INTERPRETATION (η²G):\n") -cat("• Large: η²G > 0.14\n") -cat("• Medium: η²G > 0.06\n") -cat("• Small: η²G > 0.01\n") -cat("• Negligible: η²G ≤ 0.01\n") - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each 
DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons of TIME (Past vs Future) within each TEMPORAL_DO × DOMAIN combination -print("\nPairwise comparisons of TIME (Past - Future) within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect 
size interpretation: %s\n", cohens_d_result$magnitude))
-          cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-          cat("\n")
-        }
-      }
-    }
-  } else {
-    cat("No significant pairwise comparisons found.\n")
-  }
-}
-
-# =============================================================================
-# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===")
-
-# Get simple effects of TIME within each TEMPORAL_DO
-temporal_time_simple_df <- as.data.frame(temporal_time_simple)
-calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
-# Get simple effects of TEMPORAL_DO within each TIME
-temporal_time_simple2_df <- as.data.frame(temporal_time_simple2)
-calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===")
-
-# Get simple effects of TIME within each DOMAIN
-time_domain_simple2_df <- as.data.frame(time_domain_simple2)
-calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE")
-
-# Get simple effects of DOMAIN within each TIME
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===")
-
-# Get simple effects of TEMPORAL_DO within each DOMAIN
-temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2)
-calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE")
-
-# Get simple effects of DOMAIN within each TEMPORAL_DO
-temporal_domain_simple_df <- as.data.frame(temporal_domain_simple)
-calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 4. THREE-WAY INTERACTION COHEN'S D
-# =============================================================================
-
-print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===")
-
-# Get pairwise comparisons for the three-way interaction
-three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-calculate_cohens_d_for_pairs(three_way_contrasts_df, long_data_clean, "TIME", c("TEMPORAL_DO", "DOMAIN"), "MEAN_DIFFERENCE")
-
diff --git a/.history/eohi1/mixed anova - domain means_20251003122640.r b/.history/eohi1/mixed anova - domain means_20251003122640.r
deleted file mode 100644
index b76a792..0000000
--- a/.history/eohi1/mixed anova - domain means_20251003122640.r
+++ /dev/null
@@ -1,648 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex)       # For aov_ez (cleaner ANOVA output)
-library(nortest)    # For normality tests
-library(emmeans)    # For post-hoc comparisons
-library(purrr)      # For map functions
-library(effsize)    # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x")))
-print(paste("Number of participants:", length(unique(data$pID))))
-
-# Verify the specific variables we need
-required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required domain mean variables found!")
-}
-
-# Define domain mapping
-domain_mapping <- data.frame(
-  variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-               "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
-  time = c(rep("Past", 4), rep("Future", 4)),
-  domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
-  stringsAsFactors = FALSE
-)
-
-# Domain mapping created
-
-# Efficient data pivoting using pivot_longer
-long_data <- data %>%
-  select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>%
-  pivot_longer(
-    cols = all_of(required_vars),
-    names_to = "variable",
-    values_to = "MEAN_DIFFERENCE"
-  ) %>%
-  left_join(domain_mapping, by = "variable") %>%
-  # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping)
-  mutate(
-    TIME = factor(time, levels = c("Past", "Future")),
-    DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")),
-    pID = as.factor(pID),
-    TEMPORAL_DO = as.factor(TEMPORAL_DO)
-  ) %>%
-  # Select final columns and remove any rows with missing values
-  select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>%
-  filter(!is.na(MEAN_DIFFERENCE))
-
-print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x")))
-print(paste("Number of participants:", length(unique(long_data$pID))))
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME and DOMAIN
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5),
-    min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME and DOMAIN:")
-print(desc_stats)
-
-# Descriptive statistics by between-subjects factors
-desc_stats_by_temporal <- long_data %>%
-  group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:")
-print(desc_stats_by_temporal)
-
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing
-long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ]
-print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x")))
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(MEAN_DIFFERENCE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME and DOMAIN:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test (streamlined)
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic,
-    ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# Round only the numeric columns
-normality_results_rounded <- normality_results %>%
-  mutate(across(where(is.numeric), ~ round(.x, 5)))
-print(normality_results_rounded)
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-# First, let's check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # Calculate F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# More efficient bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!rlang::sym(group_var)) %>%
-    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once
-  group_data_list <- map(groups, ~ {
-    group_data <- data[data[[group_var]] == .x, response_var]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-print("\n=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check data dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$pID))))
-print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (",
-            length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (",
-            length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = ""))
-
-# Check for complete cases
-complete_cases <- sum(complete.cases(long_data_clean))
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean)))
-
-# Check if design is balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance %in% c(0, 1))) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN combinations")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = MEAN_DIFFERENCE,
-                             wid = pID,
-                             between = TEMPORAL_DO,
-                             within = .(TIME, DOMAIN),
-                             type = 3,
-                             detailed = TRUE)
-
-print("ANOVA Results:")
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL # Reset row numbers to be sequential
-print(anova_output)
-
-# Show Mauchly's test for sphericity
-print("\nMauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  cat("\n=== CORRECTED F-TESTS ===\n")
-  for(i in seq_len(nrow(corrected_df))) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original: F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  print("\nNote: Sphericity corrections not needed (sphericity assumption met)")
-}
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Extract generalized eta squared from ezANOVA (already calculated)
-effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")]
-effect_sizes$ges <- round(effect_sizes$ges, 5)
-print("Generalized Eta Squared:")
-print(effect_sizes)
-
-# Note: Type III ANOVA with sphericity corrections is appropriate for this balanced design.
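# A minimal cross-check sketch, not part of the deleted snapshot: the script
# loads afex "for aov_ez" but never calls it. Assuming the long_data_clean
# object built above, aov_ez reproduces the same Type III mixed ANOVA with
# Greenhouse-Geisser-corrected p-values and generalized eta squared in one
# call, which makes it a convenient sanity check on the ezANOVA output.
afex_check <- afex::aov_ez(
  id = "pID", dv = "MEAN_DIFFERENCE", data = long_data_clean,
  between = "TEMPORAL_DO", within = c("TIME", "DOMAIN"),
  anova_table = list(correction = "GG", es = "ges")
)
print(afex_check)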
-
-# =============================================================================
-# SUMMARY OF KEY FINDINGS
-# =============================================================================
-
-print("\n=== SUMMARY OF KEY FINDINGS ===")
-cat("SIGNIFICANT EFFECTS (p < 0.05):\n")
-cat("• TIME main effect: F(1,1061) = 78.80, p < 0.001, η²G = 0.008\n")
-cat("• DOMAIN main effect: F(3,3183) = 206.90, p < 0.001, η²G = 0.059\n")
-cat("• TEMPORAL_DO × TIME interaction: F(1,1061) = 10.97, p = 0.001, η²G = 0.001\n")
-cat("• TIME × DOMAIN interaction: F(3,3183) = 3.65, p = 0.012, η²G = 0.001\n")
-cat("\nMARGINAL EFFECTS (p < 0.10):\n")
-cat("• TEMPORAL_DO × DOMAIN interaction: F(3,3183) = 2.50, p = 0.058, η²G = 0.001\n")
-cat("\nNON-SIGNIFICANT EFFECTS:\n")
-cat("• TEMPORAL_DO main effect: F(1,1061) = 0.41, p = 0.524, η²G = 0.00015\n")
-cat("• Three-way interaction: F(3,3183) = 0.77, p = 0.511, η²G = 0.00013\n")
-cat("\nEFFECT SIZE INTERPRETATION (η²G):\n")
-cat("• Large: η²G > 0.14\n")
-cat("• Medium: η²G > 0.06\n")
-cat("• Small: η²G > 0.01\n")
-cat("• Negligible: η²G ≤ 0.01\n")
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output)
-aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                 data = long_data_clean)
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print("Estimated Marginal Means:")
-print(time_emmeans)
-print("\nPairwise Contrasts:")
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
-print("Estimated Marginal Means:")
-print(domain_emmeans)
-print("\nPairwise Contrasts:")
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TEMPORAL_DO × TIME Interaction
-print("\n=== TEMPORAL_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO:")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each TIME:")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# TIME × DOMAIN Interaction
-print("\n=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TIME:")
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-
-print("\nSimple Effects of TIME within each DOMAIN:")
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(time_domain_simple2)
-
-# TEMPORAL_DO × DOMAIN Interaction
-print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===")
-temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN)
-print("Estimated Marginal Means:")
-print(temporal_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:")
-temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:")
-temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(temporal_domain_simple2)
-
-# =============================================================================
-# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS
-# =============================================================================
-
-# All pairwise comparisons for the three-way interaction
-print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===")
-three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-print("Estimated Marginal Means for all combinations:")
-print(three_way_emmeans)
-
-# Pairwise comparisons of TIME (Past vs Future) within each TEMPORAL_DO × DOMAIN combination
-print("\nPairwise comparisons of TIME (Past - Future) within each TEMPORAL_DO × DOMAIN combination:")
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni")
-print(three_way_contrasts)
-
-# =============================================================================
-# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS
-# =============================================================================
-
-# Cohen's d calculations (library already loaded)
-
-print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===")
-
-# Function to calculate Cohen's d for pairwise comparisons
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if(nrow(significant_pairs) > 0) {
-    cat("Significant pairwise comparisons (p < 0.05):\n")
-    print(significant_pairs)
-
-    cat("\nCohen's d calculated from raw data:\n")
-
-    for(i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- as.character(comparison$contrast)
-
-      # Parse the contrast
-      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-      if(length(contrast_parts) == 2) {
-        level1 <- trimws(contrast_parts[1])
-        level2 <- trimws(contrast_parts[2])
-
-        # Get raw data for both conditions
-        if(group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][
-            data[[group1_var]] == level1 &
-            data[[group2_var]] == group2_level]
-
-          data2 <- data[[response_var]][
-            data[[group1_var]] == level2 &
-            data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if(length(data1) > 0 && length(data2) > 0) {
-          # Calculate Cohen's d using effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          cat(sprintf("Comparison: %s", contrast_name))
-          if(group2_var %in% colnames(comparison)) {
-            cat(sprintf(" | %s", group2_level))
-          }
-          cat(sprintf("\n  n1 = %d, n2 = %d\n", length(data1), length(data2)))
-          cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-          cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-          cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-          cat("\n")
-        }
-      }
-    }
-  } else {
-    cat("No significant pairwise comparisons found.\n")
-  }
-}
-
-# =============================================================================
-# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===")
-
-# Get simple effects of TIME within each TEMPORAL_DO
-temporal_time_simple_df <- as.data.frame(temporal_time_simple)
-calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
-# Get simple effects of TEMPORAL_DO within each TIME
-temporal_time_simple2_df <- as.data.frame(temporal_time_simple2)
-calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===")
-
-# Get simple effects of TIME within each DOMAIN
-time_domain_simple2_df <- as.data.frame(time_domain_simple2)
-calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE")
-
-# Get simple effects of DOMAIN within each TIME
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===")
-
-# Get simple effects of TEMPORAL_DO within each DOMAIN
-temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2)
-calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE")
-
-# Get simple effects of DOMAIN within each TEMPORAL_DO
-temporal_domain_simple_df <- as.data.frame(temporal_domain_simple)
-calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 4. THREE-WAY INTERACTION COHEN'S D
-# =============================================================================
-
-print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===")
-
-# Get pairwise comparisons for the three-way interaction
-three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-calculate_cohens_d_for_pairs(three_way_contrasts_df, long_data_clean, "TIME", c("TEMPORAL_DO", "DOMAIN"), "MEAN_DIFFERENCE")
-
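The deleted snapshots recompute Cohen's d from the raw cell data with effsize::cohen.d, parsing the contrast labels by hand. A lighter alternative, sketched below under the assumption that the emmeans contrast objects from the script (for example temporal_time_simple) are still in scope, converts each contrast's t-ratio and error degrees of freedom with effectsize::t_to_d; because this uses the model's pooled error term rather than per-cell standard deviations, the estimates will differ somewhat from the raw-data values.

# Sketch: effect sizes straight from the emmeans output (objects assumed from the script above)
library(effectsize)

tt <- as.data.frame(temporal_time_simple)  # columns: contrast, TEMPORAL_DO, estimate, SE, df, t.ratio, p.value
d_from_t <- t_to_d(tt$t.ratio, tt$df)      # Cohen's d with a 95% CI for each contrast
print(cbind(tt[, c("TEMPORAL_DO", "contrast")], round(d_from_t, 3)))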
diff --git a/.history/eohi1/mixed anova - domain means_20251003123637.r b/.history/eohi1/mixed anova - domain means_20251003123637.r
deleted file mode 100644
index f1bdf09..0000000
--- a/.history/eohi1/mixed anova - domain means_20251003123637.r
+++ /dev/null
@@ -1,652 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex)       # For aov_ez (cleaner ANOVA output)
-library(nortest)    # For normality tests
-library(emmeans)    # For post-hoc comparisons
-library(purrr)      # For map functions
-library(effsize)    # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x")))
-print(paste("Number of participants:", length(unique(data$pID))))
-
-# Verify the specific variables we need
-required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required domain mean variables found!")
-}
-
-# Define domain mapping
-domain_mapping <- data.frame(
-  variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-               "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
-  time = c(rep("Past", 4), rep("Future", 4)),
-  domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
-  stringsAsFactors = FALSE
-)
-
-# Domain mapping created
-
-# Efficient data pivoting using pivot_longer
-long_data <- data %>%
-  select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>%
-  pivot_longer(
-    cols = all_of(required_vars),
-    names_to = "variable",
-    values_to = "MEAN_DIFFERENCE"
-  ) %>%
-  left_join(domain_mapping, by = "variable") %>%
-  # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping)
-  mutate(
-    TIME = factor(time, levels = c("Past", "Future")),
-    DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")),
-    pID = as.factor(pID),
-    TEMPORAL_DO = as.factor(TEMPORAL_DO)
-  ) %>%
-  # Select final columns and remove any rows with missing values
missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- 
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-cat("\n=== HARTLEY'S F-MAX TEST RESULTS ===\n")
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check data dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$pID))))
-print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (",
-            length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (",
-            length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = ""))
-
-# Check for complete cases
-complete_cases <- sum(complete.cases(long_data_clean))
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean)))
-
-# Check whether every participant has exactly one observation per cell
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance == 1)) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN combinations")
-} else {
-  print("Warning: Design is unbalanced (some participants are missing cells)")
-  print(summary(as.vector(design_balance)))
-}
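-
-# Cross-check sketch using afex (loaded above for aov_ez): this fits the same
-# 2 × (2 × 4) mixed design and applies sphericity corrections by default, so
-# its table should mirror the ezANOVA output below; shown here as an assumed,
-# optional consistency check rather than part of the original pipeline.
-aov_ez_model <- aov_ez(id = "pID", dv = "MEAN_DIFFERENCE", data = long_data_clean,
-                       between = "TEMPORAL_DO", within = c("TIME", "DOMAIN"),
-                       type = 3)
-print(aov_ez_model)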
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-cat("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===\n")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = MEAN_DIFFERENCE,
-                             wid = pID,
-                             between = TEMPORAL_DO,
-                             within = .(TIME, DOMAIN),
-                             type = 3,
-                             detailed = TRUE)
-
-print("ANOVA Results:")
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL # Reset row numbers to be sequential
-print(anova_output)
-
-# Show Mauchly's test for sphericity
-cat("\nMauchly's Test of Sphericity:\n")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  cat("\nGreenhouse-Geisser and Huynh-Feldt Corrections:\n")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-  # Match each corrected effect to its row in the ANOVA table once
-  effect_rows <- match(sphericity_corr$Effect, anova_table$Effect)
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[effect_rows],
-    Original_DFd = anova_table$DFd[effect_rows],
-    GG_DFn = anova_table$DFn[effect_rows] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[effect_rows] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[effect_rows] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[effect_rows] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  cat("\n=== CORRECTED F-TESTS ===\n")
-  for(i in seq_len(nrow(corrected_df))) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[effect_rows[i]]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original:     F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  cat("\nNote: Sphericity corrections not needed (sphericity assumption met)\n")
-}
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-cat("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===\n")
-
-# Extract generalized eta squared from ezANOVA (already calculated)
-effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")]
-effect_sizes$ges <- round(effect_sizes$ges, 5)
-print("Generalized Eta Squared:")
-print(effect_sizes)
-
-# Note: Type III ANOVA with sphericity corrections is appropriate for this balanced design.
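-
-# Supplementary sketch: with detailed = TRUE, ezANOVA also returns the sums of
-# squares, so partial eta squared can be derived as SSn / (SSn + SSd) alongside
-# the generalized eta squared above (shown for comparison only).
-pes <- with(mixed_anova_model$ANOVA, SSn / (SSn + SSd))
-print(data.frame(Effect = mixed_anova_model$ANOVA$Effect, pes = round(pes, 5)))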
-
-# =============================================================================
-# SUMMARY OF KEY FINDINGS
-# =============================================================================
-
-cat("\n=== SUMMARY OF KEY FINDINGS ===\n")
-cat("SIGNIFICANT EFFECTS (p < 0.05):\n")
-cat("• TIME main effect: F(1,1061) = 78.80, p < 0.001, η²G = 0.008\n")
-cat("• DOMAIN main effect: F(3,3183) = 206.90, p < 0.001, η²G = 0.059\n")
-cat("• TEMPORAL_DO × TIME interaction: F(1,1061) = 10.97, p = 0.001, η²G = 0.001\n")
-cat("• TIME × DOMAIN interaction: F(3,3183) = 3.65, p = 0.012, η²G = 0.001\n")
-cat("\nMARGINAL EFFECTS (p < 0.10):\n")
-cat("• TEMPORAL_DO × DOMAIN interaction: F(3,3183) = 2.50, p = 0.058, η²G = 0.001\n")
-cat("\nNON-SIGNIFICANT EFFECTS:\n")
-cat("• TEMPORAL_DO main effect: F(1,1061) = 0.41, p = 0.524, η²G = 0.00015\n")
-cat("• Three-way interaction: F(3,3183) = 0.77, p = 0.511, η²G = 0.00013\n")
-cat("\nEFFECT SIZE INTERPRETATION (η²G):\n")
-cat("• Large: η²G > 0.14\n")
-cat("• Medium: 0.06 < η²G ≤ 0.14\n")
-cat("• Small: 0.01 < η²G ≤ 0.06\n")
-cat("• Negligible: η²G ≤ 0.01\n")
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-cat("\n=== POST-HOC COMPARISONS ===\n")
-
-# Fit an aov model for emmeans (emmeans needs a fitted model object, not ezANOVA output)
-aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                 data = long_data_clean)
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print("Estimated Marginal Means:")
-print(time_emmeans)
-cat("\nPairwise Contrasts:\n")
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-cat("\nMain Effect of DOMAIN:\n")
-domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
-print("Estimated Marginal Means:")
-print(domain_emmeans)
-cat("\nPairwise Contrasts:\n")
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-cat("\nMain Effect of TEMPORAL_DO:\n")
-temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
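-
-# Note on effect sizes for these contrasts (illustrative worked example): for a
-# between-subjects contrast, the t ratio converts to Cohen's d via
-#   d = t * sqrt(1/n1 + 1/n2).
-# For example, with t = 3.31 and hypothetical cell sizes n1 = n2 = 530,
-#   d = 3.31 * sqrt(1/530 + 1/530) ≈ 0.20,
-# a small effect by Cohen's benchmarks. Within-subjects contrasts need a
-# repeated-measures formulation instead (see the d_z sketch further below).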
DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons of TIME (Past vs Future) within each TEMPORAL_DO × DOMAIN combination -print("\nPairwise comparisons of TIME (Past - Future) within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect 
-
-# =============================================================================
-# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS
-# =============================================================================
-
-# Cohen's d calculations (effsize already loaded)
-cat("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===\n")
-
-# Function to calculate Cohen's d for significant pairwise comparisons.
-# Note: cohen.d() below treats the two cells as independent groups.
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if(nrow(significant_pairs) > 0) {
-    cat("Significant pairwise comparisons (p < 0.05):\n")
-    print(significant_pairs)
-
-    cat("\nCohen's d calculated from raw data:\n")
-
-    for(i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- as.character(comparison$contrast)
-
-      # Parse the contrast label, e.g. "Past - Future"
-      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-      if(length(contrast_parts) == 2) {
-        level1 <- trimws(contrast_parts[1])
-        level2 <- trimws(contrast_parts[2])
-
-        # Get raw data for both conditions (within the 'by' level, if present)
-        if(group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][
-            data[[group1_var]] == level1 &
-            data[[group2_var]] == group2_level]
-          data2 <- data[[response_var]][
-            data[[group1_var]] == level2 &
-            data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if(length(data1) > 0 && length(data2) > 0) {
-          # Calculate Cohen's d using the effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          cat(sprintf("Comparison: %s", contrast_name))
-          if(group2_var %in% colnames(comparison)) {
-            cat(sprintf(" | %s", group2_level))
-          }
-          cat(sprintf("\n  n1 = %d, n2 = %d\n", length(data1), length(data2)))
-          cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-          cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-          cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-          cat("\n")
-        }
-      }
-    }
-  } else {
-    cat("No significant pairwise comparisons found.\n")
-  }
-}
-
-# =============================================================================
-# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001)
-# =============================================================================
-
-cat("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===\n")
-
-# Simple effects of TIME within each TEMPORAL_DO
-temporal_time_simple_df <- as.data.frame(temporal_time_simple)
-calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
-# Simple effects of TEMPORAL_DO within each TIME
-temporal_time_simple2_df <- as.data.frame(temporal_time_simple2)
-calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012)
-# =============================================================================
-
-cat("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===\n")
-
-# Simple effects of TIME within each DOMAIN
-time_domain_simple2_df <- as.data.frame(time_domain_simple2)
-calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE")
-
-# Simple effects of DOMAIN within each TIME
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058)
-# =============================================================================
-
-cat("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===\n")
-
-# Simple effects of TEMPORAL_DO within each DOMAIN
-temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2)
-calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE")
-
-# Simple effects of DOMAIN within each TEMPORAL_DO
-temporal_domain_simple_df <- as.data.frame(temporal_domain_simple)
-calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE")
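-
-# Caveat sketch (illustrative, collapses across DOMAIN): cohen.d(data1, data2)
-# above treats the two TIME levels as independent groups, but TIME is
-# within-subjects. A repeated-measures alternative is
-#   d_z = mean(pairwise difference) / sd(pairwise difference),
-# computed here for the overall Past vs Future contrast.
-paired_wide <- long_data_clean %>%
-  group_by(pID, TIME) %>%
-  summarise(m = mean(MEAN_DIFFERENCE), .groups = 'drop') %>%
-  pivot_wider(names_from = TIME, values_from = m)
-d_z <- with(paired_wide, mean(Past - Future, na.rm = TRUE) / sd(Past - Future, na.rm = TRUE))
-print(paste("Within-subjects d_z (Past vs Future):", round(d_z, 5)))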
-
-# =============================================================================
-# 4. THREE-WAY INTERACTION COHEN'S D
-# =============================================================================
-
-cat("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===\n")
-
-# Pairwise comparisons for the three-way interaction. Cohen's d is not computed
-# here: each contrast would need its own within-cell calculation across the
-# TEMPORAL_DO × DOMAIN combinations, so only the contrasts themselves are shown.
-three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-print(three_way_contrasts_df)
diff --git a/.history/eohi1/mixed anova - domain means_20251003124045.r b/.history/eohi1/mixed anova - domain means_20251003124045.r
deleted file mode 100644
index f1bdf09..0000000
diff --git a/.history/eohi1/mixed anova - domain means_20251003124352.r b/.history/eohi1/mixed anova - domain means_20251003124352.r
deleted file mode 100644
index b7fee9d..0000000
--- a/.history/eohi1/mixed anova - domain means_20251003124352.r
+++ /dev/null
@@ -1,631 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x "))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with
missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- 
map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) 
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -print("Creating ANOVA model for post-hoc comparisons...") -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) -print("ANOVA model created successfully.") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO 
-print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons of TIME (Past vs Future) within each TEMPORAL_DO × DOMAIN combination -print("\nPairwise comparisons of TIME (Past - Future) within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) 
{ - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -# For three-way interaction, we need to handle the multiple grouping variables differently -print("Three-way interaction Cohen's d calculations:") -print("Note: Cohen's d for three-way interactions requires more complex calculations") -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - diff --git a/.history/eohi1/mixed anova - domain means_20251003124400.r b/.history/eohi1/mixed anova - domain means_20251003124400.r deleted file mode 100644 index dbacb25..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003124400.r +++ /dev/null @@ -1,631 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = 
rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- 
function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# 
============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# 
============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -print("Creating ANOVA model for post-hoc comparisons...") -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) -print("ANOVA model created successfully.") - -# Main effect of TIME -print("Calculating TIME main effect...") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nCalculating DOMAIN main effect...") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nCalculating TEMPORAL_DO main effect...") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY 
INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons of TIME (Past vs Future) within each TEMPORAL_DO × DOMAIN combination -print("\nPairwise comparisons of TIME (Past - Future) within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. 
TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -# For three-way interaction, we need to handle the multiple grouping variables differently -print("Three-way interaction Cohen's d calculations:") -print("Note: Cohen's d for three-way interactions requires more complex calculations") -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - diff --git a/.history/eohi1/mixed anova - domain means_20251003124418.r b/.history/eohi1/mixed anova - domain means_20251003124418.r deleted file mode 100644 index 9cee5f0..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003124418.r +++ /dev/null @@ -1,634 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with 
missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- 
map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) 
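One aside before the correction block below, on the bootstrap routine defined earlier: bootstrap_hartley_critical() derives its reference value by resampling each group with replacement, recomputing the max/min variance ratio, and taking the 95th percentile of the resampled ratios. The same percentile-bootstrap idea in miniature, on simulated data rather than the experiment's:

    set.seed(42)
    toy_groups <- list(rnorm(50, sd = 1.0), rnorm(50, sd = 1.3))

    # Resample each group independently and recompute the variance ratio
    boot_ratios <- replicate(1000, {
      vars <- vapply(toy_groups,
                     function(x) var(sample(x, replace = TRUE)),
                     numeric(1))
      max(vars) / min(vars)
    })

    # The observed F-max ratio would be compared against this percentile
    quantile(boot_ratios, 0.95)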
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -print("Creating ANOVA model for post-hoc comparisons...") -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) -print("ANOVA model created successfully.") - -# Main effect of TIME -print("Calculating TIME main effect...") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nCalculating DOMAIN main effect...") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of 
TEMPORAL_DO -print("\nCalculating TEMPORAL_DO main effect...") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -print("Calculating TEMPORAL_DO × TIME interaction...") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -print("Calculating TIME × DOMAIN interaction...") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -print("Calculating TEMPORAL_DO × DOMAIN interaction...") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons of TIME (Past vs Future) within each TEMPORAL_DO × DOMAIN combination -print("\nPairwise comparisons of TIME (Past - Future) within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons 
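The helper defined next does the bookkeeping of parsing each significant emmeans contrast; the effect size itself comes from effsize::cohen.d() on the raw observations. Its core call in isolation, on toy data (values are illustrative only):

    library(effsize)

    set.seed(7)
    toy_a <- rnorm(40, mean = 0.30)   # stand-in for one condition's scores
    toy_b <- rnorm(40, mean = 0.00)   # stand-in for the other condition

    d_res <- cohen.d(toy_a, toy_b)    # pooled-SD Cohen's d
    d_res$estimate                    # signed effect size estimate
    d_res$magnitude                   # qualitative label (e.g., small/medium)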
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -# For three-way interaction, we need to handle the multiple grouping variables differently -print("Three-way interaction Cohen's d calculations:") -print("Note: Cohen's d for three-way interactions requires more complex calculations") -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - diff --git a/.history/eohi1/mixed anova - domain means_20251003124429.r b/.history/eohi1/mixed anova - domain means_20251003124429.r deleted file mode 100644 index 7b657fb..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003124429.r +++ /dev/null @@ -1,635 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = 
rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- 
function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# 
============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# 
============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -print("Creating ANOVA model for post-hoc comparisons...") -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) -print("ANOVA model created successfully.") - -# Main effect of TIME -print("Calculating TIME main effect...") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nCalculating DOMAIN main effect...") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nCalculating TEMPORAL_DO main effect...") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -print("Calculating TEMPORAL_DO × TIME interaction...") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -print("Calculating TIME × DOMAIN interaction...") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -print("Calculating TEMPORAL_DO × DOMAIN interaction...") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# 
============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -print("Calculating three-way interaction...") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons of TIME (Past vs Future) within each TEMPORAL_DO × DOMAIN combination -print("\nPairwise comparisons of TIME (Past - Future) within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. 
TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -# For three-way interaction, we need to handle the multiple grouping variables differently -print("Three-way interaction Cohen's d calculations:") -print("Note: Cohen's d for three-way interactions requires more complex calculations") -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - diff --git a/.history/eohi1/mixed anova - domain means_20251003124453.r b/.history/eohi1/mixed anova - domain means_20251003124453.r deleted file mode 100644 index df84668..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003124453.r +++ /dev/null @@ -1,638 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with 
missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- 
map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) 
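# (For reference, the mechanics behind the corrected p-values printed below:
#  both the numerator and denominator degrees of freedom are multiplied by
#  the epsilon estimate, and the unchanged F statistic is referred to the
#  shrunken F distribution. A minimal sketch; f_obs, dfn, dfd, and gg_eps
#  are illustrative inputs, not values from this dataset.)
gg_corrected_p <- function(f_obs, dfn, dfd, gg_eps) {
  pf(f_obs, dfn * gg_eps, dfd * gg_eps, lower.tail = FALSE)
}
# e.g. gg_corrected_p(f_obs = 4.21, dfn = 3, dfd = 297, gg_eps = 0.92)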
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS (OPTIMIZED) -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -print("Creating ANOVA model for post-hoc comparisons...") -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) -print("ANOVA model created successfully.") - -# OPTIMIZATION: Use faster adjustment methods and limit calculations -print("Note: Using faster adjustment methods for large dataset...") - -# Main effect of TIME -print("Calculating TIME main effect...") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nCalculating DOMAIN main effect...") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") 
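# (A quick hand-check of the Bonferroni adjustment applied to the contrasts
#  below: four DOMAIN levels give choose(4, 2) = 6 pairwise contrasts, and
#  pairs(..., adjust = "bonferroni") multiplies each raw p-value by that
#  count, capped at 1. p_raw is an illustrative number, not a value from
#  this output.)
p_raw <- 0.004
min(p_raw * choose(4, 2), 1)   # 0.024, the Bonferroni-adjusted p-value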
-print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nCalculating TEMPORAL_DO main effect...") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -print("Calculating TEMPORAL_DO × TIME interaction...") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -print("Calculating TIME × DOMAIN interaction...") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -print("Calculating TEMPORAL_DO × DOMAIN interaction...") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -print("Calculating three-way interaction...") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons of TIME (Past vs Future) within each TEMPORAL_DO × DOMAIN combination -print("\nPairwise comparisons of TIME (Past - Future) within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# 
============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. 
TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -# For three-way interaction, we need to handle the multiple grouping variables differently -print("Three-way interaction Cohen's d calculations:") -print("Note: Cohen's d for three-way interactions requires more complex calculations") -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - diff --git a/.history/eohi1/mixed anova - domain means_20251003124504.r b/.history/eohi1/mixed anova - domain means_20251003124504.r deleted file mode 100644 index bd36651..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003124504.r +++ /dev/null @@ -1,638 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the 
specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. 
Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = 
TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, 
long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- 
mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS (OPTIMIZED) -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -print("Creating ANOVA model for post-hoc comparisons...") -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) -print("ANOVA model created successfully.") - -# OPTIMIZATION: Use faster adjustment methods and limit calculations -print("Note: Using faster adjustment methods for large dataset...") - -# Main effect of TIME (FAST: only 2 levels) -print("Calculating TIME main effect...") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "none") # No adjustment needed for 2 levels -print(time_contrasts) - -# Main effect of DOMAIN (FAST: only 4 levels) -print("\nCalculating DOMAIN main effect...") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "tukey") # Faster than Bonferroni -print(domain_contrasts) - -# Main effect of TEMPORAL_DO (FAST: only 2 levels) -print("\nCalculating TEMPORAL_DO main effect...") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "none") # No adjustment needed for 2 levels -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -print("Calculating TEMPORAL_DO × TIME interaction...") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -print("Calculating TIME × DOMAIN interaction...") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -print("Calculating TEMPORAL_DO × DOMAIN interaction...") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") 
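# --- Editorial sketch (not in the original file): emmeans applies the
# requested p-value adjustment separately within each `by` group. Because
# TIME has only two levels, each DOMAIN group above holds a single
# Past - Future contrast, so adjust = "bonferroni" should be a no-op there
# and match adjust = "none":
p_bonf <- as.data.frame(pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni"))$p.value
p_none <- as.data.frame(pairs(time_domain_emmeans, by = "DOMAIN", adjust = "none"))$p.value
all.equal(p_bonf, p_none)  # expected TRUE under this assumption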
-print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -print("Calculating three-way interaction...") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons of TIME (Past vs Future) within each TEMPORAL_DO × DOMAIN combination -print("\nPairwise comparisons of TIME (Past - Future) within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 
============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -# For three-way interaction, we need to handle the multiple grouping variables differently -print("Three-way interaction Cohen's d calculations:") -print("Note: Cohen's d for three-way interactions requires more complex calculations") -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - diff --git a/.history/eohi1/mixed anova - domain means_20251003124512.r b/.history/eohi1/mixed anova - domain means_20251003124512.r deleted file mode 100644 index a13fba4..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003124512.r +++ /dev/null @@ -1,638 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with 
missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- 
map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) 
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS (OPTIMIZED) -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -print("Creating ANOVA model for post-hoc comparisons...") -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) -print("ANOVA model created successfully.") - -# OPTIMIZATION: Use faster adjustment methods and limit calculations -print("Note: Using faster adjustment methods for large dataset...") - -# Main effect of TIME (FAST: only 2 levels) -print("Calculating TIME main effect...") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "none") # No adjustment needed for 2 levels -print(time_contrasts) - -# Main effect of DOMAIN (FAST: only 4 levels) -print("\nCalculating DOMAIN main effect...") -domain_emmeans <- 
emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "tukey") # Faster than Bonferroni -print(domain_contrasts) - -# Main effect of TEMPORAL_DO (FAST: only 2 levels) -print("\nCalculating TEMPORAL_DO main effect...") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "none") # No adjustment needed for 2 levels -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS (OPTIMIZED) -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (FAST: 2×2 = 4 combinations) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -print("Calculating TEMPORAL_DO × TIME interaction...") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "none") # Only 2 levels each -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "none") # Only 2 levels each -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -print("Calculating TIME × DOMAIN interaction...") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -print("Calculating TEMPORAL_DO × DOMAIN interaction...") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -print("Calculating three-way interaction...") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons of TIME (Past vs Future) within each TEMPORAL_DO × DOMAIN combination -print("\nPairwise comparisons of TIME (Past - Future) within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") 
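# --- Sketch (not in the original file): the Cohen's d section below notes
# that the three-way cells "require more complex calculations" and stops at
# printing the contrasts. One way to fill that gap is to compute d for the
# Past - Future contrast within each TEMPORAL_DO x DOMAIN cell directly from
# the raw data. Like the calculate_cohens_d_for_pairs helper defined below,
# this treats the two TIME conditions as independent samples (it ignores the
# within-subject pairing), so read the magnitudes as approximate.
for (td in levels(long_data_clean$TEMPORAL_DO)) {
  for (dm in levels(long_data_clean$DOMAIN)) {
    cell <- long_data_clean[long_data_clean$TEMPORAL_DO == td &
                              long_data_clean$DOMAIN == dm, ]
    d <- effsize::cohen.d(cell$MEAN_DIFFERENCE[cell$TIME == "Past"],
                          cell$MEAN_DIFFERENCE[cell$TIME == "Future"])
    cat(sprintf("%s | %s: d = %.3f (%s)\n",
                td, dm, d$estimate, as.character(d$magnitude)))
  }
}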
-print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. 
TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -# For three-way interaction, we need to handle the multiple grouping variables differently -print("Three-way interaction Cohen's d calculations:") -print("Note: Cohen's d for three-way interactions requires more complex calculations") -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - diff --git a/.history/eohi1/mixed anova - domain means_20251003124527.r b/.history/eohi1/mixed anova - domain means_20251003124527.r deleted file mode 100644 index a5d08d4..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003124527.r +++ /dev/null @@ -1,638 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the 
specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. 
Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = 
TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, 
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check data dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$pID))))
-print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (",
-            length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (",
-            length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = ""))
-
-# Check for complete cases
-complete_cases <- sum(complete.cases(long_data_clean))
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean)))
-
-# Check if design is balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance %in% c(0, 1))) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN combinations")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = MEAN_DIFFERENCE,
-                             wid = pID,
-                             between = TEMPORAL_DO,
-                             within = .(TIME, DOMAIN),
-                             type = 3,
-                             detailed = TRUE)
-
-print("ANOVA Results:")
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL # Reset row numbers to be sequential
-print(anova_output)
-
-# Show Mauchly's test for sphericity
-print("\nMauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  cat("\n=== CORRECTED F-TESTS ===\n")
-  for(i in seq_len(nrow(corrected_df))) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original: F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  print("\nNote: Sphericity corrections not needed (sphericity assumption met)")
-}
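# --- Editor's illustration (not part of the deleted script) -----------------
# The corrected df printed above are just the uncorrected df scaled by the
# epsilon estimate; the F statistic itself is unchanged and only the reference
# distribution (hence the p-value) moves. Worked through on hypothetical
# numbers, not study output:
eps_gg <- 0.85           # hypothetical Greenhouse-Geisser epsilon
dfn <- 3; dfd <- 3189    # hypothetical uncorrected df for a 4-level factor
f_obs <- 4.2             # hypothetical F value
c(GG_DFn = dfn * eps_gg, GG_DFd = dfd * eps_gg)            # 2.55, 2710.65
pf(f_obs, dfn * eps_gg, dfd * eps_gg, lower.tail = FALSE)  # corrected p-value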
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Extract generalized eta squared from ezANOVA (already calculated)
-effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")]
-effect_sizes$ges <- round(effect_sizes$ges, 5)
-print("Generalized Eta Squared:")
-print(effect_sizes)
-
-# =============================================================================
-# POST-HOC COMPARISONS (OPTIMIZED)
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output)
-print("Creating ANOVA model for post-hoc comparisons...")
-aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                 data = long_data_clean)
-print("ANOVA model created successfully.")
-
-# OPTIMIZATION: Use faster adjustment methods and limit calculations
-print("Note: Using faster adjustment methods for large dataset...")
-
-# Main effect of TIME (FAST: only 2 levels)
-print("Calculating TIME main effect...")
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print("Estimated Marginal Means:")
-print(time_emmeans)
-print("\nPairwise Contrasts:")
-time_contrasts <- pairs(time_emmeans, adjust = "none") # No adjustment needed for 2 levels
-print(time_contrasts)
-
-# Main effect of DOMAIN (FAST: only 4 levels)
-print("\nCalculating DOMAIN main effect...")
-domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
-print("Estimated Marginal Means:")
-print(domain_emmeans)
-print("\nPairwise Contrasts:")
-domain_contrasts <- pairs(domain_emmeans, adjust = "tukey") # Faster than Bonferroni
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO (FAST: only 2 levels)
-print("\nCalculating TEMPORAL_DO main effect...")
-temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "none") # No adjustment needed for 2 levels
-print(temporal_contrasts)
-
-# =============================================================================
-# INTERACTION EXPLORATIONS (OPTIMIZED)
-# =============================================================================
-
-# TEMPORAL_DO × TIME Interaction (FAST: 2×2 = 4 combinations)
-print("\n=== TEMPORAL_DO × TIME INTERACTION ===")
-print("Calculating TEMPORAL_DO × TIME interaction...")
-temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO:")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "none") # Only 2 levels each
-print(temporal_time_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each TIME:")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "none") # Only 2 levels each
-print(temporal_time_simple2)
-
-# TIME × DOMAIN Interaction (SLOWER: 2×4 = 8 combinations)
-print("\n=== TIME × DOMAIN INTERACTION ===")
-print("Calculating TIME × DOMAIN interaction...")
-time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TIME:")
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "tukey") # Faster than Bonferroni
-print(time_domain_simple)
-
-print("\nSimple Effects of TIME within each DOMAIN:")
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "none") # Only 2 levels each
-print(time_domain_simple2)
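# --- Editor's illustration (not part of the deleted script) -----------------
# What pairs(..., by = ...) computes: contrasts of one factor held within each
# level of another. A self-contained toy between-subjects example; the data
# frame `toy2` and factors `A` and `B` are invented, not study variables.
library(emmeans)
set.seed(7)
toy2 <- expand.grid(A = factor(c("a1", "a2")),
                    B = factor(paste0("b", 1:4)),
                    rep = 1:10)
toy2$y <- rnorm(nrow(toy2)) + 0.2 * as.numeric(toy2$A) * as.numeric(toy2$B)
fit <- aov(y ~ A * B, data = toy2)
emm <- emmeans(fit, ~ A * B)
pairs(emm, by = "B", adjust = "none")  # simple effects of A within each B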
-print("Calculating TEMPORAL_DO × DOMAIN interaction...") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "tukey") # Faster than Bonferroni -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "none") # Only 2 levels each -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -print("Calculating three-way interaction...") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons of TIME (Past vs Future) within each TEMPORAL_DO × DOMAIN combination -print("\nPairwise comparisons of TIME (Past - Future) within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: 
%.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. 
-
-# =============================================================================
-# 4. THREE-WAY INTERACTION COHEN'S D
-# =============================================================================
-
-print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===")
-
-# Get pairwise comparisons for the three-way interaction
-three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-# For three-way interaction, we need to handle the multiple grouping variables differently
-print("Three-way interaction Cohen's d calculations:")
-print("Note: Cohen's d for three-way interactions requires more complex calculations")
-print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-print(three_way_contrasts_df)
diff --git a/.history/eohi1/mixed anova - domain means_20251003124539.r b/.history/eohi1/mixed anova - domain means_20251003124539.r
deleted file mode 100644
index 3f04409..0000000
diff --git a/.history/eohi1/mixed anova - domain means_20251003124544.r b/.history/eohi1/mixed anova - domain means_20251003124544.r
deleted file mode 100644
index 3f04409..0000000
specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. 
Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = 
TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, 
long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- 
mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS (OPTIMIZED) -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -print("Creating ANOVA model for post-hoc comparisons...") -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) -print("ANOVA model created successfully.") - -# OPTIMIZATION: Use faster adjustment methods and limit calculations -print("Note: Using faster adjustment methods for large dataset...") - -# Main effect of TIME (FAST: only 2 levels) -print("Calculating TIME main effect...") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "none") # No adjustment needed for 2 levels -print(time_contrasts) - -# Main effect of DOMAIN (FAST: only 4 levels) -print("\nCalculating DOMAIN main effect...") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "tukey") # Faster than Bonferroni -print(domain_contrasts) - -# Main effect of TEMPORAL_DO (FAST: only 2 levels) -print("\nCalculating TEMPORAL_DO main effect...") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "none") # No adjustment needed for 2 levels -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS (OPTIMIZED) -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (FAST: 2×2 = 4 combinations) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -print("Calculating TEMPORAL_DO × TIME interaction...") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "none") # Only 2 levels each -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "none") # Only 2 levels each -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (SLOWER: 2×4 = 8 combinations) -print("\n=== TIME × DOMAIN INTERACTION ===") -print("Calculating TIME × DOMAIN interaction...") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "tukey") # Faster than Bonferroni -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "none") # Only 2 levels each -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (SLOWER: 2×4 = 8 combinations) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") 
-print("Calculating TEMPORAL_DO × DOMAIN interaction...") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "tukey") # Faster than Bonferroni -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "none") # Only 2 levels each -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS (SKIP - TOO SLOW) -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -print("SKIPPING three-way interaction calculations due to computational intensity") -print("(2×2×4 = 16 combinations with 8504 observations)") -print("The three-way interaction was non-significant (p = 0.511) anyway") -print("Focus on the significant two-way interactions above.") - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. 
TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===")
-
-# Get simple effects of TIME within each TEMPORAL_DO
-temporal_time_simple_df <- as.data.frame(temporal_time_simple)
-calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
-# Get simple effects of TEMPORAL_DO within each TIME
-temporal_time_simple2_df <- as.data.frame(temporal_time_simple2)
-calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===")
-
-# Get simple effects of TIME within each DOMAIN
-time_domain_simple2_df <- as.data.frame(time_domain_simple2)
-calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE")
-
-# Get simple effects of DOMAIN within each TIME
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===")
-
-# Get simple effects of TEMPORAL_DO within each DOMAIN
-temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2)
-calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE")
-
-# Get simple effects of DOMAIN within each TEMPORAL_DO
-temporal_domain_simple_df <- as.data.frame(temporal_domain_simple)
-calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
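-# =============================================================================
-# OPTIONAL: COMPUTE THE THREE-WAY CONTRASTS (ADDED SKETCH)
-# =============================================================================
-
-# Added sketch, not part of the original analysis: section 4 below expects a
-# `three_way_contrasts` object that this script never creates, because the
-# three-way emmeans call was skipped above as too slow. If it is ever wanted,
-# it could plausibly be obtained from the existing aov_model as shown here.
-# `run_three_way` is a hypothetical toggle introduced for this sketch; it is
-# FALSE by default so the slow 2 × 2 × 4 computation stays skipped.
-run_three_way <- FALSE
-if (run_three_way) {
-  # TIME contrasts within each TEMPORAL_DO × DOMAIN cell (8 cells, 2 TIME levels each)
-  three_way_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO * DOMAIN)
-  three_way_contrasts <- pairs(three_way_emmeans, adjust = "none")  # only 2 levels per cell
-  print(three_way_contrasts)
-}
-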
# =============================================================================
-# 4. THREE-WAY INTERACTION COHEN'S D
-# =============================================================================
-
-print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===")
-
-# Guard against the undefined object: `three_way_contrasts` only exists if the
-# optional sketch above was run (the three-way analysis itself was skipped)
-if (exists("three_way_contrasts")) {
-  # Get pairwise comparisons for the three-way interaction
-  three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-  # For the three-way interaction the multiple grouping variables would need to
-  # be handled differently, so only the raw contrasts are shown
-  print("Three-way interaction Cohen's d calculations:")
-  print("Note: Cohen's d for three-way interactions requires more complex calculations")
-  print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-  print(three_way_contrasts_df)
-} else {
-  print("Skipped: three_way_contrasts was not computed (enable run_three_way above to obtain it)")
-}
-
diff --git a/.history/eohi1/mixed anova - domain means_20251003124545.r b/.history/eohi1/mixed anova - domain means_20251003124545.r
deleted file mode 100644
index 3f04409..0000000
--- a/.history/eohi1/mixed anova - domain means_20251003124545.r
+++ /dev/null
@@ -1,632 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex) # For aov_ez (cleaner ANOVA output)
-library(nortest) # For normality tests
-library(emmeans) # For post-hoc comparisons
-library(purrr) # For map functions
-library(effsize) # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x")))
-print(paste("Number of participants:", length(unique(data$pID))))
-
-# Verify the specific variables we need
-required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required domain mean variables found!")
-}
-
-# Define domain mapping
-domain_mapping <- data.frame(
-  variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-               "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
-  time = c(rep("Past", 4), rep("Future", 4)),
-  domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
-  stringsAsFactors = FALSE
-)
-
-# Domain mapping created
-
-# Efficient data pivoting using pivot_longer
-long_data <- data %>%
-  select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>%
-  pivot_longer(
-    cols = all_of(required_vars),
-    names_to = "variable",
-    values_to = "MEAN_DIFFERENCE"
-  ) %>%
-  left_join(domain_mapping, by = "variable") %>%
-  # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping)
-  mutate(
-    TIME = factor(time, levels = c("Past", "Future")),
-    DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")),
-    pID = as.factor(pID),
-    TEMPORAL_DO = as.factor(TEMPORAL_DO)
-  ) %>%
-  # Select final columns and remove any rows with
missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- 
map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-print("\n=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check data dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$pID))))
-print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (",
-            length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (",
-            length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = ""))
-
-# Check for complete cases
-complete_cases <- sum(complete.cases(long_data_clean))
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean)))
-
-# Check if design is balanced: every participant should have exactly one
-# observation in every TIME × DOMAIN cell (a count of 0 marks a missing cell)
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance == 1)) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN combinations")
-} else {
-  print("Warning: Design is unbalanced (missing or duplicated participant cells)")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = MEAN_DIFFERENCE,
-                             wid = pID,
-                             between = TEMPORAL_DO,
-                             within = .(TIME, DOMAIN),
-                             type = 3,
-                             detailed = TRUE)
-
-print("ANOVA Results:")
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL # Reset row numbers to be sequential
-print(anova_output)
-
-# Show Mauchly's test for sphericity
-print("\nMauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS (OPTIMIZED) -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -print("Creating ANOVA model for post-hoc comparisons...") -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) -print("ANOVA model created successfully.") - -# OPTIMIZATION: Use faster adjustment methods and limit calculations -print("Note: Using faster adjustment methods for large dataset...") - -# Main effect of TIME (FAST: only 2 levels) -print("Calculating TIME main effect...") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "none") # No adjustment needed for 2 levels -print(time_contrasts) - -# Main effect of DOMAIN (FAST: only 4 levels) -print("\nCalculating DOMAIN main effect...") -domain_emmeans <- 
emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "tukey") # Faster than Bonferroni -print(domain_contrasts) - -# Main effect of TEMPORAL_DO (FAST: only 2 levels) -print("\nCalculating TEMPORAL_DO main effect...") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "none") # No adjustment needed for 2 levels -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS (OPTIMIZED) -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (FAST: 2×2 = 4 combinations) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -print("Calculating TEMPORAL_DO × TIME interaction...") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "none") # Only 2 levels each -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "none") # Only 2 levels each -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (SLOWER: 2×4 = 8 combinations) -print("\n=== TIME × DOMAIN INTERACTION ===") -print("Calculating TIME × DOMAIN interaction...") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "tukey") # Faster than Bonferroni -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "none") # Only 2 levels each -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (SLOWER: 2×4 = 8 combinations) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -print("Calculating TEMPORAL_DO × DOMAIN interaction...") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "tukey") # Faster than Bonferroni -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "none") # Only 2 levels each -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS (SKIP - TOO SLOW) -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -print("SKIPPING three-way interaction calculations due to computational intensity") -print("(2×2×4 = 16 combinations with 8504 observations)") -print("The three-way interaction was non-significant (p = 0.511) anyway") -print("Focus on the significant two-way interactions above.") - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# 
============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. 
TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===")
-
-# Get simple effects of TIME within each DOMAIN
-time_domain_simple2_df <- as.data.frame(time_domain_simple2)
-calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE")
-
-# Get simple effects of DOMAIN within each TIME
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===")
-
-# Get simple effects of TEMPORAL_DO within each DOMAIN
-temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2)
-calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE")
-
-# Get simple effects of DOMAIN within each TEMPORAL_DO
-temporal_domain_simple_df <- as.data.frame(temporal_domain_simple)
-calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 4. THREE-WAY INTERACTION COHEN'S D
-# =============================================================================
-
-print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===")
-
-# Guard against the undefined object: the three-way emmeans contrasts were
-# skipped above, so `three_way_contrasts` does not exist in this script
-if (exists("three_way_contrasts")) {
-  # Get pairwise comparisons for the three-way interaction
-  three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-  # For the three-way interaction the multiple grouping variables would need to
-  # be handled differently, so only the raw contrasts are shown
-  print("Three-way interaction Cohen's d calculations:")
-  print("Note: Cohen's d for three-way interactions requires more complex calculations")
-  print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-  print(three_way_contrasts_df)
-} else {
-  print("Skipped: three_way_contrasts was not computed (three-way analysis skipped above)")
-}
-
diff --git a/.history/eohi1/mixed anova - domain means_20251003124614.r b/.history/eohi1/mixed anova - domain means_20251003124614.r
deleted file mode 100644
index f023c9d..0000000
--- a/.history/eohi1/mixed anova - domain means_20251003124614.r
+++ /dev/null
@@ -1,627 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex) # For aov_ez (cleaner ANOVA output)
-library(nortest) # For normality tests
-library(emmeans) # For post-hoc comparisons
-library(purrr) # For map functions
-library(effsize) # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x")))
-print(paste("Number of participants:", length(unique(data$pID))))
-
-# Verify the
specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. 
Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = 
TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, 
long_data_clean$TIME, long_data_clean$DOMAIN)
-# A count of 0 would mean a participant is missing a TIME × DOMAIN cell, so
-# balance requires every count to be exactly 1
-if(all(design_balance == 1)) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN combinations")
-} else {
-  print("Warning: Design is unbalanced (missing or duplicated participant cells)")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = MEAN_DIFFERENCE,
-                             wid = pID,
-                             between = TEMPORAL_DO,
-                             within = .(TIME, DOMAIN),
-                             type = 3,
-                             detailed = TRUE)
-
-print("ANOVA Results:")
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL # Reset row numbers to be sequential
-print(anova_output)
-
-# Show Mauchly's test for sphericity
-print("\nMauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  cat("\n=== CORRECTED F-TESTS ===\n")
-  for(i in seq_len(nrow(corrected_df))) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original:     F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  print("\nNote: Sphericity corrections not needed (sphericity assumption met)")
-}
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Extract generalized eta squared from ezANOVA (already calculated)
-effect_sizes <-
mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS (OPTIMIZED) -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (FAST: 2×2 = 4 combinations) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -print("Calculating TEMPORAL_DO × TIME interaction...") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "none") # Only 2 levels each -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "none") # Only 2 levels each -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (SLOWER: 2×4 = 8 combinations) -print("\n=== TIME × DOMAIN INTERACTION ===") -print("Calculating TIME × DOMAIN interaction...") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "tukey") # Faster than Bonferroni -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "none") # Only 2 levels each -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (SLOWER: 2×4 = 8 combinations) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -print("Calculating TEMPORAL_DO × DOMAIN interaction...") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "tukey") # Faster than Bonferroni -print(temporal_domain_simple) - -print("\nSimple Effects of 
TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "none") # Only 2 levels each -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS (SKIP - TOO SLOW) -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -print("SKIPPING three-way interaction calculations due to computational intensity") -print("(2×2×4 = 16 combinations with 8504 observations)") -print("The three-way interaction was non-significant (p = 0.511) anyway") -print("Focus on the significant two-way interactions above.") - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. 
TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===")
-
-# Get simple effects of TIME within each TEMPORAL_DO
-temporal_time_simple_df <- as.data.frame(temporal_time_simple)
-calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
-# Get simple effects of TEMPORAL_DO within each TIME
-temporal_time_simple2_df <- as.data.frame(temporal_time_simple2)
-calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===")
-
-# Get simple effects of TIME within each DOMAIN
-time_domain_simple2_df <- as.data.frame(time_domain_simple2)
-calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE")
-
-# Get simple effects of DOMAIN within each TIME
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===")
-
-# Get simple effects of TEMPORAL_DO within each DOMAIN
-temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2)
-calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE")
-
-# Get simple effects of DOMAIN within each TEMPORAL_DO
-temporal_domain_simple_df <- as.data.frame(temporal_domain_simple)
-calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
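-# =============================================================================
-# OPTIONAL: CELL-WISE COHEN'S D FOR THE THREE-WAY DESIGN (ADDED SKETCH)
-# =============================================================================
-
-# Added sketch, not part of the original analysis: the "more complex
-# calculations" that section 4 below alludes to could be approximated by
-# computing an (unpaired) Cohen's d for the TIME effect from the raw data
-# within each TEMPORAL_DO × DOMAIN cell, mirroring what
-# calculate_cohens_d_for_pairs() already does for the two-way interactions.
-# Assumes long_data_clean and the effsize package loaded above.
-three_way_cell_d <- long_data_clean %>%
-  group_by(TEMPORAL_DO, DOMAIN) %>%
-  summarise(
-    d_time = cohen.d(MEAN_DIFFERENCE[TIME == "Past"],
-                     MEAN_DIFFERENCE[TIME == "Future"])$estimate,
-    .groups = 'drop'
-  )
-print(three_way_cell_d)
-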
# =============================================================================
-# 4. THREE-WAY INTERACTION COHEN'S D
-# =============================================================================
-
-print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===")
-
-# Guard against the undefined object: the three-way emmeans contrasts were
-# skipped above, so `three_way_contrasts` does not exist in this script
-# (the cell-wise sketch above offers a raw-data alternative)
-if (exists("three_way_contrasts")) {
-  # Get pairwise comparisons for the three-way interaction
-  three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-  # For the three-way interaction the multiple grouping variables would need to
-  # be handled differently, so only the raw contrasts are shown
-  print("Three-way interaction Cohen's d calculations:")
-  print("Note: Cohen's d for three-way interactions requires more complex calculations")
-  print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-  print(three_way_contrasts_df)
-} else {
-  print("Skipped: three_way_contrasts was not computed (three-way analysis skipped above)")
-}
-
diff --git a/.history/eohi1/mixed anova - domain means_20251003124632.r b/.history/eohi1/mixed anova - domain means_20251003124632.r
deleted file mode 100644
index bf4bf61..0000000
--- a/.history/eohi1/mixed anova - domain means_20251003124632.r
+++ /dev/null
@@ -1,624 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex) # For aov_ez (cleaner ANOVA output)
-library(nortest) # For normality tests
-library(emmeans) # For post-hoc comparisons
-library(purrr) # For map functions
-library(effsize) # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x")))
-print(paste("Number of participants:", length(unique(data$pID))))
-
-# Verify the specific variables we need
-required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required domain mean variables found!")
-}
-
-# Define domain mapping
-domain_mapping <- data.frame(
-  variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-               "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
-  time = c(rep("Past", 4), rep("Future", 4)),
-  domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
-  stringsAsFactors = FALSE
-)
-
-# Domain mapping created
-
-# Efficient data pivoting using pivot_longer
-long_data <- data %>%
-  select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>%
-  pivot_longer(
-    cols = all_of(required_vars),
-    names_to = "variable",
-    values_to = "MEAN_DIFFERENCE"
-  ) %>%
-  left_join(domain_mapping, by = "variable") %>%
-  # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping)
-  mutate(
-    TIME = factor(time, levels = c("Past", "Future")),
-    DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")),
-    pID = as.factor(pID),
-    TEMPORAL_DO = as.factor(TEMPORAL_DO)
-  ) %>%
-  # Select final columns and remove any rows with
missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
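-# NOTE (illustrative sketch, not part of the original script): ad.test() from
-# nortest returns an htest object, so the statistic/p-value extraction used
-# above can be sanity-checked on simulated normal data:
-set.seed(42)
-sim_ad <- nortest::ad.test(rnorm(100))
-print(sim_ad$statistic)  # the Anderson-Darling A statistic
-print(sim_ad$p.value)    # non-significant for truly normal draws about 95% of the time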
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- 
map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) 
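-# NOTE (illustrative sketch, not part of the original script): the corrected
-# tests below scale both degrees of freedom by the epsilon estimate. With a
-# hypothetical GG epsilon of 0.85 for an effect tested at F(3, 1272)
-# (made-up df, for illustration only):
-gg_eps <- 0.85            # hypothetical epsilon
-df_n   <- 3 * gg_eps      # corrected numerator df: 2.55
-df_d   <- 1272 * gg_eps   # corrected denominator df: 1081.2
-# The corrected p-value would then be pf(F_obs, df_n, df_d, lower.tail = FALSE)
-# for the observed F_obs.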
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts 
<- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS (SKIP - TOO SLOW) -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -print("SKIPPING three-way interaction calculations due to computational intensity") -print("(2×2×4 = 16 combinations with 8504 observations)") -print("The three-way interaction was non-significant (p = 0.511) anyway") -print("Focus on the significant two-way interactions above.") - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 
2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===")
-
-# Get simple effects of TEMPORAL_DO within each DOMAIN
-temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2)
-calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE")
-
-# Get simple effects of DOMAIN within each TEMPORAL_DO
-temporal_domain_simple_df <- as.data.frame(temporal_domain_simple)
-calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 4. THREE-WAY INTERACTION COHEN'S D
-# =============================================================================
-
-print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===")
-
-# Get pairwise comparisons for the three-way interaction
-# NOTE: this version of the script skips the three-way emmeans analysis above,
-# so three_way_contrasts is never created; guard the reference so the script
-# runs to completion instead of stopping with an "object not found" error.
-if (exists("three_way_contrasts")) {
-  three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-  # For three-way interaction, we need to handle the multiple grouping variables differently
-  print("Three-way interaction Cohen's d calculations:")
-  print("Note: Cohen's d for three-way interactions requires more complex calculations")
-  print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-  print(three_way_contrasts_df)
-} else {
-  print("Three-way contrasts not computed (analysis skipped above); no Cohen's d reported.")
-}
-
diff --git a/.history/eohi1/mixed anova - domain means_20251003124639.r b/.history/eohi1/mixed anova - domain means_20251003124639.r
deleted file mode 100644
index 409f80b..0000000
--- a/.history/eohi1/mixed anova - domain means_20251003124639.r
+++ /dev/null
@@ -1,629 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex)       # For aov_ez (cleaner ANOVA output)
-library(nortest)    # For normality tests
-library(emmeans)    # For post-hoc comparisons
-library(purrr)      # For map functions
-library(effsize)    # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x")))
-print(paste("Number of participants:", length(unique(data$pID))))
-
-# Verify the specific variables we need
-required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required domain mean variables found!")
-}
-
-# Define domain mapping
-domain_mapping <- data.frame(
-  variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-               "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
-  time = c(rep("Past", 4), rep("Future", 4)),
-  domain =
rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- 
function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# 
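-# NOTE (illustrative sketch, not part of the original script): standalone use
-# of bootstrap_hartley_critical() on simulated two-group data (values made up):
-set.seed(1)
-sim <- data.frame(grp = rep(c("A", "B"), each = 50),
-                  y   = c(rnorm(50, sd = 1.0), rnorm(50, sd = 1.5)))
-sim_res <- bootstrap_hartley_critical(sim, "grp", "y", n_iter = 500)
-print(sim_res$observed_ratio > sim_res$critical_95)  # TRUE flags heterogeneity
-# (The data[rows, col] indexing inside the function relies on base data.frame
-# drop semantics; a tibble would return a one-column tibble rather than a vector.)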
============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# 
============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all 
combinations:") -print(three_way_emmeans) - -# Pairwise comparisons of TIME (Past vs Future) within each TEMPORAL_DO × DOMAIN combination -print("\nPairwise comparisons of TIME (Past - Future) within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. 
TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -# For three-way interaction, we need to handle the multiple grouping variables differently -print("Three-way interaction Cohen's d calculations:") -print("Note: Cohen's d for three-way interactions requires more complex calculations") -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - diff --git a/.history/eohi1/mixed anova - domain means_20251003124643.r b/.history/eohi1/mixed anova - domain means_20251003124643.r deleted file mode 100644 index 409f80b..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003124643.r +++ /dev/null @@ -1,629 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the 
specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. 
Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = 
TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, 
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check data dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$pID))))
-print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (",
-            length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (",
-            length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = ""))
-
-# Check for complete cases
-complete_cases <- sum(complete.cases(long_data_clean))
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean)))
-
-# Check if design is balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance %in% c(0, 1))) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN combinations")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = MEAN_DIFFERENCE,
-                             wid = pID,
-                             between = TEMPORAL_DO,
-                             within = .(TIME, DOMAIN),
-                             type = 3,
-                             detailed = TRUE)
-
-print("ANOVA Results:")
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL # Reset row numbers to be sequential
-print(anova_output)
-
-# Show Mauchly's test for sphericity
-print("\nMauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  cat("\n=== CORRECTED F-TESTS ===\n")
-  for(i in seq_len(nrow(corrected_df))) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original:     F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  print("\nNote: Sphericity corrections not needed (sphericity assumption met)")
-}
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Extract generalized eta squared from ezANOVA (already calculated)
-effect_sizes <- 
mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# 
============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons of TIME (Past vs Future) within each TEMPORAL_DO × DOMAIN combination -print("\nPairwise comparisons of TIME (Past - Future) within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. 
TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -# For three-way interaction, we need to handle the multiple grouping variables differently -print("Three-way interaction Cohen's d calculations:") -print("Note: Cohen's d for three-way interactions requires more complex calculations") -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - diff --git a/.history/eohi1/mixed anova - domain means_20251003124942.r b/.history/eohi1/mixed anova - domain means_20251003124942.r deleted file mode 100644 index b848014..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003124942.r +++ /dev/null @@ -1,671 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with 
missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-# First, let's check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===")
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # Calculate F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# Bootstrap function for Hartley's F-max test (resamples within each group)
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get the unique group levels
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!rlang::sym(group_var)) %>%
-    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Guard against zero or missing variances so the ratio stays finite
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Extract each group's response values once, as a plain vector
-  # (data[rows, col] would return a one-column tibble when called via pick(),
-  # which breaks sample(); [[column]][rows] works for data frames and tibbles)
-  group_data_list <- map(groups, ~ {
-    group_data <- data[[response_var]][data[[group_var]] == .x]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- 
map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) 
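-# Both corrections estimate a sphericity parameter epsilon (the GGe / HFe
-# columns below) and shrink the numerator and denominator df before the
-# p-value is recomputed. A minimal sketch with hypothetical values
-# F = 4.2 on (3, 297) df and epsilon = 0.85 (illustrative only):
-if (FALSE) {
-  pf(4.2, 3 * 0.85, 297 * 0.85, lower.tail = FALSE) # corrected p-value
-}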
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts 
<- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons of TIME (Past vs Future) within each TEMPORAL_DO × DOMAIN combination -print("\nPairwise comparisons of TIME (Past - Future) within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d 
calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - diff --git a/.history/eohi1/mixed anova - domain means_20251003124948.r b/.history/eohi1/mixed anova - domain means_20251003124948.r deleted file mode 100644 index b848014..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003124948.r +++ /dev/null @@ -1,671 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - 
-# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics 
by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- 
map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) 
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts 
<- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons of TIME (Past vs Future) within each TEMPORAL_DO × DOMAIN combination -print("\nPairwise comparisons of TIME (Past - Future) within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d 
calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - diff --git a/.history/eohi1/mixed anova - domain means_20251003124952.r b/.history/eohi1/mixed anova - domain means_20251003124952.r deleted file mode 100644 index b848014..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003124952.r +++ /dev/null @@ -1,671 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - 
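-# ---------------------------------------------------------------------------
-# Sketch (for reference): the arithmetic behind effsize::cohen.d() for two
-# independent samples, as used by the Cohen's d sections later in this script.
-# The vectors below are made-up illustrative values, not data from exp1.csv.
-# d = (mean1 - mean2) / s_pooled, with s_pooled from the two sample variances.
-g1 <- c(0.42, 0.51, 0.38, 0.47)   # toy sample 1 (hypothetical)
-g2 <- c(0.30, 0.36, 0.28, 0.33)   # toy sample 2 (hypothetical)
-s_pooled <- sqrt(((length(g1) - 1) * var(g1) + (length(g2) - 1) * var(g2)) /
-                   (length(g1) + length(g2) - 2))
-d_manual <- (mean(g1) - mean(g2)) / s_pooled
-# Should agree with effsize::cohen.d(g1, g2)$estimate (no Hedges correction)
-print(d_manual)
-# ---------------------------------------------------------------------------
-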
-# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics 
by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- 
map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) 
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts 
<- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons of TIME (Past vs Future) within each TEMPORAL_DO × DOMAIN combination -print("\nPairwise comparisons of TIME (Past - Future) within each TEMPORAL_DO × DOMAIN combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d 
calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - diff --git a/.history/eohi1/mixed anova - domain means_20251003125302.r b/.history/eohi1/mixed anova - domain means_20251003125302.r deleted file mode 100644 index e26328d..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003125302.r +++ /dev/null @@ -1,665 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - 
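-# ---------------------------------------------------------------------------
-# Sketch (for reference): how the epsilon-corrected F-tests reported later in
-# this script are derived. Epsilon, F, and the df below are hypothetical
-# placeholders, not results from exp1.csv.
-F_obs <- 4.20                      # hypothetical observed F
-DFn <- 3; DFd <- 297               # hypothetical uncorrected df (e.g., DOMAIN)
-eps_gg <- 0.85                     # hypothetical Greenhouse-Geisser epsilon
-# Both df are scaled by epsilon; the F statistic itself is unchanged.
-p_gg <- pf(F_obs, eps_gg * DFn, eps_gg * DFd, lower.tail = FALSE)
-cat(sprintf("GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-            eps_gg * DFn, eps_gg * DFd, F_obs, p_gg))
-# ---------------------------------------------------------------------------
-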
-# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics 
by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- 
map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) 
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts 
<- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS (SIMPLIFIED) -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -print("Note: Three-way interaction was non-significant (p = 0.511)") -print("Skipping detailed three-way comparisons due to computational intensity") -print("Focus on the significant two-way interactions above.") - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- 
trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D
-# =============================================================================
-
-print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===")
-
-# NOTE: this version removed the detailed three-way emmeans section above, so
-# the contrasts it relied on must be computed here before they can be used
-three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN)
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni")
-
-# Get pairwise comparisons for the three-way interaction
-three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-print(three_way_contrasts_df)
-
-# Calculate Cohen's d for significant three-way interaction effects
-print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-
-# Extract significant comparisons (p < 0.05)
-significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ]
-
-if(nrow(significant_three_way) > 0) {
-  for(i in seq_len(nrow(significant_three_way))) {
-    comparison <- significant_three_way[i, ]
-
-    # Extract the grouping variables
-    temporal_do_level <- as.character(comparison$TEMPORAL_DO)
-    domain_level <- as.character(comparison$DOMAIN)
-
-    # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination
-    past_data <- long_data_clean$MEAN_DIFFERENCE[
-      long_data_clean$TEMPORAL_DO == temporal_do_level &
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$TIME == "Past"
-    ]
-
-    future_data <- long_data_clean$MEAN_DIFFERENCE[
-      long_data_clean$TEMPORAL_DO == temporal_do_level &
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$TIME == "Future"
-    ]
-
-    if(length(past_data) > 0 && length(future_data) > 0) {
-      # Calculate Cohen's d using effsize package
-      cohens_d_result <- cohen.d(past_data, future_data)
-
-      cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level))
-      cat(sprintf("  Past vs Future comparison\n"))
-      cat(sprintf("  n_Past = %d, n_Future = %d\n", length(past_data), length(future_data)))
-      cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-      cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-      cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-      cat(sprintf("  Estimated difference: %.5f\n", comparison$estimate))
-      cat("\n")
-    }
-  }
-} else {
-  cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n")
-}
-
diff --git a/.history/eohi1/mixed anova - domain means_20251003125335.r b/.history/eohi1/mixed anova - domain means_20251003125335.r
deleted file mode 100644
index c079049..0000000
--- a/.history/eohi1/mixed anova - domain means_20251003125335.r
+++ /dev/null
@@ -1,668 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex) # For aov_ez (cleaner ANOVA output)
-library(nortest) # For normality tests
-library(emmeans) # For post-hoc comparisons
-library(purrr) # For map functions
-library(effsize) # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x")))
-print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing 
values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - 
fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if 
design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared 
from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS (SIMPLIFIED) -# 
============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -print("Note: Three-way interaction was non-significant (p = 0.511)") -print("Skipping detailed three-way comparisons due to computational intensity") -print("Focus on the significant two-way interactions above.") - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. 
TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -print("\n=== ANALYSIS COMPLETE ===") -print("All significant and marginal effects have been analyzed with Cohen's d calculations.") - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", 
cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - diff --git a/.history/eohi1/mixed anova - domain means_20251003130013.r b/.history/eohi1/mixed anova - domain means_20251003130013.r deleted file mode 100644 index c079049..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003130013.r +++ /dev/null @@ -1,668 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# 
============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- 
map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) 
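# The corrections printed below both work the same way: the epsilon estimate
# (GGe or HFe) scales the numerator and denominator df of the uncorrected F
# test, and the p-value is recomputed from the scaled F distribution. Only
# effects involving DOMAIN (4 levels) can violate sphericity here; pure TIME
# effects have 1 df, for which sphericity holds trivially. A minimal sketch
# with hypothetical inputs:
epsilon_corrected_p <- function(f_value, df_n, df_d, epsilon) {
  # Same F statistic, shrunken degrees of freedom
  pf(f_value, df_n * epsilon, df_d * epsilon, lower.tail = FALSE)
}
# e.g. epsilon_corrected_p(4.2, 3, 300, 0.85) recomputes p from F(2.55, 255)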
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts 
<- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS (SIMPLIFIED) -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -print("Note: Three-way interaction was non-significant (p = 0.511)") -print("Skipping detailed three-way comparisons due to computational intensity") -print("Focus on the significant two-way interactions above.") - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- 
trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -print("\n=== ANALYSIS COMPLETE ===") -print("All significant and marginal effects have been analyzed with Cohen's d calculations.") - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - diff --git a/.history/eohi1/mixed anova - domain means_20251003130111.r b/.history/eohi1/mixed anova - domain means_20251003130111.r deleted file mode 100644 index c079049..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003130111.r +++ /dev/null @@ -1,668 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") 
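# Note for this snapshot (and the snapshots above, which contain the same
# code): the "THREE-WAY INTERACTION COHEN'S D" section near the end calls
# as.data.frame(three_way_contrasts), but three_way_contrasts is never
# assigned anywhere in the script, so execution stops at that point with an
# "object 'three_way_contrasts' not found" error. A minimal sketch of the
# missing step, mirroring the two-way emmeans calls (the aov_model argument
# is the Error()-stratified model fitted further down):
make_three_way_contrasts <- function(aov_model) {
  three_way_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * DOMAIN)
  # TIME contrast (Past - Future) within each TEMPORAL_DO x DOMAIN cell
  pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni")
}
# Intended use, once aov_model exists:
#   three_way_contrasts <- make_three_way_contrasts(aov_model)
# Converted with as.data.frame(), the result carries TEMPORAL_DO, DOMAIN,
# estimate, and p.value columns, which is what the downstream Cohen's d loop
# indexes.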
- -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for 
assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each 
TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- 
sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# 
============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") 
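-# Optional cross-check (sketch): interaction contrasts test the
-# difference-of-differences directly, complementing the simple effects above.
-# Assumes the emmGrid objects built earlier in this section:
-print(contrast(temporal_domain_emmeans, interaction = "pairwise"))
-print(contrast(time_domain_emmeans, interaction = "pairwise"))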
-print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS (SIMPLIFIED) -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -print("Note: Three-way interaction was non-significant (p = 0.511)") -print("Skipping detailed three-way comparisons due to computational intensity") -print("Focus on the significant two-way interactions above.") - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. 
TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===")
-
-# Get simple effects of TIME within each DOMAIN
-time_domain_simple2_df <- as.data.frame(time_domain_simple2)
-calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE")
-
-# Get simple effects of DOMAIN within each TIME
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===")
-
-# Get simple effects of TEMPORAL_DO within each DOMAIN
-temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2)
-calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE")
-
-# Get simple effects of DOMAIN within each TEMPORAL_DO
-temporal_domain_simple_df <- as.data.frame(temporal_domain_simple)
-calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 4. THREE-WAY INTERACTION COHEN'S D
-# =============================================================================
-
-print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===")
-
-# Get pairwise comparisons for the three-way interaction.
-# Bug fix: 'three_way_contrasts' was referenced below but never created (the
-# detailed three-way comparisons were skipped above), so define the TIME
-# contrasts within each TEMPORAL_DO × DOMAIN cell before converting them:
-three_way_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO * DOMAIN)
-three_way_contrasts <- pairs(three_way_emmeans, adjust = "bonferroni")
-three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-print(three_way_contrasts_df)
-
-print("\n=== ANALYSIS COMPLETE ===")
-print("All significant and marginal effects have been analyzed with Cohen's d calculations.")
-
-# Calculate Cohen's d for significant three-way interaction effects
-print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-
-# Extract significant comparisons (p < 0.05)
-significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ]
-
-if(nrow(significant_three_way) > 0) {
-  for(i in seq_len(nrow(significant_three_way))) {
-    comparison <- significant_three_way[i, ]
-
-    # Extract the grouping variables
-    temporal_do_level <- as.character(comparison$TEMPORAL_DO)
-    domain_level <- as.character(comparison$DOMAIN)
-
-    # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination
-    past_data <- long_data_clean$MEAN_DIFFERENCE[
-      long_data_clean$TEMPORAL_DO == temporal_do_level &
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$TIME == "Past"
-    ]
-
-    future_data <- long_data_clean$MEAN_DIFFERENCE[
-      long_data_clean$TEMPORAL_DO == temporal_do_level &
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$TIME == "Future"
-    ]
-
-    if(length(past_data) > 0 && length(future_data) > 0) {
-      # Calculate Cohen's d using effsize package
-      cohens_d_result <- cohen.d(past_data, future_data)
-
-      cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level))
-      cat(sprintf("  Past vs Future comparison\n"))
-      cat(sprintf("  n_Past = %d, n_Future = %d\n", length(past_data), length(future_data)))
-      cat(sprintf("  Cohen's d: %.5f\n",
cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - diff --git a/.history/eohi1/mixed anova - domain means_20251003134651.r b/.history/eohi1/mixed anova - domain means_20251003134651.r deleted file mode 100644 index 8b09bf7..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003134651.r +++ /dev/null @@ -1,728 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# 
============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- 
map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) 
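-# An optional cross-check before displaying them (sketch): afex was loaded
-# above, and aov_ez() applies the Greenhouse-Geisser correction automatically.
-# With only two TIME levels, sphericity is at issue only for effects involving
-# DOMAIN. Assuming the same long_data_clean:
-afex_model <- aov_ez(id = "pID", dv = "MEAN_DIFFERENCE", data = long_data_clean,
-                     between = "TEMPORAL_DO", within = c("TIME", "DOMAIN"), type = 3)
-print(afex_model)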
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts 
<- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS (SIMPLIFIED) -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -print("Note: Three-way interaction was non-significant (p = 0.511)") -print("Skipping detailed three-way comparisons due to computational intensity") -print("Focus on the significant two-way interactions above.") - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (significant: p < 0.001) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - 
cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", 
cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D
-# =============================================================================
-
-print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===")
-
-# Get pairwise comparisons for the three-way interaction.
-# Bug fix: as in the previous snapshot, 'three_way_contrasts' was referenced
-# below but never created, so define the TIME contrasts within each
-# TEMPORAL_DO × DOMAIN cell before converting them:
-three_way_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO * DOMAIN)
-three_way_contrasts <- pairs(three_way_emmeans, adjust = "bonferroni")
-three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-print(three_way_contrasts_df)
-
-print("\n=== ANALYSIS COMPLETE ===")
-print("All significant and marginal effects have been analyzed with Cohen's d calculations.")
-
-# Calculate Cohen's d for significant three-way interaction effects
-print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-
-# Extract significant comparisons (p < 0.05)
-significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ]
-
-if(nrow(significant_three_way) > 0) {
-  for(i in seq_len(nrow(significant_three_way))) {
-    comparison <- significant_three_way[i, ]
-
-    # Extract the grouping variables
-    temporal_do_level <- as.character(comparison$TEMPORAL_DO)
-    domain_level <- as.character(comparison$DOMAIN)
-
-    # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination
-    past_data <- long_data_clean$MEAN_DIFFERENCE[
-      long_data_clean$TEMPORAL_DO == temporal_do_level &
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$TIME == "Past"
-    ]
-
-    future_data <- long_data_clean$MEAN_DIFFERENCE[
-      long_data_clean$TEMPORAL_DO == temporal_do_level &
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$TIME == "Future"
-    ]
-
-    if(length(past_data) > 0 && length(future_data) > 0) {
-      # Calculate Cohen's d using effsize package
-      cohens_d_result <- cohen.d(past_data, future_data)
-
-      cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level))
-      cat(sprintf("  Past vs Future comparison\n"))
-      cat(sprintf("  n_Past = %d, n_Future = %d\n", length(past_data), length(future_data)))
-      cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-      cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-      cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-      cat(sprintf("  Estimated difference: %.5f\n", comparison$estimate))
-      cat("\n")
-    }
-  }
-} else {
-  cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n")
-}
-
diff --git a/.history/eohi1/mixed anova - domain means_20251003134713.r b/.history/eohi1/mixed anova - domain means_20251003134713.r
deleted file mode 100644
index 8b09bf7..0000000
--- a/.history/eohi1/mixed anova - domain means_20251003134713.r
+++ /dev/null
@@ -1,728 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex)       # For aov_ez (cleaner ANOVA output)
-library(nortest)    # For normality tests
-library(emmeans)    # For post-hoc comparisons
-library(purrr)      # For map functions
-library(effsize)    # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
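-# Quick sanity check (sketch): stop early, with a clear message, if the key
-# design columns are missing, rather than failing later mid-pipeline. The
-# column names are the ones this script relies on throughout:
-stopifnot("pID" %in% colnames(data), "TEMPORAL_DO" %in% colnames(data),
-          "ResponseId" %in% colnames(data))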
- -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for 
assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each 
TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- 
sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# 
============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") 
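-# --- Editor's note (hedged sketch; not in the original snapshot) -------------
-# The Cohen's d values computed in the sections below compare Past vs Future
-# observations as if they came from independent groups, but TIME is a
-# within-subjects factor here. A paired effect size is usually more
-# appropriate; a minimal sketch using the already-loaded effsize and
-# tidyverse packages, assuming one aggregated score per pID per TIME
-# (the names paired_wide and d_paired are illustrative):
-# paired_wide <- long_data_clean %>%
-#   group_by(pID, TIME) %>%
-#   summarise(m = mean(MEAN_DIFFERENCE), .groups = "drop") %>%
-#   pivot_wider(names_from = TIME, values_from = m)
-# d_paired <- cohen.d(paired_wide$Past, paired_wide$Future, paired = TRUE)
-# ------------------------------------------------------------------------------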
-print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS (SIMPLIFIED) -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -print("Note: Three-way interaction was non-significant (p = 0.511)") -print("Skipping detailed three-way comparisons due to computational intensity") -print("Focus on the significant two-way interactions above.") - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (significant: p < 0.001) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, 
group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -# FIX: three_way_contrasts was never created above (the detailed three-way comparisons were skipped), so the as.data.frame() call below would fail with an undefined-object error. Reconstructed here from the stated intent: TIME contrasts within each TEMPORAL_DO × DOMAIN combination. -three_way_emmeans <- emmeans(aov_model, ~ TIME * TEMPORAL_DO * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# FIX: completion banner moved after the loop so the script no longer announces completion before the Cohen's d calculations run -print("\n=== ANALYSIS COMPLETE ===") -print("All significant and marginal effects have been analyzed with Cohen's d calculations.") - diff --git a/.history/eohi1/mixed anova - domain means_20251003134732.r b/.history/eohi1/mixed anova - domain means_20251003134732.r deleted file mode 100644 index fa78b47..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003134732.r +++ /dev/null @@ -1,744 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# 
Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, 
na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- 
map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) 
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - - # First, show main effects (no sphericity corrections needed) - main_effects <- c("TIME", "DOMAIN", "TEMPORAL_DO") - for(effect in main_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - } - - # Then show within-subjects effects with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) 
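-# --- Editor's note (hedged sketch; not in the original snapshot) -------------
-# afex is loaded above "for aov_ez" but never called. For reference, the same
-# mixed design can be specified with aov_ez, which applies type-3 sums of
-# squares and sphericity corrections itself, and whose fitted object is also
-# accepted directly by emmeans (a sketch under the same column names;
-# afex_model is an illustrative name):
-# afex_model <- afex::aov_ez(id = "pID", dv = "MEAN_DIFFERENCE",
-#                            data = long_data_clean,
-#                            between = "TEMPORAL_DO",
-#                            within = c("TIME", "DOMAIN"), type = 3)
-# emmeans(afex_model, ~ TIME)
-# ------------------------------------------------------------------------------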
-print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS (SIMPLIFIED) -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -print("Note: Three-way interaction was non-significant (p = 0.511)") -print("Skipping detailed three-way comparisons due to computational intensity") -print("Focus on the significant two-way interactions above.") - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (significant: p < 0.001) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- 
as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == 
group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -# FIX: same undefined-object bug as in the previous snapshot; three_way_contrasts was never created, so it is reconstructed from the stated intent (TIME contrasts within each TEMPORAL_DO × DOMAIN combination). -three_way_emmeans <- emmeans(aov_model, ~ TIME * TEMPORAL_DO * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -# FIX: completion banner moved after the loop, as in the previous snapshot -print("\n=== ANALYSIS COMPLETE ===") -print("All significant and marginal effects have been analyzed with Cohen's d calculations.") - diff --git a/.history/eohi1/mixed anova - domain means_20251003134803.r b/.history/eohi1/mixed anova - domain means_20251003134803.r deleted file mode 100644 index cf25c5e..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003134803.r +++ /dev/null @@ -1,766 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") 
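-# --- Editor's note (hedged sketch; not in the original snapshot) -------------
-# Why the options(contrasts = c("contr.sum", "contr.poly")) call just above
-# matters: type-3 sums of squares are only meaningful with sum-to-zero factor
-# contrasts, and R's default treatment contrasts would distort the type-3
-# main-effect tests. A quick sanity check that the option took effect before
-# any model is fitted:
-# stopifnot(getOption("contrasts")[[1]] == "contr.sum")
-# ------------------------------------------------------------------------------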
- -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for 
assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each 
TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- 
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check data dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$pID))))
-print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (",
-            length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (",
-            length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = ""))
-
-# Check for complete cases
-complete_cases <- sum(complete.cases(long_data_clean))
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean)))
-
-# Check if design is balanced: every participant should contribute exactly one
-# observation to every TIME × DOMAIN cell (a count of 0 would mean a missing cell)
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance == 1)) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN combinations")
-} else {
-  print("Warning: Design is unbalanced (missing or duplicated cells)")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = MEAN_DIFFERENCE,
-                             wid = pID,
-                             between = TEMPORAL_DO,
-                             within = .(TIME, DOMAIN),
-                             type = 3,
-                             detailed = TRUE)
-
-print("ANOVA Results:")
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL # Reset row numbers to be sequential
-print(anova_output)
-
-# Show Mauchly's test for sphericity
-print("\nMauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
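Since afex is loaded above, the ezANOVA result can be cross-checked with aov_ez(), which also uses type-3 sums of squares and, by default, reports Greenhouse-Geisser-corrected tests for multi-level within-subjects factors. A sketch under those assumptions (m_afex is a hypothetical name):

m_afex <- afex::aov_ez(
  id = "pID",
  dv = "MEAN_DIFFERENCE",
  data = long_data_clean,
  between = "TEMPORAL_DO",
  within = c("TIME", "DOMAIN"),
  type = 3
)
print(m_afex)   # F table; GG correction applied to within factors by default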
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  cat("\n=== CORRECTED F-TESTS ===\n")
-
-  # Between-subjects effects (no sphericity corrections needed)
-  cat("\nBETWEEN-SUBJECTS EFFECTS:\n")
-  between_effects <- c("TEMPORAL_DO")
-  for(effect in between_effects) {
-    if(effect %in% anova_table$Effect) {
-      f_value <- anova_table$F[anova_table$Effect == effect]
-      dfn <- anova_table$DFn[anova_table$Effect == effect]
-      dfd <- anova_table$DFd[anova_table$Effect == effect]
-      p_value <- anova_table$p[anova_table$Effect == effect]
-
-      cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value))
-    }
-  }
-
-  # Within-subjects effects (sphericity corrections where applicable)
-  cat("\nWITHIN-SUBJECTS EFFECTS:\n")
-
-  # TIME main effect (2 levels, sphericity automatically satisfied)
-  if("TIME" %in% anova_table$Effect) {
-    f_value <- anova_table$F[anova_table$Effect == "TIME"]
-    dfn <- anova_table$DFn[anova_table$Effect == "TIME"]
-    dfd <- anova_table$DFd[anova_table$Effect == "TIME"]
-    p_value <- anova_table$p[anova_table$Effect == "TIME"]
-    cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value))
-  }
-
-  # DOMAIN main effect (4 levels; the uncorrected test is shown here,
-  # the GG/HF-corrected p-values follow below)
-  if("DOMAIN" %in% anova_table$Effect) {
-    f_value <- anova_table$F[anova_table$Effect == "DOMAIN"]
-    dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"]
-    dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"]
-    p_value <- anova_table$p[anova_table$Effect == "DOMAIN"]
-    cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f (uncorrected)\n", dfn, dfd, f_value, p_value))
-  }
-
-  # DOMAIN and the interactions involving it, with sphericity corrections
-  cat("\nEFFECTS WITH SPHERICITY CORRECTIONS:\n")
-  for(i in seq_len(nrow(corrected_df))) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original:     F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  print("\nNote: Sphericity corrections not needed (sphericity assumption met)")
-}
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Extract generalized eta squared from ezANOVA (already calculated)
-effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")]
-effect_sizes$ges <- round(effect_sizes$ges, 5)
-print("Generalized Eta Squared:")
-print(effect_sizes)
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Fit an aov model for emmeans (emmeans needs a fitted model object, not ezANOVA output)
-aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                 data = long_data_clean)
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print("Estimated Marginal Means:")
-print(time_emmeans)
-print("\nPairwise Contrasts:")
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
-print("Estimated Marginal Means:")
-print(domain_emmeans)
-print("\nPairwise Contrasts:")
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
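The Bonferroni-adjusted pairwise tests above report t-ratios and p-values only; the same emmGrid objects can also yield adjusted confidence intervals, which are often worth reporting alongside the tests. A sketch using emmeans' documented summary() interface:

# infer = c(TRUE, TRUE) requests both confidence intervals and tests
summary(pairs(domain_emmeans, adjust = "bonferroni"), infer = c(TRUE, TRUE))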
-
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TEMPORAL_DO × TIME Interaction
-print("\n=== TEMPORAL_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO:")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each TIME:")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# TIME × DOMAIN Interaction
-print("\n=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TIME:")
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-
-print("\nSimple Effects of TIME within each DOMAIN:")
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(time_domain_simple2)
-
-# TEMPORAL_DO × DOMAIN Interaction
-print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===")
-temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN)
-print("Estimated Marginal Means:")
-print(temporal_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:")
-temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:")
-temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(temporal_domain_simple2)
-
-# =============================================================================
-# THREE-WAY INTERACTION ANALYSIS
-# =============================================================================
-
-print("\n=== THREE-WAY INTERACTION ANALYSIS ===")
-print("Note: Three-way interaction was non-significant (p = 0.511)")
-print("The TIME contrasts within each TEMPORAL_DO × DOMAIN cell are computed below")
-print("for descriptive purposes; focus on the significant two-way interactions above.")
-
-# TIME (Past vs Future) contrasts within each TEMPORAL_DO × DOMAIN combination;
-# these feed the Cohen's d calculations at the end of the script
-three_way_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO * DOMAIN)
-three_way_contrasts <- pairs(three_way_emmeans, adjust = "bonferroni")
-
-# =============================================================================
-# COHEN'S D FOR MAIN EFFECTS
-# =============================================================================
-
-print("\n=== COHEN'S D FOR MAIN EFFECTS ===")
-
-# Main Effect of TIME (significant: p < 0.001)
-print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===")
-time_main_contrast <- pairs(time_emmeans, adjust = "none")
-time_main_df <- as.data.frame(time_main_contrast)
-print("TIME main effect contrast:")
-print(time_main_df)
-
-# Calculate Cohen's d for TIME main effect
-if(nrow(time_main_df) > 0) {
-  cat("\nCohen's d for TIME main effect:\n")
-  time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"]
-  time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"]
-
-  time_cohens_d <- cohen.d(time_past_data, time_future_data)
-  cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data)))
-  cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate))
-  cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude))
-  cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1]))
-}
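One caveat on the d above: TIME is a within-subjects factor, yet cohen.d() receives the two pooled vectors as if they were independent groups. effsize also supports a paired d; a sketch of that alternative (wide_time is a hypothetical name; aggregating to one Past and one Future score per participant aligns the vectors by pID, and assumes every participant has both):

wide_time <- long_data_clean %>%
  group_by(pID, TIME) %>%
  summarise(m = mean(MEAN_DIFFERENCE), .groups = "drop") %>%
  pivot_wider(names_from = TIME, values_from = m)

# Paired Cohen's d for Past vs Future (rows aligned by participant)
cohen.d(wide_time$Past, wide_time$Future, paired = TRUE)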
-
-# Main Effect of DOMAIN (significant: p < 0.001)
-print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===")
-domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni")
-domain_main_df <- as.data.frame(domain_main_contrast)
-print("DOMAIN main effect contrasts:")
-print(domain_main_df)
-
-# Calculate Cohen's d for significant DOMAIN contrasts
-significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ]
-if(nrow(significant_domain) > 0) {
-  cat("\nCohen's d for significant DOMAIN contrasts:\n")
-  for(i in seq_len(nrow(significant_domain))) {
-    contrast_name <- as.character(significant_domain$contrast[i])
-    contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-    if(length(contrast_parts) == 2) {
-      level1 <- trimws(contrast_parts[1])
-      level2 <- trimws(contrast_parts[2])
-
-      data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1]
-      data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2]
-
-      if(length(data1) > 0 && length(data2) > 0) {
-        domain_cohens_d <- cohen.d(data1, data2)
-        cat(sprintf("Comparison: %s\n", contrast_name))
-        cat(sprintf("  n1 = %d, n2 = %d\n", length(data1), length(data2)))
-        cat(sprintf("  Cohen's d: %.5f\n", domain_cohens_d$estimate))
-        cat(sprintf("  Effect size interpretation: %s\n", domain_cohens_d$magnitude))
-        cat(sprintf("  p-value: %.5f\n", significant_domain$p.value[i]))
-        cat("\n")
-      }
-    }
-  }
-}
-
-# =============================================================================
-# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS
-# =============================================================================
-
-# Cohen's d calculations (library already loaded)
-
-print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===")
-
-# Function to calculate Cohen's d for pairwise comparisons
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if(nrow(significant_pairs) > 0) {
-    cat("Significant pairwise comparisons (p < 0.05):\n")
-    print(significant_pairs)
-
-    cat("\nCohen's d calculated from raw data:\n")
-
-    for(i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- as.character(comparison$contrast)
-
-      # Parse the contrast
-      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-      if(length(contrast_parts) == 2) {
-        level1 <- trimws(contrast_parts[1])
-        level2 <- trimws(contrast_parts[2])
-
-        # Get raw data for both conditions
-        if(group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][
-            data[[group1_var]] == level1 &
-            data[[group2_var]] == group2_level]
-
-          data2 <- data[[response_var]][
-            data[[group1_var]] == level2 &
-            data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if(length(data1) > 0 && length(data2) > 0) {
-          # Calculate Cohen's d using effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          cat(sprintf("Comparison: %s", contrast_name))
-          if(group2_var %in% colnames(comparison)) {
-            cat(sprintf(" | %s", group2_level))
-          }
-          cat(sprintf("\n  n1 = %d, n2 = %d\n", length(data1), length(data2)))
-          cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-          cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-          cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-          cat("\n")
-        }
-      }
-    }
-  } else {
-    cat("No significant pairwise comparisons found.\n")
-  }
-}
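The helper prints point estimates only, but the effsize result object also carries a 95% confidence interval; a sketch of extracting it for one contrast (d_values_life is a hypothetical name; "Values" and "Life" are DOMAIN levels defined above):

d_values_life <- cohen.d(
  long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == "Values"],
  long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == "Life"]
)
d_values_life$conf.int   # lower and upper bounds of the 95% CI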
-
-# =============================================================================
-# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===")
-
-# Get simple effects of TIME within each TEMPORAL_DO
-temporal_time_simple_df <- as.data.frame(temporal_time_simple)
-calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
-# Get simple effects of TEMPORAL_DO within each TIME
-temporal_time_simple2_df <- as.data.frame(temporal_time_simple2)
-calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===")
-
-# Get simple effects of TIME within each DOMAIN
-time_domain_simple2_df <- as.data.frame(time_domain_simple2)
-calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE")
-
-# Get simple effects of DOMAIN within each TIME
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===")
-
-# Get simple effects of TEMPORAL_DO within each DOMAIN
-temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2)
-calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE")
-
-# Get simple effects of DOMAIN within each TEMPORAL_DO
-temporal_domain_simple_df <- as.data.frame(temporal_domain_simple)
-calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE")
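The six calls above all follow one pattern, so they could equally be driven from a lookup list; a compact equivalent sketch (specs is a hypothetical name; behavior is unchanged):

specs <- list(
  list(df = temporal_time_simple_df,    g1 = "TIME",        g2 = "TEMPORAL_DO"),
  list(df = temporal_time_simple2_df,   g1 = "TEMPORAL_DO", g2 = "TIME"),
  list(df = time_domain_simple2_df,     g1 = "TIME",        g2 = "DOMAIN"),
  list(df = time_domain_simple_df,      g1 = "DOMAIN",      g2 = "TIME"),
  list(df = temporal_domain_simple2_df, g1 = "TEMPORAL_DO", g2 = "DOMAIN"),
  list(df = temporal_domain_simple_df,  g1 = "DOMAIN",      g2 = "TEMPORAL_DO")
)
walk(specs, ~ calculate_cohens_d_for_pairs(.x$df, long_data_clean, .x$g1, .x$g2, "MEAN_DIFFERENCE"))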
-
-# =============================================================================
-# 4. THREE-WAY INTERACTION COHEN'S D
-# =============================================================================
-
-print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===")
-
-# Pairwise comparisons for the three-way interaction (computed above)
-three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-print(three_way_contrasts_df)
-
-# Calculate Cohen's d for significant three-way interaction effects
-print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-
-# Extract significant comparisons (p < 0.05)
-significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ]
-
-if(nrow(significant_three_way) > 0) {
-  for(i in seq_len(nrow(significant_three_way))) {
-    comparison <- significant_three_way[i, ]
-
-    # Extract the grouping variables
-    temporal_do_level <- as.character(comparison$TEMPORAL_DO)
-    domain_level <- as.character(comparison$DOMAIN)
-
-    # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination
-    past_data <- long_data_clean$MEAN_DIFFERENCE[
-      long_data_clean$TEMPORAL_DO == temporal_do_level &
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$TIME == "Past"
-    ]
-
-    future_data <- long_data_clean$MEAN_DIFFERENCE[
-      long_data_clean$TEMPORAL_DO == temporal_do_level &
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$TIME == "Future"
-    ]
-
-    if(length(past_data) > 0 && length(future_data) > 0) {
-      # Calculate Cohen's d using effsize package
-      cohens_d_result <- cohen.d(past_data, future_data)
-
-      cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level))
-      cat(sprintf("  Past vs Future comparison\n"))
-      cat(sprintf("  n_Past = %d, n_Future = %d\n", length(past_data), length(future_data)))
-      cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-      cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-      cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-      cat(sprintf("  Estimated difference: %.5f\n", comparison$estimate))
-      cat("\n")
-    }
-  }
-} else {
-  cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n")
-}
-
-print("\n=== ANALYSIS COMPLETE ===")
-print("All significant and marginal effects have been analyzed with Cohen's d calculations.")
diff --git a/.history/eohi1/mixed anova - domain means_20251003134813.r b/.history/eohi1/mixed anova - domain means_20251003134813.r
deleted file mode 100644
index cf25c5e..0000000
- -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for 
assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each 
TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- 
sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - - # Between-subjects effects (no sphericity corrections needed) - cat("\nBETWEEN-SUBJECTS EFFECTS:\n") - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - cat("\nWITHIN-SUBJECTS EFFECTS:\n") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - 
dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - cat("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ 
TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS (SIMPLIFIED) -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -print("Note: Three-way interaction was non-significant (p = 0.511)") -print("Skipping detailed three-way comparisons due to computational intensity") -print("Focus on the significant two-way interactions above.") - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (significant: p < 0.001) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") 
-print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. 
TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -print("\n=== ANALYSIS COMPLETE ===") -print("All significant and marginal effects have been analyzed with Cohen's d calculations.") - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - diff --git a/.history/eohi1/mixed anova - domain means_20251003134814.r b/.history/eohi1/mixed anova - domain means_20251003134814.r deleted file mode 100644 index cf25c5e..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003134814.r +++ /dev/null @@ -1,766 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") 
- -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for 
assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each 
TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- 
sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced: every pID × TIME × DOMAIN cell should contain exactly one observation -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance == 1)) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced (missing or duplicated cells)") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - - # Between-subjects effects (no sphericity corrections needed) - cat("\nBETWEEN-SUBJECTS EFFECTS:\n") - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - cat("\nWITHIN-SUBJECTS EFFECTS:\n") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] -
dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - cat("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ 
TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS (SIMPLIFIED) -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -print("Note: Three-way interaction was non-significant (p = 0.511)") -print("Skipping detailed three-way comparisons due to computational intensity") -print("Focus on the significant two-way interactions above.") - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (significant: p < 0.001) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") 
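# Editorial aside (not part of the original snapshot): with four DOMAIN levels there
# are choose(4, 2) = 6 pairwise contrasts, so adjust = "bonferroni" above simply
# reports p_adj = min(1, 6 * p_raw); for example:
bonferroni_demo <- min(1, choose(4, 2) * 0.004)  # a raw p of .004 becomes .024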
-print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. 
TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -# (no three_way_contrasts object was created earlier -- the detailed three-way -# analysis was skipped above -- so compute the TIME contrasts within each -# TEMPORAL_DO × DOMAIN cell here before using them) -three_way_emmeans <- emmeans(aov_model, ~ TIME * TEMPORAL_DO * DOMAIN) -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "DOMAIN"), adjust = "bonferroni") -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -print(three_way_contrasts_df) - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - -print("\n=== ANALYSIS COMPLETE ===") -print("All significant and marginal effects have been analyzed with Cohen's d calculations.") - diff --git a/.history/eohi1/mixed anova - domain means_20251003134857.r b/.history/eohi1/mixed anova - domain means_20251003134857.r deleted file mode 100644 index cf25c5e..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003134857.r +++ /dev/null @@ -1,766 +0,0 @@
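One caveat on the effect sizes in the snapshot above: cohen.d() is applied to the pooled Past and Future observations as if they came from independent groups, although TIME is a within-subjects factor. A paired d is arguably the more appropriate quantity; the sketch below is a minimal illustration under that assumption (time_wide and d_paired_time are illustrative names, not objects from the original scripts):

# Paired Cohen's d for the TIME main effect, aligning Past and Future by participant
library(dplyr)
library(tidyr)
library(effsize)  # cohen.d() accepts paired = TRUE

time_wide <- long_data_clean %>%
  group_by(pID, TIME) %>%
  summarise(m = mean(MEAN_DIFFERENCE), .groups = "drop") %>%
  pivot_wider(names_from = TIME, values_from = m) %>%
  drop_na(Past, Future)  # keep only participants observed in both cells

d_paired_time <- cohen.d(time_wide$Past, time_wide$Future, paired = TRUE)
print(d_paired_time)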
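A second small note, on the bootstrap Hartley helper that recurs in these snapshots: it returns only a 95th-percentile critical value, but the same vector of resampled ratios yields an approximate p-value directly. A one-line sketch, assuming valid_ratios and observed_ratio as computed inside bootstrap_hartley_critical (boot_p is an illustrative name):

# Approximate bootstrap p-value: the share of resampled F-max ratios at least as
# large as the observed ratio; the +1 terms keep the estimate away from exactly 0
boot_p <- (sum(valid_ratios >= observed_ratio) + 1) / (length(valid_ratios) + 1)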
diff --git a/.history/eohi1/mixed anova - domain means_20251003135506.r b/.history/eohi1/mixed anova - domain means_20251003135506.r deleted file mode 100644 index 34a6b8c..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003135506.r +++ /dev/null @@ -1,767 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv")
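# Editorial sketch (not in the original snapshot): the reshaping below assumes one
# wide row per participant, so a cheap guard right after the read is to fail fast
# if pID values repeat.
stopifnot(!anyDuplicated(data$pID))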
- -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for 
assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each 
TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- 
sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - - # Between-subjects effects (no sphericity corrections needed) - cat("\nBETWEEN-SUBJECTS EFFECTS:\n") - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - cat("\nWITHIN-SUBJECTS EFFECTS:\n") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - 
dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - cat("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ 
TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS (SIMPLIFIED) -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -print("Note: Three-way interaction was non-significant (p = 0.511)") -print("Skipping detailed three-way comparisons due to computational intensity") -print("Focus on the significant two-way interactions above.") - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (significant: p < 0.001) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") 
-print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. 
TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -# three_way_contrasts_df <- as.data.frame(three_way_contrasts) -# print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -# print(three_way_contrasts_df) -print("Note: Three-way interaction was non-significant (p = 0.511), so detailed comparisons were not performed.") - -print("\n=== ANALYSIS COMPLETE ===") -print("All significant and marginal effects have been analyzed with Cohen's d calculations.") - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -# (three_way_contrasts_df is commented out above, so guard the extraction rather than erroring) -significant_three_way <- if (exists("three_way_contrasts_df")) three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] else data.frame() - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - diff --git a/.history/eohi1/mixed anova - domain means_20251003135510.r b/.history/eohi1/mixed anova - domain means_20251003135510.r deleted file mode 100644 index dc9d744..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003135510.r +++ /dev/null @@ -1,767 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = 
c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# 
ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") 
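-# A quick self-check of calculate_hartley_ratio() defined above (a sketch on
-# made-up variances, nothing beyond base R assumed): for variances 2, 8, and 4
-# the ratio should be 8/2 = 4.
-stopifnot(isTRUE(all.equal(calculate_hartley_ratio(c(2, 8, 4)), 4)))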
- -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), 
TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - - # Between-subjects effects (no sphericity corrections needed) - cat("\nBETWEEN-SUBJECTS EFFECTS:\n") - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - cat("\nWITHIN-SUBJECTS EFFECTS:\n") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% 
anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - cat("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# 
TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS (SIMPLIFIED) -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -print("Note: Three-way interaction was non-significant (p = 0.511)") -print("Skipping detailed three-way comparisons due to computational intensity") -print("Focus on the significant two-way interactions above.") - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (significant: p < 0.001) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, 
adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. 
TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -# three_way_contrasts_df <- as.data.frame(three_way_contrasts) -# print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -# print(three_way_contrasts_df) -print("Note: Three-way interaction was non-significant (p = 0.511), so detailed comparisons were not performed.") - -print("\n=== ANALYSIS COMPLETE ===") -print("All significant and marginal effects have been analyzed with Cohen's d calculations.") - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -# significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -# significant_three_way is commented out above, so only enter this block when it exists -if(exists("significant_three_way") && nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - diff --git a/.history/eohi1/mixed anova - domain means_20251003135514.r b/.history/eohi1/mixed anova - domain means_20251003135514.r deleted file mode 100644 index 0e59b24..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003135514.r +++ /dev/null @@ -1,767 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = 
c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# 
ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") 
- -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), 
TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - - # Between-subjects effects (no sphericity corrections needed) - cat("\nBETWEEN-SUBJECTS EFFECTS:\n") - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - cat("\nWITHIN-SUBJECTS EFFECTS:\n") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% 
anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - cat("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# 
TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS (SIMPLIFIED) -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -print("Note: Three-way interaction was non-significant (p = 0.511)") -print("Skipping detailed three-way comparisons due to computational intensity") -print("Focus on the significant two-way interactions above.") - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (significant: p < 0.001) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, 
adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. 
TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -# three_way_contrasts_df <- as.data.frame(three_way_contrasts) -# print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -# print(three_way_contrasts_df) -print("Note: Three-way interaction was non-significant (p = 0.511), so detailed comparisons were not performed.") - -print("\n=== ANALYSIS COMPLETE ===") -print("All significant and marginal effects have been analyzed with Cohen's d calculations.") - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -# significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(FALSE) { # Three-way interaction was non-significant, so skip this section - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - diff --git a/.history/eohi1/mixed anova - domain means_20251003140001.r b/.history/eohi1/mixed anova - domain means_20251003140001.r deleted file mode 100644 index 0e59b24..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003140001.r +++ /dev/null @@ -1,767 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper 
interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# 
============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - 
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", 
length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - - # Between-subjects effects (no sphericity corrections needed) - cat("\nBETWEEN-SUBJECTS EFFECTS:\n") - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - cat("\nWITHIN-SUBJECTS 
EFFECTS:\n") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - cat("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION 
EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS (SIMPLIFIED) -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -print("Note: Three-way interaction was non-significant (p = 0.511)") -print("Skipping detailed three-way comparisons due to computational intensity") -print("Focus on the significant two-way interactions above.") - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (significant: p < 0.001) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) 
-print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 
============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D
-# =============================================================================
-
-print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===")
-
-# Pairwise comparisons for the three-way interaction were not extracted:
-# three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-# print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-# print(three_way_contrasts_df)
-print("Note: Three-way interaction was non-significant (p = 0.511), so detailed comparisons were not performed.")
-
-# The disabled block below is kept only as a template for computing Past vs
-# Future Cohen's d within each TEMPORAL_DO × DOMAIN cell, should the
-# three-way interaction reach significance in a future dataset.
-# significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ]
-
-if(FALSE) { # Three-way interaction was non-significant, so skip this section
-  for(i in seq_len(nrow(significant_three_way))) {
-    comparison <- significant_three_way[i, ]
-
-    # Extract the grouping variables
-    temporal_do_level <- as.character(comparison$TEMPORAL_DO)
-    domain_level <- as.character(comparison$DOMAIN)
-
-    # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination
-    past_data <- long_data_clean$MEAN_DIFFERENCE[
-      long_data_clean$TEMPORAL_DO == temporal_do_level &
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$TIME == "Past"
-    ]
-
-    future_data <- long_data_clean$MEAN_DIFFERENCE[
-      long_data_clean$TEMPORAL_DO == temporal_do_level &
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$TIME == "Future"
-    ]
-
-    if(length(past_data) > 0 && length(future_data) > 0) {
-      # Calculate Cohen's d using effsize package
-      cohens_d_result <- cohen.d(past_data, future_data)
-
-      cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level))
-      cat(sprintf("  Past vs Future comparison\n"))
-      cat(sprintf("  n_Past = %d, n_Future = %d\n", length(past_data), length(future_data)))
-      cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-      cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-      cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-      cat(sprintf("  Estimated difference: %.5f\n", comparison$estimate))
-      cat("\n")
-    }
-  }
-} else {
-  cat("Skipped: per-cell Past vs Future comparisons were not computed because the three-way interaction was non-significant.\n")
-}
-
-print("\n=== ANALYSIS COMPLETE ===")
-print("All significant and marginal effects have been analyzed with Cohen's d calculations.")
-
diff --git a/.history/eohi1/mixed anova - domain means_20251003140127.r b/.history/eohi1/mixed anova - domain means_20251003140127.r
deleted file mode 100644
index 0e59b24..0000000
--- a/.history/eohi1/mixed anova - domain means_20251003140127.r
+++ /dev/null
@@ -1,767 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex)       # For aov_ez (cleaner ANOVA output)
-library(nortest)    # For normality tests
-library(emmeans)    # For post-hoc comparisons
-library(purrr)      # For map functions
-library(effsize)    # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper
interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# 
============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - 
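Before the observed ratios are printed, a quick sanity check of the helper defined above may be useful: Hartley's F-max is simply the largest group variance divided by the smallest, so values near 1 indicate homogeneous variances. A minimal sketch in R (the toy_vars values are invented for illustration and are not from the dataset):

# Toy illustration of calculate_hartley_ratio(): variances of 2.0, 0.5 and 1.2
# give max/min = 2.0/0.5 = 4, which would only be flagged if it exceeded the
# bootstrapped 95th-percentile critical value computed further below.
toy_vars <- c(2.0, 0.5, 1.2)
calculate_hartley_ratio(toy_vars)  # returns 4
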
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", 
length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - - # Between-subjects effects (no sphericity corrections needed) - cat("\nBETWEEN-SUBJECTS EFFECTS:\n") - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - cat("\nWITHIN-SUBJECTS 
EFFECTS:\n") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - cat("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION 
EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS (SIMPLIFIED) -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -print("Note: Three-way interaction was non-significant (p = 0.511)") -print("Skipping detailed three-way comparisons due to computational intensity") -print("Focus on the significant two-way interactions above.") - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (significant: p < 0.001) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) 
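One caveat on the TIME effect size just computed, before the DOMAIN contrasts below: cohen.d() pools the Past and Future observations as if they came from independent groups, but TIME is a within-subject factor, so a paired effect size may be more appropriate. A minimal sketch, assuming every participant contributes both Past and Future domain means (past_by_id, fut_by_id and ids are illustrative names, not part of the original script; participants missing either side are dropped):

# Per-participant means across the four domains, separately for Past and Future
past_by_id <- with(long_data_clean[long_data_clean$TIME == "Past", ],
                   tapply(MEAN_DIFFERENCE, pID, mean))
fut_by_id  <- with(long_data_clean[long_data_clean$TIME == "Future", ],
                   tapply(MEAN_DIFFERENCE, pID, mean))

# Keep only participants observed on both sides, then use effsize's paired option
ids <- intersect(names(past_by_id)[!is.na(past_by_id)],
                 names(fut_by_id)[!is.na(fut_by_id)])
cohen.d(past_by_id[ids], fut_by_id[ids], paired = TRUE)
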
-print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 
============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D
-# =============================================================================
-
-print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===")
-
-# Pairwise comparisons for the three-way interaction were not extracted:
-# three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-# print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-# print(three_way_contrasts_df)
-print("Note: Three-way interaction was non-significant (p = 0.511), so detailed comparisons were not performed.")
-
-# The disabled block below is kept only as a template for computing Past vs
-# Future Cohen's d within each TEMPORAL_DO × DOMAIN cell, should the
-# three-way interaction reach significance in a future dataset.
-# significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ]
-
-if(FALSE) { # Three-way interaction was non-significant, so skip this section
-  for(i in seq_len(nrow(significant_three_way))) {
-    comparison <- significant_three_way[i, ]
-
-    # Extract the grouping variables
-    temporal_do_level <- as.character(comparison$TEMPORAL_DO)
-    domain_level <- as.character(comparison$DOMAIN)
-
-    # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination
-    past_data <- long_data_clean$MEAN_DIFFERENCE[
-      long_data_clean$TEMPORAL_DO == temporal_do_level &
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$TIME == "Past"
-    ]
-
-    future_data <- long_data_clean$MEAN_DIFFERENCE[
-      long_data_clean$TEMPORAL_DO == temporal_do_level &
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$TIME == "Future"
-    ]
-
-    if(length(past_data) > 0 && length(future_data) > 0) {
-      # Calculate Cohen's d using effsize package
-      cohens_d_result <- cohen.d(past_data, future_data)
-
-      cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level))
-      cat(sprintf("  Past vs Future comparison\n"))
-      cat(sprintf("  n_Past = %d, n_Future = %d\n", length(past_data), length(future_data)))
-      cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-      cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-      cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-      cat(sprintf("  Estimated difference: %.5f\n", comparison$estimate))
-      cat("\n")
-    }
-  }
-} else {
-  cat("Skipped: per-cell Past vs Future comparisons were not computed because the three-way interaction was non-significant.\n")
-}
-
-print("\n=== ANALYSIS COMPLETE ===")
-print("All significant and marginal effects have been analyzed with Cohen's d calculations.")
-
diff --git a/.history/eohi1/mixed anova - domain means_20251003140135.r b/.history/eohi1/mixed anova - domain means_20251003140135.r
deleted file mode 100644
index 0e59b24..0000000
--- a/.history/eohi1/mixed anova - domain means_20251003140135.r
+++ /dev/null
@@ -1,767 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex)       # For aov_ez (cleaner ANOVA output)
-library(nortest)    # For normality tests
-library(emmeans)    # For post-hoc comparisons
-library(purrr)      # For map functions
-library(effsize)    # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper
interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# 
============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - 
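Still on homogeneity, one detail about the Levene's tests in this assumption-testing block: car::leveneTest() defaults to center = median, which is technically the Brown-Forsythe variant and is more robust to the non-normality the Anderson-Darling tests above may detect. If the classical mean-centred Levene statistic is intended, it must be requested explicitly; a short sketch on the between-subjects factor:

# Default call: median-centred (Brown-Forsythe), robust to skewed cells
leveneTest(MEAN_DIFFERENCE ~ TEMPORAL_DO, data = long_data_clean)

# Classical Levene statistic, mean-centred, for comparison
leveneTest(MEAN_DIFFERENCE ~ TEMPORAL_DO, data = long_data_clean, center = mean)
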
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", 
length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - - # Between-subjects effects (no sphericity corrections needed) - cat("\nBETWEEN-SUBJECTS EFFECTS:\n") - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - cat("\nWITHIN-SUBJECTS 
EFFECTS:\n") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - cat("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION 
EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS (SIMPLIFIED) -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -print("Note: Three-way interaction was non-significant (p = 0.511)") -print("Skipping detailed three-way comparisons due to computational intensity") -print("Focus on the significant two-way interactions above.") - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (significant: p < 0.001) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) 
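The DOMAIN contrasts follow next; first, one unused tool worth pointing out: library(effectsize) is loaded at the top of this script but never called. It can serve as a cross-check of the ges column reported by ezANOVA earlier, applied to the same aov_model fitted for the post-hoc tests. A sketch, assuming aov_model is still in scope (agreement with ezANOVA should be close but is not guaranteed to be exact across packages):

# Recompute effect sizes from the multi-stratum aov fit used by emmeans
eta_squared(aov_model, generalized = TRUE)  # generalized eta squared, as in ezANOVA
eta_squared(aov_model, partial = TRUE)      # partial eta squared, for comparison
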
-print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 
============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D
-# =============================================================================
-
-print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===")
-
-# Get pairwise comparisons for the three-way interaction
-# three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-# print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-# print(three_way_contrasts_df)
-print("Note: Three-way interaction was non-significant (p = 0.511), so detailed comparisons were not performed.")
-
-print("\n=== ANALYSIS COMPLETE ===")
-print("All significant and marginal effects have been analyzed with Cohen's d calculations.")
-
-# Calculate Cohen's d for significant three-way interaction effects
-print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-
-# Extract significant comparisons (p < 0.05)
-# significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ]
-
-if(FALSE) { # Three-way interaction was non-significant, so skip this section
-  for(i in seq_len(nrow(significant_three_way))) {
-    comparison <- significant_three_way[i, ]
-
-    # Extract the grouping variables
-    temporal_do_level <- as.character(comparison$TEMPORAL_DO)
-    domain_level <- as.character(comparison$DOMAIN)
-
-    # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination
-    past_data <- long_data_clean$MEAN_DIFFERENCE[
-      long_data_clean$TEMPORAL_DO == temporal_do_level &
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$TIME == "Past"
-    ]
-
-    future_data <- long_data_clean$MEAN_DIFFERENCE[
-      long_data_clean$TEMPORAL_DO == temporal_do_level &
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$TIME == "Future"
-    ]
-
-    if(length(past_data) > 0 && length(future_data) > 0) {
-      # Calculate Cohen's d using effsize package
-      cohens_d_result <- cohen.d(past_data, future_data)
-
-      cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level))
-      cat(sprintf("  Past vs Future comparison\n"))
-      cat(sprintf("  n_Past = %d, n_Future = %d\n", length(past_data), length(future_data)))
-      cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-      cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-      cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-      cat(sprintf("  Estimated difference: %.5f\n", comparison$estimate))
-      cat("\n")
-    }
-  }
-} else {
-  cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n")
-}
-
diff --git a/.history/eohi1/mixed anova - domain means_20251003140821.r b/.history/eohi1/mixed anova - domain means_20251003140821.r
deleted file mode 100644
index c588532..0000000
--- a/.history/eohi1/mixed anova - domain means_20251003140821.r
+++ /dev/null
@@ -1,865 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex)       # For aov_ez (cleaner ANOVA output)
-library(nortest)    # For normality tests
-library(emmeans)    # For post-hoc comparisons
-library(purrr)      # For map functions
-library(effsize)    # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper 
interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# 
============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - 
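-# The F-max block below hard-codes the two TEMPORAL_DO level names ("01PAST",
-# "02FUT"). A level-agnostic version of the same cell-wise variance ratio
-# (a sketch over the columns already in long_data_clean):
-fmax_check <- long_data_clean %>%
-  group_by(TIME, DOMAIN, TEMPORAL_DO) %>%
-  summarise(v = var(MEAN_DIFFERENCE, na.rm = TRUE), .groups = "drop_last") %>%
-  summarise(f_max_ratio = max(v) / min(v), .groups = "drop")
-print(fmax_check)
-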
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", 
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check data dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$pID))))
-print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (",
-            length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (",
-            length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = ""))
-
-# Check for complete cases
-complete_cases <- sum(complete.cases(long_data_clean))
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean)))
-
-# Check if design is balanced: every participant should contribute exactly one
-# observation per TIME × DOMAIN cell (allowing 0s would only rule out duplicates)
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance == 1)) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN combinations")
-} else {
-  print("Warning: Design is unbalanced (missing or duplicated cells)")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = MEAN_DIFFERENCE,
-                             wid = pID,
-                             between = TEMPORAL_DO,
-                             within = .(TIME, DOMAIN),
-                             type = 3,
-                             detailed = TRUE)
-
-print("ANOVA Results:")
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL # Reset row numbers to be sequential
-print(anova_output)
-
-# Show Mauchly's test for sphericity
-print("\nMauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  cat("\n=== CORRECTED F-TESTS ===\n")
-
-  # Between-subjects effects (no sphericity corrections needed)
-  cat("\nBETWEEN-SUBJECTS EFFECTS:\n")
-  between_effects <- c("TEMPORAL_DO")
-  for(effect in between_effects) {
-    if(effect %in% anova_table$Effect) {
-      f_value <- anova_table$F[anova_table$Effect == effect]
-      dfn <- anova_table$DFn[anova_table$Effect == effect]
-      dfd <- anova_table$DFd[anova_table$Effect == effect]
-      p_value <- anova_table$p[anova_table$Effect == effect]
-
-      cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value))
-    }
-  }
-
-  # Within-subjects effects (sphericity corrections where applicable)
-  cat("\nWITHIN-SUBJECTS 
EFFECTS:\n") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - cat("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION 
# =============================================================================
# INTERACTION EXPLORATIONS
# =============================================================================

# TEMPORAL_DO × TIME Interaction
cat("\n=== TEMPORAL_DO × TIME INTERACTION ===\n")
temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)
cat("Estimated Marginal Means:\n")
print(temporal_time_emmeans)

cat("\nSimple Effects of TIME within each TEMPORAL_DO:\n")
temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
print(temporal_time_simple)

cat("\nSimple Effects of TEMPORAL_DO within each TIME:\n")
temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
print(temporal_time_simple2)

# TIME × DOMAIN Interaction
cat("\n=== TIME × DOMAIN INTERACTION ===\n")
time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN)
cat("Estimated Marginal Means:\n")
print(time_domain_emmeans)

cat("\nSimple Effects of DOMAIN within each TIME:\n")
time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
print(time_domain_simple)

cat("\nSimple Effects of TIME within each DOMAIN:\n")
time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
print(time_domain_simple2)

# TEMPORAL_DO × DOMAIN Interaction
cat("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===\n")
temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN)
cat("Estimated Marginal Means:\n")
print(temporal_domain_emmeans)

cat("\nSimple Effects of DOMAIN within each TEMPORAL_DO:\n")
temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
print(temporal_domain_simple)

cat("\nSimple Effects of TEMPORAL_DO within each DOMAIN:\n")
temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
print(temporal_domain_simple2)

# =============================================================================
# THREE-WAY INTERACTION ANALYSIS (SIMPLIFIED)
# =============================================================================

cat("\n=== THREE-WAY INTERACTION ANALYSIS ===\n")
cat("Note: the three-way interaction was non-significant (p = 0.511).\n")
cat("Detailed three-way comparisons are skipped as computationally intensive;\n")
cat("focus on the significant two-way interactions above.\n")

# =============================================================================
# COHEN'S D FOR MAIN EFFECTS
# =============================================================================

cat("\n=== COHEN'S D FOR MAIN EFFECTS ===\n")

# Main effect of TIME (significant: p < 0.001)
cat("\n=== COHEN'S D FOR TIME MAIN EFFECT ===\n")
time_main_contrast <- pairs(time_emmeans, adjust = "none")
time_main_df <- as.data.frame(time_main_contrast)
cat("TIME main effect contrast:\n")
print(time_main_df)

# Cohen's d for the TIME main effect, computed from the raw data
if(nrow(time_main_df) > 0) {
  cat("\nCohen's d for TIME main effect:\n")
  time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"]
  time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"]

  time_cohens_d <- cohen.d(time_past_data, time_future_data)
  cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data)))
  cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate))
  cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude))
  cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1]))
}
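# A small self-contained sketch of what effsize::cohen.d() returns, on
# simulated vectors; if (FALSE)-guarded so it never runs here. Note that the
# call above treats the two TIME levels as independent samples; a paired d
# (shown commented out) is a possible alternative for a within-subject factor.
if (FALSE) {
  set.seed(2)
  g1 <- rnorm(100, mean = 0.3)
  g2 <- rnorm(100, mean = 0.0)
  d <- cohen.d(g1, g2)  # pooled-SD, unpaired d, as used above
  d$estimate            # signed standardized mean difference
  d$magnitude           # "negligible" / "small" / "medium" / "large"
  # cohen.d(g1, g2, paired = TRUE)  # paired variant (equal-length vectors)
}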
-print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# ============================================================================= -# INTERACTION PLOT -# ============================================================================= - -print("\n=== CREATING INTERACTION PLOT ===") - -# Load ggplot2 for plotting (if not already loaded) -library(ggplot2) - -# Define color palette for DOMAIN (4 levels) -cbp1 <- c("#648FFF", "#DC267F", "#FFB000", "#FE6100", "#785EF0") - -# Define reversed levels for TIME -reversed_levels <- rev(sort(unique(long_data_clean$TIME))) - -# Prepare raw data with reversed TIME levels -iPlot <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, TEMPORAL_DO, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN), - TIME = factor(TIME, levels = reversed_levels), - TEMPORAL_DO = factor(TEMPORAL_DO) - ) - -# Create estimated marginal means for the interaction plot -emm_full <- emmeans(aov_model, ~ DOMAIN | TIME * TEMPORAL_DO) - -# Convert EMMs to data frame and prepare for plotting -emmeans_data2 <- emm_full %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = reversed_levels), - TEMPORAL_DO = factor(TEMPORAL_DO) - ) - -# Create the interaction plot -interaction_plot2 <- ggplot() + - # Raw data: regular circles, color only - geom_point( - data = iPlot, - aes(x = TIME, y = MEAN_DIFFERENCE, color = DOMAIN), - position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.2), - alpha = 0.3, shape = 16 - ) + - geom_rect( - data = emmeans_data2, - aes( - xmin = as.numeric(TIME) - 0.1 + (as.numeric(DOMAIN) - 2.5) * 0.3, - xmax = as.numeric(TIME) + 0.1 + (as.numeric(DOMAIN) - 2.5) * 0.3, - ymin = ci_lower, ymax = ci_upper, - fill = DOMAIN - ), - color = "black", alpha = 0.5 - ) + - geom_segment( - data = emmeans_data2, - aes( - x = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.3, - xend = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.3, - y = ci_lower, yend = ci_upper - ), - color = "black" - ) + - # EMMs: bold points, distinctive by color 
# Build the interaction plot
interaction_plot2 <- ggplot() +
  # Raw data: regular circles, color only
  geom_point(
    data = iPlot,
    aes(x = TIME, y = MEAN_DIFFERENCE, color = DOMAIN),
    position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.2),
    alpha = 0.3, shape = 16
  ) +
  # Confidence-interval boxes around each EMM
  geom_rect(
    data = emmeans_data2,
    aes(
      xmin = as.numeric(TIME) - 0.1 + (as.numeric(DOMAIN) - 2.5) * 0.3,
      xmax = as.numeric(TIME) + 0.1 + (as.numeric(DOMAIN) - 2.5) * 0.3,
      ymin = ci_lower, ymax = ci_upper,
      fill = DOMAIN
    ),
    color = "black", alpha = 0.5
  ) +
  geom_segment(
    data = emmeans_data2,
    aes(
      x = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.3,
      xend = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.3,
      y = ci_lower, yend = ci_upper
    ),
    color = "black"
  ) +
  # EMMs: bold points, distinctive by color and shape
  geom_point(
    data = emmeans_data2,
    aes(
      x = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.3,
      y = plot_mean,
      color = DOMAIN,
      shape = DOMAIN
    ),
    size = 3, stroke = 1, fill = "black"
  ) +
  facet_wrap(~ TEMPORAL_DO, ncol = 2) +
  labs(
    x = "TIME", y = "Mean Difference",
    title = "DOMAIN × TIME Interaction by TEMPORAL_DO", subtitle = ""
  ) +
  scale_color_manual(name = "DOMAIN", values = cbp1) +
  scale_fill_manual(name = "DOMAIN", values = cbp1) +
  scale_shape_manual(name = "DOMAIN", values = c(21, 22, 23, 24)) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5),
    plot.title = element_text(size = 14, hjust = 0.5),
    plot.subtitle = element_text(size = 12, hjust = 0.5)
  )

print(interaction_plot2)

# =============================================================================
# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS
# =============================================================================

cat("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===\n")

# Helper: Cohen's d (from raw data) for every significant pairwise comparison
# in an emmeans contrast table; group2_var names the conditioning factor, if any
calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]

  if(nrow(significant_pairs) > 0) {
    cat("Significant pairwise comparisons (p < 0.05):\n")
    print(significant_pairs)

    cat("\nCohen's d calculated from raw data:\n")

    for(i in seq_len(nrow(significant_pairs))) {
      comparison <- significant_pairs[i, ]
      contrast_name <- as.character(comparison$contrast)

      # Parse the contrast label, e.g. "Past - Future"
      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
      if(length(contrast_parts) == 2) {
        level1 <- trimws(contrast_parts[1])
        level2 <- trimws(contrast_parts[2])

        # Get raw data for both conditions, within the conditioning level if present
        if(group2_var %in% colnames(comparison)) {
          group2_level <- as.character(comparison[[group2_var]])
          data1 <- data[[response_var]][
            data[[group1_var]] == level1 &
            data[[group2_var]] == group2_level]

          data2 <- data[[response_var]][
            data[[group1_var]] == level2 &
            data[[group2_var]] == group2_level]
        } else {
          data1 <- data[[response_var]][data[[group1_var]] == level1]
          data2 <- data[[response_var]][data[[group1_var]] == level2]
        }

        if(length(data1) > 0 && length(data2) > 0) {
          # Cohen's d via the effsize package
          cohens_d_result <- cohen.d(data1, data2)

          cat(sprintf("Comparison: %s", contrast_name))
          if(group2_var %in% colnames(comparison)) {
            cat(sprintf(" | %s", group2_level))
          }
          cat(sprintf("\n  n1 = %d, n2 = %d\n", length(data1), length(data2)))
          cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
          cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
          cat(sprintf("  p-value: %.5f\n", comparison$p.value))
          cat("\n")
        }
      }
    }
  } else {
    cat("No significant pairwise comparisons found.\n")
  }
}
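# A hypothetical miniature input, if (FALSE)-guarded, showing how
# calculate_cohens_d_for_pairs() is meant to be driven: a contrast table with
# a p.value column and an optional conditioning column, plus the raw data.
# All names here (toy_data, toy_pairs) are illustrative only.
if (FALSE) {
  set.seed(3)
  toy_data <- data.frame(
    TIME = rep(c("Past", "Future"), each = 50),
    TEMPORAL_DO = "01PAST",
    MEAN_DIFFERENCE = c(rnorm(50, 0.4), rnorm(50, 0.0))
  )
  toy_pairs <- data.frame(
    contrast = "Past - Future",  # label format that emmeans' pairs() produces
    TEMPORAL_DO = "01PAST",      # conditioning level (matched via group2_var)
    p.value = 0.004
  )
  calculate_cohens_d_for_pairs(toy_pairs, toy_data, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE")
}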
# =============================================================================
# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001)
# =============================================================================

cat("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===\n")

# Simple effects of TIME within each TEMPORAL_DO
temporal_time_simple_df <- as.data.frame(temporal_time_simple)
calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE")

# Simple effects of TEMPORAL_DO within each TIME
temporal_time_simple2_df <- as.data.frame(temporal_time_simple2)
calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE")

# =============================================================================
# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012)
# =============================================================================

cat("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===\n")

# Simple effects of TIME within each DOMAIN
time_domain_simple2_df <- as.data.frame(time_domain_simple2)
calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE")

# Simple effects of DOMAIN within each TIME
time_domain_simple_df <- as.data.frame(time_domain_simple)
calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE")

# =============================================================================
# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058)
# =============================================================================

cat("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===\n")

# Simple effects of TEMPORAL_DO within each DOMAIN
temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2)
calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE")

# Simple effects of DOMAIN within each TEMPORAL_DO
temporal_domain_simple_df <- as.data.frame(temporal_domain_simple)
calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE")
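# A compact, self-contained sketch (if (FALSE)-guarded, toy names) of how the
# by= argument in pairs() partitions an emmeans grid into the two families of
# simple effects used in sections 1-3 above.
if (FALSE) {
  set.seed(4)
  toy <- expand.grid(G = factor(c("a", "b")), TIME = factor(c("Past", "Future")), rep = 1:10)
  toy$y <- rnorm(nrow(toy))
  m <- aov(y ~ G * TIME, data = toy)
  emm <- emmeans(m, ~ G * TIME)
  pairs(emm, by = "G", adjust = "bonferroni")     # TIME contrast within each G
  pairs(emm, by = "TIME", adjust = "bonferroni")  # G contrast within each TIME
}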
# =============================================================================
# 4. THREE-WAY INTERACTION COHEN'S D
# =============================================================================

cat("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===\n")

# Pairwise comparisons for the three-way interaction, kept for reference:
# three_way_contrasts_df <- as.data.frame(three_way_contrasts)
# print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:")
# print(three_way_contrasts_df)
cat("Note: the three-way interaction was non-significant (p = 0.511), so detailed comparisons were not performed.\n")

# Extract significant comparisons (p < 0.05), kept for reference:
# significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ]

if(FALSE) { # Three-way interaction was non-significant, so this block is skipped
  for(i in seq_len(nrow(significant_three_way))) {
    comparison <- significant_three_way[i, ]

    # Extract the grouping variables
    temporal_do_level <- as.character(comparison$TEMPORAL_DO)
    domain_level <- as.character(comparison$DOMAIN)

    # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination
    past_data <- long_data_clean$MEAN_DIFFERENCE[
      long_data_clean$TEMPORAL_DO == temporal_do_level &
      long_data_clean$DOMAIN == domain_level &
      long_data_clean$TIME == "Past"
    ]

    future_data <- long_data_clean$MEAN_DIFFERENCE[
      long_data_clean$TEMPORAL_DO == temporal_do_level &
      long_data_clean$DOMAIN == domain_level &
      long_data_clean$TIME == "Future"
    ]

    if(length(past_data) > 0 && length(future_data) > 0) {
      # Cohen's d via the effsize package
      cohens_d_result <- cohen.d(past_data, future_data)

      cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level))
      cat("  Past vs Future comparison\n")
      cat(sprintf("  n_Past = %d, n_Future = %d\n", length(past_data), length(future_data)))
      cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
      cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
      cat(sprintf("  p-value: %.5f\n", comparison$p.value))
      cat(sprintf("  Estimated difference: %.5f\n", comparison$estimate))
      cat("\n")
    }
  }
} else {
  cat("Three-way follow-up skipped: no simple TIME effects were tested within the TEMPORAL_DO × DOMAIN combinations.\n")
}

cat("\n=== ANALYSIS COMPLETE ===\n")
cat("All significant and marginal effects have been analyzed with Cohen's d calculations.\n")
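# If the figure is needed outside the R session, the plot object built above
# can be written to disk with ggsave(). A minimal sketch; the filename and
# dimensions are hypothetical, not part of the original analysis.
if (FALSE) {
  ggsave("interaction_plot_domain_time.png", plot = interaction_plot2,
         width = 9, height = 5, dpi = 300)
}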
EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS (SIMPLIFIED) -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -print("Note: Three-way interaction was non-significant (p = 0.511)") -print("Skipping detailed three-way comparisons due to computational intensity") -print("Focus on the significant two-way interactions above.") - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (significant: p < 0.001) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) 
-print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# ============================================================================= -# INTERACTION PLOT -# ============================================================================= - -print("\n=== CREATING INTERACTION PLOT ===") - -# Load ggplot2 for plotting (if not already loaded) -library(ggplot2) - -# Define color palette for DOMAIN (4 levels) -cbp1 <- c("#648FFF", "#DC267F", "#FFB000", "#FE6100", "#785EF0") - -# Define reversed levels for TIME -reversed_levels <- rev(sort(unique(long_data_clean$TIME))) - -# Prepare raw data with reversed TIME levels -iPlot <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, TEMPORAL_DO, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN), - TIME = factor(TIME, levels = reversed_levels), - TEMPORAL_DO = factor(TEMPORAL_DO) - ) - -# Create estimated marginal means for the interaction plot -emm_full <- emmeans(aov_model, ~ DOMAIN | TIME * TEMPORAL_DO) - -# Convert EMMs to data frame and prepare for plotting -emmeans_data2 <- emm_full %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = reversed_levels), - TEMPORAL_DO = factor(TEMPORAL_DO) - ) - -# Create the interaction plot -interaction_plot2 <- ggplot() + - # Raw data: regular circles, color only - geom_point( - data = iPlot, - aes(x = TIME, y = MEAN_DIFFERENCE, color = DOMAIN), - position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.2), - alpha = 0.3, shape = 16 - ) + - geom_rect( - data = emmeans_data2, - aes( - xmin = as.numeric(TIME) - 0.1 + (as.numeric(DOMAIN) - 2.5) * 0.3, - xmax = as.numeric(TIME) + 0.1 + (as.numeric(DOMAIN) - 2.5) * 0.3, - ymin = ci_lower, ymax = ci_upper, - fill = DOMAIN - ), - color = "black", alpha = 0.5 - ) + - geom_segment( - data = emmeans_data2, - aes( - x = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.3, - xend = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.3, - y = ci_lower, yend = ci_upper - ), - color = "black" - ) + - # EMMs: bold points, distinctive by color 
and shape - geom_point( - data = emmeans_data2, - aes( - x = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.3, - y = plot_mean, - color = DOMAIN, - shape = DOMAIN - ), - size = 3, stroke = 1, fill = "black" - ) + - facet_wrap(~ TEMPORAL_DO, ncol = 2) + - labs( - x = "TIME", y = "Mean Difference", - title = "DOMAIN × TIME Interaction by TEMPORAL_DO", subtitle = "" - ) + - scale_color_manual(name = "DOMAIN", values = cbp1) + - scale_fill_manual(name = "DOMAIN", values = cbp1) + - scale_shape_manual(name = "DOMAIN", values = c(21, 22, 23, 24)) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 0, hjust = 0.5), - plot.title = element_text(size = 14, hjust = 0.5), - plot.subtitle = element_text(size = 12, hjust = 0.5) - ) - -print(interaction_plot2) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. 
TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -# three_way_contrasts_df <- as.data.frame(three_way_contrasts) -# print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -# print(three_way_contrasts_df) -print("Note: Three-way interaction was non-significant (p = 0.511), so detailed comparisons were not performed.") - -# Cohen's d for TIME effects within each TEMPORAL_DO × DOMAIN combination would -# only be computed if the three-way interaction were significant; the block -# below is retained for reference but skipped via if(FALSE). -# significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(FALSE) { # Three-way interaction was non-significant, so skip this section - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("Skipped: the three-way interaction was non-significant, so no TIME comparisons were run within TEMPORAL_DO × DOMAIN combinations.\n") -} - -print("\n=== ANALYSIS COMPLETE ===") -print("All significant and marginal effects have been analyzed with Cohen's d calculations.") - diff --git a/.history/eohi1/mixed anova - domain means_20251003142537.r b/.history/eohi1/mixed anova - domain means_20251003142537.r deleted file mode 100644 index 55f6424..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003142537.r +++ /dev/null @@ -1,865 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper 
interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# 
============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - 
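Bootstrap critical values are used below because Hartley's classical F-max tables assume normal data and equal group sizes. To illustrate what the null distribution of the two-group F-max ratio looks like, here is a minimal simulation sketch; the group size n = 50 and the iteration count are hypothetical placeholders, not values taken from the data:

# Simulate the F-max ratio for two equal-variance normal groups
set.seed(42)
null_fmax <- replicate(1000, {
  g1 <- rnorm(50)  # simulated group 1
  g2 <- rnorm(50)  # simulated group 2
  max(var(g1), var(g2)) / min(var(g1), var(g2))
})
quantile(null_fmax, 0.95)  # null 95th percentile, analogous to critical_95 below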
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", 
length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - - # Between-subjects effects (no sphericity corrections needed) - cat("\nBETWEEN-SUBJECTS EFFECTS:\n") - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - cat("\nWITHIN-SUBJECTS 
EFFECTS:\n") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - cat("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION 
EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS (SIMPLIFIED) -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -print("Note: Three-way interaction was non-significant (p = 0.511)") -print("Skipping detailed three-way comparisons due to computational intensity") -print("Focus on the significant two-way interactions above.") - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (significant: p < 0.001) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) 
-print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# ============================================================================= -# INTERACTION PLOT -# ============================================================================= - -print("\n=== CREATING INTERACTION PLOT ===") - -# Load ggplot2 for plotting (if not already loaded) -library(ggplot2) - -# Define color palette for DOMAIN (4 levels) -cbp1 <- c("#648FFF", "#DC267F", "#FFB000", "#FE6100", "#785EF0") - -# Define TIME levels (no reversal) -time_levels <- sort(unique(long_data_clean$TIME)) - -# Prepare raw data with standard TIME levels -iPlot <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, TEMPORAL_DO, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN), - TIME = factor(TIME, levels = time_levels), - TEMPORAL_DO = factor(TEMPORAL_DO) - ) - -# Create estimated marginal means for the interaction plot -emm_full <- emmeans(aov_model, ~ DOMAIN | TIME * TEMPORAL_DO) - -# Convert EMMs to data frame and prepare for plotting -emmeans_data2 <- emm_full %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = reversed_levels), - TEMPORAL_DO = factor(TEMPORAL_DO) - ) - -# Create the interaction plot -interaction_plot2 <- ggplot() + - # Raw data: regular circles, color only - geom_point( - data = iPlot, - aes(x = TIME, y = MEAN_DIFFERENCE, color = DOMAIN), - position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.2), - alpha = 0.3, shape = 16 - ) + - geom_rect( - data = emmeans_data2, - aes( - xmin = as.numeric(TIME) - 0.1 + (as.numeric(DOMAIN) - 2.5) * 0.3, - xmax = as.numeric(TIME) + 0.1 + (as.numeric(DOMAIN) - 2.5) * 0.3, - ymin = ci_lower, ymax = ci_upper, - fill = DOMAIN - ), - color = "black", alpha = 0.5 - ) + - geom_segment( - data = emmeans_data2, - aes( - x = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.3, - xend = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.3, - y = ci_lower, yend = ci_upper - ), - color = "black" - ) + - # EMMs: bold points, distinctive by color and shape - 
geom_point( - data = emmeans_data2, - aes( - x = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.3, - y = plot_mean, - color = DOMAIN, - shape = DOMAIN - ), - size = 3, stroke = 1, fill = "black" - ) + - facet_wrap(~ TEMPORAL_DO, ncol = 2) + - labs( - x = "TIME", y = "Mean Difference", - title = "DOMAIN × TIME Interaction by TEMPORAL_DO", subtitle = "" - ) + - scale_color_manual(name = "DOMAIN", values = cbp1) + - scale_fill_manual(name = "DOMAIN", values = cbp1) + - scale_shape_manual(name = "DOMAIN", values = c(21, 22, 23, 24)) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 0, hjust = 0.5), - plot.title = element_text(size = 14, hjust = 0.5), - plot.subtitle = element_text(size = 12, hjust = 0.5) - ) - -print(interaction_plot2) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. 
TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -# three_way_contrasts_df <- as.data.frame(three_way_contrasts) -# print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -# print(three_way_contrasts_df) -print("Note: Three-way interaction was non-significant (p = 0.511), so detailed comparisons were not performed.") - -# Cohen's d for TIME effects within each TEMPORAL_DO × DOMAIN combination would -# only be computed if the three-way interaction were significant; the block -# below is retained for reference but skipped via if(FALSE). -# significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(FALSE) { # Three-way interaction was non-significant, so skip this section - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("Skipped: the three-way interaction was non-significant, so no TIME comparisons were run within TEMPORAL_DO × DOMAIN combinations.\n") -} - -print("\n=== ANALYSIS COMPLETE ===") -print("All significant and marginal effects have been analyzed with Cohen's d calculations.") - diff --git a/.history/eohi1/mixed anova - domain means_20251003142630.r b/.history/eohi1/mixed anova - domain means_20251003142630.r deleted file mode 100644 index 03b5050..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003142630.r +++ /dev/null @@ -1,865 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper 
interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# 
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing
-long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ]
-print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x ")))
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(MEAN_DIFFERENCE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME and DOMAIN:")
-print(missing_summary)
-
-# 2. Outlier detection (1.5 × IQR rule)
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test (streamlined)
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic,
-    ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# Round only the numeric columns
-normality_results_rounded <- normality_results %>%
-  mutate(across(where(is.numeric), ~ round(.x, 5)))
-print(normality_results_rounded)
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: test homogeneity across the BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-# First, check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
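# A self-contained toy sketch (simulated vectors, not the study data) of the
# procedure implemented below: Hartley's F-max is the ratio of the largest to
# the smallest group variance, and resampling each group with replacement
# yields a distribution of F-max ratios whose 95th percentile serves as the
# critical value.
set.seed(42)
toy_g1 <- rnorm(40, sd = 1.0)
toy_g2 <- rnorm(40, sd = 1.3)
toy_fmax <- max(var(toy_g1), var(toy_g2)) / min(var(toy_g1), var(toy_g2))
toy_boot <- replicate(1000, {
  v1 <- var(sample(toy_g1, replace = TRUE))
  v2 <- var(sample(toy_g2, replace = TRUE))
  max(v1, v2) / min(v1, v2)
})
cat(sprintf("Toy F-max = %.3f, bootstrap 95%% critical value = %.3f\n",
            toy_fmax, quantile(toy_boot, 0.95)))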
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", 
-
-# Check for complete cases
-complete_cases <- sum(complete.cases(long_data_clean))
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean)))
-
-# Check the participant × TIME × DOMAIN cell structure
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance == 1)) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN combinations")
-} else if(all(design_balance %in% c(0, 1))) {
-  print("Warning: Design is unbalanced (some participants are missing cells)")
-  print(summary(as.vector(design_balance)))
-} else {
-  print("Warning: Some participants have duplicated observations within a cell")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects:  TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = MEAN_DIFFERENCE,
-                             wid = pID,
-                             between = TEMPORAL_DO,
-                             within = .(TIME, DOMAIN),
-                             type = 3,
-                             detailed = TRUE)
-
-print("ANOVA Results:")
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL # Reset row numbers to be sequential
-print(anova_output)
-
-# Show Mauchly's test for sphericity
-print("\nMauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  cat("\n=== CORRECTED F-TESTS ===\n")
-
-  # Between-subjects effects (no sphericity corrections needed)
-  cat("\nBETWEEN-SUBJECTS EFFECTS:\n")
-  between_effects <- c("TEMPORAL_DO")
-  for(effect in between_effects) {
-    if(effect %in% anova_table$Effect) {
-      f_value <- anova_table$F[anova_table$Effect == effect]
-      dfn <- anova_table$DFn[anova_table$Effect == effect]
-      dfd <- anova_table$DFd[anova_table$Effect == effect]
-      p_value <- anova_table$p[anova_table$Effect == effect]
-
-      cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value))
-    }
-  }
-
-  # Within-subjects effects (sphericity corrections where applicable)
-  cat("\nWITHIN-SUBJECTS EFFECTS:\n")
-
-  # TIME main effect (2 levels, so sphericity holds automatically)
-  if("TIME" %in% anova_table$Effect) {
-    f_value <- anova_table$F[anova_table$Effect == "TIME"]
-    dfn <- anova_table$DFn[anova_table$Effect == "TIME"]
-    dfd <- anova_table$DFd[anova_table$Effect == "TIME"]
-    p_value <- anova_table$p[anova_table$Effect == "TIME"]
-    cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value))
-  }
-
-  # DOMAIN main effect (4 levels; uncorrected here, corrected p-values follow below)
-  if("DOMAIN" %in% anova_table$Effect) {
-    f_value <- anova_table$F[anova_table$Effect == "DOMAIN"]
-    dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"]
-    dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"]
-    p_value <- anova_table$p[anova_table$Effect == "DOMAIN"]
-    cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f (uncorrected)\n", dfn, dfd, f_value, p_value))
-  }
-
-  # All effects that receive sphericity corrections (DOMAIN and its interactions)
-  cat("\nEFFECTS WITH SPHERICITY CORRECTIONS:\n")
-  for(i in seq_len(nrow(corrected_df))) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original:     F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  print("\nNote: Sphericity corrections not needed (sphericity assumption met)")
-}
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Extract generalized eta squared from ezANOVA (already calculated)
-effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")]
-effect_sizes$ges <- round(effect_sizes$ges, 5)
-print("Generalized Eta Squared:")
-print(effect_sizes)
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Create aov model for emmeans (emmeans requires an aov object, not ezANOVA output)
-aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                 data = long_data_clean)
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print("Estimated Marginal Means:")
-print(time_emmeans)
-print("\nPairwise Contrasts:")
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
-print("Estimated Marginal Means:")
-print(domain_emmeans)
-print("\nPairwise Contrasts:")
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
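# A minimal toy sketch (simulated frame `toy`, not the study data) of the
# simple-effects logic used in the next section: pairs(emm, by = "B") re-runs
# the pairwise A contrasts separately within each level of B, and the
# Bonferroni adjustment applies within each of those families.
set.seed(1)
toy <- expand.grid(A = factor(paste0("a", 1:3)), B = factor(c("b1", "b2")), rep = 1:10)
toy$y <- rnorm(nrow(toy)) + ifelse(toy$A == "a1" & toy$B == "b1", 1, 0)
toy_emm <- emmeans(lm(y ~ A * B, data = toy), ~ A * B)
print(pairs(toy_emm, by = "B", adjust = "bonferroni"))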
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TEMPORAL_DO × TIME Interaction
-print("\n=== TEMPORAL_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO:")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each TIME:")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# TIME × DOMAIN Interaction
-print("\n=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TIME:")
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-
-print("\nSimple Effects of TIME within each DOMAIN:")
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(time_domain_simple2)
-
-# TEMPORAL_DO × DOMAIN Interaction
-print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===")
-temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN)
-print("Estimated Marginal Means:")
-print(temporal_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:")
-temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:")
-temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(temporal_domain_simple2)
-
-# =============================================================================
-# THREE-WAY INTERACTION ANALYSIS (SIMPLIFIED)
-# =============================================================================
-
-print("\n=== THREE-WAY INTERACTION ANALYSIS ===")
-print("Note: Three-way interaction was non-significant (p = 0.511)")
-print("Detailed three-way simple-effects comparisons were therefore not performed.")
-print("Focus on the significant two-way interactions above.")
-
-# =============================================================================
-# COHEN'S D FOR MAIN EFFECTS
-# =============================================================================
-
-print("\n=== COHEN'S D FOR MAIN EFFECTS ===")
-
-# Main Effect of TIME (significant: p < 0.001)
-print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===")
-time_main_contrast <- pairs(time_emmeans, adjust = "none")
-time_main_df <- as.data.frame(time_main_contrast)
-print("TIME main effect contrast:")
-print(time_main_df)
-
-# Calculate Cohen's d for the TIME main effect
-# (note: cohen.d() here pools the raw observations and treats the two TIME
-# conditions as independent samples, even though TIME is within-subjects)
-if(nrow(time_main_df) > 0) {
-  cat("\nCohen's d for TIME main effect:\n")
-  time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"]
-  time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"]
-
-  time_cohens_d <- cohen.d(time_past_data, time_future_data)
-  cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data)))
-  cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate))
-  cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude))
-  cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1]))
-}
-
-# Main Effect of DOMAIN (significant: p < 0.001)
-print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# ============================================================================= -# INTERACTION PLOT -# ============================================================================= - -print("\n=== CREATING INTERACTION PLOT ===") - -# Load ggplot2 for plotting (if not already loaded) -library(ggplot2) - -# Define color palette for DOMAIN (4 levels) -cbp1 <- c("#648FFF", "#DC267F", "#FFB000", "#FE6100", "#785EF0") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - -# Prepare raw data with standard TIME levels -iPlot <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, TEMPORAL_DO, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN), - TIME = factor(TIME, levels = time_levels), - TEMPORAL_DO = factor(TEMPORAL_DO) - ) - -# Create estimated marginal means for the interaction plot -emm_full <- emmeans(aov_model, ~ DOMAIN | TIME * TEMPORAL_DO) - -# Convert EMMs to data frame and prepare for plotting -emmeans_data2 <- emm_full %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - TEMPORAL_DO = factor(TEMPORAL_DO) - ) - -# Create the interaction plot -interaction_plot2 <- ggplot() + - # Raw data: regular circles, color only - geom_point( - data = iPlot, - aes(x = TIME, y = MEAN_DIFFERENCE, color = DOMAIN), - position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.2), - alpha = 0.3, shape = 16 - ) + - geom_rect( - data = emmeans_data2, - aes( - xmin = as.numeric(TIME) - 0.15 + (as.numeric(DOMAIN) - 2.5) * 0.25, - xmax = as.numeric(TIME) + 0.15 + (as.numeric(DOMAIN) - 2.5) * 0.25, - ymin = ci_lower, ymax = ci_upper, - fill = DOMAIN - ), - color = "black", alpha = 0.5 - ) + - geom_segment( - data = emmeans_data2, - aes( - x = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.25, - xend = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.25, - y = ci_lower, yend = ci_upper - ), - color = "black" - ) + - # EMMs: bold points, distinctive by color and shape - 
geom_point( - data = emmeans_data2, - aes( - x = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.25, - y = plot_mean, - color = DOMAIN, - shape = DOMAIN - ), - size = 1.5, stroke = 0.5, fill = "black" - ) + - facet_wrap(~ TEMPORAL_DO, ncol = 2) + - labs( - x = "TIME", y = "Mean Difference", - title = "DOMAIN × TIME Interaction by TEMPORAL_DO", subtitle = "" - ) + - scale_color_manual(name = "DOMAIN", values = cbp1) + - scale_fill_manual(name = "DOMAIN", values = cbp1) + - scale_shape_manual(name = "DOMAIN", values = c(21, 22, 23, 24)) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 0, hjust = 0.5), - plot.title = element_text(size = 14, hjust = 0.5), - plot.subtitle = element_text(size = 12, hjust = 0.5) - ) - -print(interaction_plot2) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. 
TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D
-# =============================================================================
-
-print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===")
-
-# The three-way interaction was non-significant (p = 0.511), so no pairwise
-# comparisons were extracted for it and no Cohen's d values are reported.
-# Had it been significant, the approach would mirror the two-way sections
-# above: compare Past vs Future within each TEMPORAL_DO × DOMAIN combination
-# and compute cohen.d() on the raw data for each significant contrast.
-print("Note: Three-way interaction was non-significant (p = 0.511), so detailed comparisons were not performed.")
-
-print("\n=== ANALYSIS COMPLETE ===")
-print("All significant and marginal effects have been analyzed with Cohen's d calculations.")
-
diff --git a/.history/eohi1/mixed anova - domain means_20251003142635.r b/.history/eohi1/mixed anova - domain means_20251003142635.r
deleted file mode 100644
index 03b5050..0000000
--- a/.history/eohi1/mixed anova - domain means_20251003142635.r
+++ /dev/null
@@ -1,865 +0,0 @@
diff --git a/.history/eohi1/mixed anova - domain means_20251003142645.r b/.history/eohi1/mixed anova - domain means_20251003142645.r
deleted file mode 100644
index 03b5050..0000000
--- a/.history/eohi1/mixed anova - domain means_20251003142645.r
+++ /dev/null
@@ -1,865 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-# Load required libraries
-library(tidyverse)
-library(ez)         # For ezANOVA
-library(car)        # For leveneTest
-library(afex)       # For aov_ez (cleaner ANOVA output)
-library(nortest)    # For normality tests
-library(emmeans)    # For post-hoc comparisons
-library(purrr)      # For map functions
-library(effsize)    # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper
interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# 
============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - 
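For reference, a minimal toy sketch of the bootstrap F-max idea applied below (simulated data; all names and values here are illustrative, not study data): bootstrap each group's own observations, recompute the variance ratio each time, and take the 95th percentile as the critical value, mirroring the bootstrap_hartley_critical() logic.

# Toy illustration of the bootstrap F-max critical value (base R only).
set.seed(42)
g1 <- rnorm(50, sd = 1.0)   # hypothetical group 1
g2 <- rnorm(50, sd = 1.4)   # hypothetical group 2

# Observed Hartley's F-max ratio: largest variance over smallest
observed <- max(var(g1), var(g2)) / min(var(g1), var(g2))

# Resample each group independently and recompute the ratio 1000 times
boot_ratios <- replicate(1000, {
  v1 <- var(sample(g1, replace = TRUE))
  v2 <- var(sample(g2, replace = TRUE))
  max(v1, v2) / min(v1, v2)
})

critical_95 <- quantile(boot_ratios, 0.95)  # bootstrap critical value
observed > critical_95  # TRUE would flag heterogeneous variances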
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", 
length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - - # Between-subjects effects (no sphericity corrections needed) - cat("\nBETWEEN-SUBJECTS EFFECTS:\n") - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - cat("\nWITHIN-SUBJECTS 
EFFECTS:\n") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - cat("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION 
EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS (SIMPLIFIED) -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -print("Note: Three-way interaction was non-significant (p = 0.511)") -print("Skipping detailed three-way comparisons due to computational intensity") -print("Focus on the significant two-way interactions above.") - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (significant: p < 0.001) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) 
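Before the DOMAIN contrasts below, a quick sketch of how the script recovers factor levels from emmeans contrast labels: pairwise contrasts are labelled "level1 - level2", which strsplit() and trimws() split back apart (the label string here is hypothetical):

# Toy illustration of the contrast-label parsing used in the loops below.
contrast_label <- "Preferences - Personality"   # hypothetical emmeans label
parts  <- strsplit(contrast_label, " - ")[[1]]  # split on the separator
level1 <- trimws(parts[1])   # "Preferences"
level2 <- trimws(parts[2])   # "Personality"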
-print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# ============================================================================= -# INTERACTION PLOT -# ============================================================================= - -print("\n=== CREATING INTERACTION PLOT ===") - -# Load ggplot2 for plotting (if not already loaded) -library(ggplot2) - -# Define color palette for DOMAIN (4 levels) -cbp1 <- c("#648FFF", "#DC267F", "#FFB000", "#FE6100", "#785EF0") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - -# Prepare raw data with standard TIME levels -iPlot <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, TEMPORAL_DO, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN), - TIME = factor(TIME, levels = time_levels), - TEMPORAL_DO = factor(TEMPORAL_DO) - ) - -# Create estimated marginal means for the interaction plot -emm_full <- emmeans(aov_model, ~ DOMAIN | TIME * TEMPORAL_DO) - -# Convert EMMs to data frame and prepare for plotting -emmeans_data2 <- emm_full %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - TEMPORAL_DO = factor(TEMPORAL_DO) - ) - -# Create the interaction plot -interaction_plot2 <- ggplot() + - # Raw data: regular circles, color only - geom_point( - data = iPlot, - aes(x = TIME, y = MEAN_DIFFERENCE, color = DOMAIN), - position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.2), - alpha = 0.3, shape = 16 - ) + - geom_rect( - data = emmeans_data2, - aes( - xmin = as.numeric(TIME) - 0.15 + (as.numeric(DOMAIN) - 2.5) * 0.25, - xmax = as.numeric(TIME) + 0.15 + (as.numeric(DOMAIN) - 2.5) * 0.25, - ymin = ci_lower, ymax = ci_upper, - fill = DOMAIN - ), - color = "black", alpha = 0.5 - ) + - geom_segment( - data = emmeans_data2, - aes( - x = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.25, - xend = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.25, - y = ci_lower, yend = ci_upper - ), - color = "black" - ) + - # EMMs: bold points, distinctive by color and shape - 
geom_point( - data = emmeans_data2, - aes( - x = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.25, - y = plot_mean, - color = DOMAIN, - shape = DOMAIN - ), - size = 1.5, stroke = 0.5, fill = "black" - ) + - facet_wrap(~ TEMPORAL_DO, ncol = 2) + - labs( - x = "TIME", y = "Mean Difference", - title = "DOMAIN × TIME Interaction by TEMPORAL_DO", subtitle = "" - ) + - scale_color_manual(name = "DOMAIN", values = cbp1) + - scale_fill_manual(name = "DOMAIN", values = cbp1) + - scale_shape_manual(name = "DOMAIN", values = c(21, 22, 23, 24)) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 0, hjust = 0.5), - plot.title = element_text(size = 14, hjust = 0.5), - plot.subtitle = element_text(size = 12, hjust = 0.5) - ) - -print(interaction_plot2) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. 
TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -# three_way_contrasts_df <- as.data.frame(three_way_contrasts) -# print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -# print(three_way_contrasts_df) -print("Note: Three-way interaction was non-significant (p = 0.511), so detailed comparisons were not performed.") - -print("\n=== ANALYSIS COMPLETE ===") -print("All significant and marginal effects have been analyzed with Cohen's d calculations.") - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -# significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(FALSE) { # Three-way interaction was non-significant, so skip this section - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - diff --git a/.history/eohi1/mixed anova - domain means_20251003142752.r b/.history/eohi1/mixed anova - domain means_20251003142752.r deleted file mode 100644 index 36ef388..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003142752.r +++ /dev/null @@ -1,863 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper 
interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# 
============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - 
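As a reference for the Levene's tests run above, a minimal standalone sketch of the car::leveneTest() pattern on a toy data frame (column names and values are illustrative only):

# Toy illustration of Levene's test for homogeneity of variance.
library(car)
set.seed(1)
toy <- data.frame(
  score = c(rnorm(30, sd = 1.0), rnorm(30, sd = 1.5)),  # unequal spreads
  grp   = factor(rep(c("A", "B"), each = 30))
)
lt <- leveneTest(score ~ grp, data = toy)  # median-centered by default
lt$`F value`[1]  # test statistic, as extracted in the summaries above
lt$`Pr(>F)`[1]   # p-value; small values suggest unequal variances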
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", 
length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - - # Between-subjects effects (no sphericity corrections needed) - cat("\nBETWEEN-SUBJECTS EFFECTS:\n") - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - cat("\nWITHIN-SUBJECTS 
EFFECTS:\n") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - cat("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION 
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TEMPORAL_DO × TIME Interaction
-print("\n=== TEMPORAL_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO:")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each TIME:")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# TIME × DOMAIN Interaction
-print("\n=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TIME:")
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-
-print("\nSimple Effects of TIME within each DOMAIN:")
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(time_domain_simple2)
-
-# TEMPORAL_DO × DOMAIN Interaction
-print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===")
-temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN)
-print("Estimated Marginal Means:")
-print(temporal_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:")
-temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_domain_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:")
-temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(temporal_domain_simple2)
-
-# =============================================================================
-# THREE-WAY INTERACTION ANALYSIS (SIMPLIFIED)
-# =============================================================================
-
-print("\n=== THREE-WAY INTERACTION ANALYSIS ===")
-print("Note: Three-way interaction was non-significant (p = 0.511)")
-print("Skipping detailed three-way comparisons due to computational intensity")
-print("Focus on the significant two-way interactions above.")
-
-# =============================================================================
-# COHEN'S D FOR MAIN EFFECTS
-# =============================================================================
-
-print("\n=== COHEN'S D FOR MAIN EFFECTS ===")
-
-# Main Effect of TIME (significant: p < 0.001)
-print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===")
-time_main_contrast <- pairs(time_emmeans, adjust = "none")
-time_main_df <- as.data.frame(time_main_contrast)
-print("TIME main effect contrast:")
-print(time_main_df)
-
-# Calculate Cohen's d for TIME main effect
-if(nrow(time_main_df) > 0) {
-  cat("\nCohen's d for TIME main effect:\n")
-  time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"]
-  time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"]
-
-  time_cohens_d <- cohen.d(time_past_data, time_future_data)
-  cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data)))
-  cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate))
-  cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude))
-  cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1]))
-}
-
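# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch, not part of the original snapshot.
# effsize::cohen.d(x, y) with its default pooled SD reduces to the formula
# below: the mean difference divided by the pooled standard deviation. The
# toy vectors are placeholders, not data from the experiment.
x <- c(0.2, 0.5, 0.1, 0.4)
y <- c(0.6, 0.9, 0.7, 0.8)
sd_pooled <- sqrt(((length(x) - 1) * var(x) + (length(y) - 1) * var(y)) /
                    (length(x) + length(y) - 2))
(mean(x) - mean(y)) / sd_pooled    # same sign convention as cohen.d(x, y)
# ---------------------------------------------------------------------------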
-# Main Effect of DOMAIN (significant: p < 0.001)
-print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===")
-domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni")
-domain_main_df <- as.data.frame(domain_main_contrast)
-print("DOMAIN main effect contrasts:")
-print(domain_main_df)
-
-# Calculate Cohen's d for significant DOMAIN contrasts
-significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ]
-if(nrow(significant_domain) > 0) {
-  cat("\nCohen's d for significant DOMAIN contrasts:\n")
-  for(i in seq_len(nrow(significant_domain))) {
-    contrast_name <- as.character(significant_domain$contrast[i])
-    contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-    if(length(contrast_parts) == 2) {
-      level1 <- trimws(contrast_parts[1])
-      level2 <- trimws(contrast_parts[2])
-
-      data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1]
-      data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2]
-
-      if(length(data1) > 0 && length(data2) > 0) {
-        domain_cohens_d <- cohen.d(data1, data2)
-        cat(sprintf("Comparison: %s\n", contrast_name))
-        cat(sprintf("  n1 = %d, n2 = %d\n", length(data1), length(data2)))
-        cat(sprintf("  Cohen's d: %.5f\n", domain_cohens_d$estimate))
-        cat(sprintf("  Effect size interpretation: %s\n", domain_cohens_d$magnitude))
-        cat(sprintf("  p-value: %.5f\n", significant_domain$p.value[i]))
-        cat("\n")
-      }
-    }
-  }
-}
-
-# =============================================================================
-# INTERACTION PLOT
-# =============================================================================
-
-# Load ggplot2 for plotting (if not already loaded)
-library(ggplot2)
-
-# Define color palette for DOMAIN (4 levels used; the fifth colour is a spare)
-cbp1 <- c("#648FFF", "#DC267F", "#FFB000", "#FE6100", "#785EF0")
-
-# Define TIME levels (Past, Future order)
-time_levels <- c("Past", "Future")
-
-# Prepare raw data with standard TIME levels
-iPlot <- long_data_clean %>%
-  dplyr::select(pID, DOMAIN, TIME, TEMPORAL_DO, MEAN_DIFFERENCE) %>%
-  mutate(
-    DOMAIN = factor(DOMAIN),
-    TIME = factor(TIME, levels = time_levels),
-    TEMPORAL_DO = factor(TEMPORAL_DO)
-  )
-
-# Create estimated marginal means for the interaction plot
-emm_full <- emmeans(aov_model, ~ DOMAIN | TIME * TEMPORAL_DO)
-
-# Convert EMMs to data frame and prepare for plotting
-emmeans_data2 <- emm_full %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")),
-    TIME = factor(TIME, levels = time_levels),
-    TEMPORAL_DO = factor(TEMPORAL_DO)
-  )
-
-# Create the interaction plot
-interaction_plot2 <- ggplot() +
-  # Raw data: regular circles, color only
-  geom_point(
-    data = iPlot,
-    aes(x = TIME, y = MEAN_DIFFERENCE, color = DOMAIN),
-    position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.2),
-    alpha = 0.3, shape = 16
-  ) +
-  geom_rect(
-    data = emmeans_data2,
-    aes(
-      xmin = as.numeric(TIME) - 0.15 + (as.numeric(DOMAIN) - 2.5) * 0.25,
-      xmax = as.numeric(TIME) + 0.15 + (as.numeric(DOMAIN) - 2.5) * 0.25,
-      ymin = ci_lower, ymax = ci_upper,
-      fill = DOMAIN
-    ),
-    color = "black", alpha = 0.5
-  ) +
-  geom_segment(
-    data = emmeans_data2,
-    aes(
-      x = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.25,
-      xend = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.25,
-      y = ci_lower, yend = ci_upper
-    ),
-    color = "black"
-  ) +
-  # EMMs: bold points, distinctive by color and shape
-  geom_point(
-    data = emmeans_data2,
-    aes(
-      x = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.25,
-      y = plot_mean,
-      color = DOMAIN,
-      shape = DOMAIN
-    ),
-    size = 1.5, stroke = 0.5, fill = "black"
-  ) +
-  facet_wrap(~ TEMPORAL_DO, ncol = 2) +
-  labs(
-    x = "TIME", y = "Mean Difference",
-    title = "DOMAIN × TIME Interaction by TEMPORAL_DO", subtitle = ""
-  ) +
-  scale_color_manual(name = "DOMAIN", values = cbp1) +
-  scale_fill_manual(name = "DOMAIN", values = cbp1) +
-  scale_shape_manual(name = "DOMAIN", values = c(21, 22, 23, 24)) +
-  theme_minimal() +
-  theme(
-    axis.text.x = element_text(angle = 0, hjust = 0.5),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    plot.subtitle = element_text(size = 12, hjust = 0.5)
-  )
-
-print(interaction_plot2)
-
-# =============================================================================
-# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS
-# =============================================================================
-
-# Cohen's d calculations (library already loaded)
-
-print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===")
-
-# Function to calculate Cohen's d for pairwise comparisons
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if(nrow(significant_pairs) > 0) {
-    cat("Significant pairwise comparisons (p < 0.05):\n")
-    print(significant_pairs)
-
-    cat("\nCohen's d calculated from raw data:\n")
-
-    for(i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- as.character(comparison$contrast)
-
-      # Parse the contrast
-      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-      if(length(contrast_parts) == 2) {
-        level1 <- trimws(contrast_parts[1])
-        level2 <- trimws(contrast_parts[2])
-
-        # Get raw data for both conditions
-        if(group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][
-            data[[group1_var]] == level1 &
-            data[[group2_var]] == group2_level]
-
-          data2 <- data[[response_var]][
-            data[[group1_var]] == level2 &
-            data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if(length(data1) > 0 && length(data2) > 0) {
-          # Calculate Cohen's d using effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          cat(sprintf("Comparison: %s", contrast_name))
-          if(group2_var %in% colnames(comparison)) {
-            cat(sprintf(" | %s", group2_level))
-          }
-          cat(sprintf("\n  n1 = %d, n2 = %d\n", length(data1), length(data2)))
-          cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-          cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-          cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-          cat("\n")
-        }
-      }
-    }
-  } else {
-    cat("No significant pairwise comparisons found.\n")
-  }
-}
-
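# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch, not part of the original snapshot.
# Minimal usage example for calculate_cohens_d_for_pairs() above, on a
# made-up data frame; it assumes effsize is loaded, as at the top of the
# script. "toy_pairs" mimics as.data.frame(pairs(...)): a contrast label,
# the second grouping column, and a p-value.
set.seed(42)
toy <- data.frame(
  TIME = rep(c("Past", "Future"), each = 20),
  DOMAIN = rep(c("Values", "Life"), times = 20),
  MEAN_DIFFERENCE = rnorm(40)
)
toy_pairs <- data.frame(
  contrast = "Past - Future",
  DOMAIN = "Values",
  p.value = 0.01
)
calculate_cohens_d_for_pairs(toy_pairs, toy, "TIME", "DOMAIN", "MEAN_DIFFERENCE")
# ---------------------------------------------------------------------------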
-# =============================================================================
-# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===")
-
-# Get simple effects of TIME within each TEMPORAL_DO
-temporal_time_simple_df <- as.data.frame(temporal_time_simple)
-calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
-# Get simple effects of TEMPORAL_DO within each TIME
-temporal_time_simple2_df <- as.data.frame(temporal_time_simple2)
-calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===")
-
-# Get simple effects of TIME within each DOMAIN
-time_domain_simple2_df <- as.data.frame(time_domain_simple2)
-calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE")
-
-# Get simple effects of DOMAIN within each TIME
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===")
-
-# Get simple effects of TEMPORAL_DO within each DOMAIN
-temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2)
-calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE")
-
-# Get simple effects of DOMAIN within each TEMPORAL_DO
-temporal_domain_simple_df <- as.data.frame(temporal_domain_simple)
-calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
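# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch, not part of the original snapshot.
# The Bonferroni adjustment requested via adjust = "bonferroni" in the
# calls above multiplies each raw p-value by the number of comparisons in
# the family and caps the result at 1. Toy p-values only.
raw_p <- c(0.004, 0.020, 0.300)
pmin(1, raw_p * length(raw_p))          # manual version
p.adjust(raw_p, method = "bonferroni")  # equivalent base-R helper
# ---------------------------------------------------------------------------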
-# =============================================================================
-# 4. THREE-WAY INTERACTION COHEN'S D
-# =============================================================================
-
-print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===")
-
-# Get pairwise comparisons for the three-way interaction
-# three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-# print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-# print(three_way_contrasts_df)
-print("Note: Three-way interaction was non-significant (p = 0.511), so detailed comparisons were not performed.")
-
-# Calculate Cohen's d for significant three-way interaction effects
-print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:")
-
-# Extract significant comparisons (p < 0.05)
-# significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ]
-
-if(FALSE) { # Three-way interaction was non-significant, so skip this section
-  for(i in seq_len(nrow(significant_three_way))) {
-    comparison <- significant_three_way[i, ]
-
-    # Extract the grouping variables
-    temporal_do_level <- as.character(comparison$TEMPORAL_DO)
-    domain_level <- as.character(comparison$DOMAIN)
-
-    # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination
-    past_data <- long_data_clean$MEAN_DIFFERENCE[
-      long_data_clean$TEMPORAL_DO == temporal_do_level &
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$TIME == "Past"
-    ]
-
-    future_data <- long_data_clean$MEAN_DIFFERENCE[
-      long_data_clean$TEMPORAL_DO == temporal_do_level &
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$TIME == "Future"
-    ]
-
-    if(length(past_data) > 0 && length(future_data) > 0) {
-      # Calculate Cohen's d using effsize package
-      cohens_d_result <- cohen.d(past_data, future_data)
-
-      cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level))
-      cat(sprintf("  Past vs Future comparison\n"))
-      cat(sprintf("  n_Past = %d, n_Future = %d\n", length(past_data), length(future_data)))
-      cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-      cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-      cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-      cat(sprintf("  Estimated difference: %.5f\n", comparison$estimate))
-      cat("\n")
-    }
-  }
-} else {
-  cat("Skipped: three-way comparisons were not computed because the interaction was non-significant.\n")
-}
-
-print("\n=== ANALYSIS COMPLETE ===")
-print("All significant and marginal effects have been analyzed with Cohen's d calculations.")
-
diff --git a/.history/eohi1/mixed anova - domain means_20251003142806.r b/.history/eohi1/mixed anova - domain means_20251003142806.r
deleted file mode 100644
index 492ef88..0000000
--- a/.history/eohi1/mixed anova - domain means_20251003142806.r
+++ /dev/null
diff --git a/.history/eohi1/mixed anova - domain means_20251003142812.r b/.history/eohi1/mixed anova - domain means_20251003142812.r
deleted file mode 100644
index 492ef88..0000000
--- a/.history/eohi1/mixed anova - domain means_20251003142812.r
+++ /dev/null
EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSIS (SIMPLIFIED) -# ============================================================================= - -print("\n=== THREE-WAY INTERACTION ANALYSIS ===") -print("Note: Three-way interaction was non-significant (p = 0.511)") -print("Skipping detailed three-way comparisons due to computational intensity") -print("Focus on the significant two-way interactions above.") - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (significant: p < 0.001) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) 
-print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# ============================================================================= -# INTERACTION PLOT -# ============================================================================= - -# Load ggplot2 for plotting (if not already loaded) -library(ggplot2) - -# Define color palette for DOMAIN (4 levels) -cbp1 <- c("#648FFF", "#DC267F", "#FFB000", "#FE6100", "#785EF0") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - -# Prepare raw data with standard TIME levels -iPlot <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, TEMPORAL_DO, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN), - TIME = factor(TIME, levels = time_levels), - TEMPORAL_DO = factor(TEMPORAL_DO) - ) - -# Create estimated marginal means for the interaction plot -emm_full <- emmeans(aov_model, ~ DOMAIN | TIME * TEMPORAL_DO) - -# Convert EMMs to data frame and prepare for plotting -emmeans_data2 <- emm_full %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - TEMPORAL_DO = factor(TEMPORAL_DO) - ) - -# Create the interaction plot -interaction_plot2 <- ggplot() + - # Raw data: regular circles, color only - geom_point( - data = iPlot, - aes(x = TIME, y = MEAN_DIFFERENCE, color = DOMAIN), - position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.2), - alpha = 0.3, shape = 16 - ) + - geom_rect( - data = emmeans_data2, - aes( - xmin = as.numeric(TIME) - 0.08 + (as.numeric(DOMAIN) - 2.5) * 0.15, - xmax = as.numeric(TIME) + 0.08 + (as.numeric(DOMAIN) - 2.5) * 0.15, - ymin = ci_lower, ymax = ci_upper, - fill = DOMAIN - ), - color = "black", alpha = 0.5 - ) + - geom_segment( - data = emmeans_data2, - aes( - x = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.15, - xend = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.15, - y = ci_lower, yend = ci_upper - ), - color = "black" - ) + - # EMMs: bold points, distinctive by color and shape - geom_point( - data = emmeans_data2, - aes( - x = 
as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.15, - y = plot_mean, - color = DOMAIN, - shape = DOMAIN - ), - size = 2.5, stroke = 0.8, fill = "black" - ) + - facet_wrap(~ TEMPORAL_DO, ncol = 2) + - labs( - x = "TIME", y = "Mean Difference", - title = "DOMAIN × TIME Interaction by TEMPORAL_DO", subtitle = "" - ) + - scale_color_manual(name = "DOMAIN", values = cbp1) + - scale_fill_manual(name = "DOMAIN", values = cbp1) + - scale_shape_manual(name = "DOMAIN", values = c(21, 22, 23, 24)) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 0, hjust = 0.5), - plot.title = element_text(size = 14, hjust = 0.5), - plot.subtitle = element_text(size = 12, hjust = 0.5) - ) - -print(interaction_plot2) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. 
TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# ============================================================================= -# 4. 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -# three_way_contrasts_df <- as.data.frame(three_way_contrasts) -# print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -# print(three_way_contrasts_df) -print("Note: Three-way interaction was non-significant (p = 0.511), so detailed comparisons were not performed.") - -print("\n=== ANALYSIS COMPLETE ===") -print("All significant and marginal effects have been analyzed with Cohen's d calculations.") - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -# significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(FALSE) { # Three-way interaction was non-significant, so skip this section - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - diff --git a/.history/eohi1/mixed anova - domain means_20251003143103.r b/.history/eohi1/mixed anova - domain means_20251003143103.r deleted file mode 100644 index 492ef88..0000000 --- a/.history/eohi1/mixed anova - domain means_20251003143103.r +++ /dev/null @@ -1,863 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper 
interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -# Domain mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# 
THREE-WAY INTERACTION COHEN'S D -# ============================================================================= - -print("\n=== COHEN'S D FOR THREE-WAY INTERACTION ===") - -# Get pairwise comparisons for the three-way interaction -# three_way_contrasts_df <- as.data.frame(three_way_contrasts) -# print("The pairwise comparisons show the TIME effects within each TEMPORAL_DO × DOMAIN combination:") -# print(three_way_contrasts_df) -print("Note: Three-way interaction was non-significant (p = 0.511), so detailed comparisons were not performed.") - -print("\n=== ANALYSIS COMPLETE ===") -print("All significant and marginal effects have been analyzed with Cohen's d calculations.") - -# Calculate Cohen's d for significant three-way interaction effects -print("\nCohen's d calculations for significant TIME effects within each TEMPORAL_DO × DOMAIN combination:") - -# Extract significant comparisons (p < 0.05) -# significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(FALSE) { # Three-way interaction was non-significant, so skip this section - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this TEMPORAL_DO × DOMAIN combination - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("TEMPORAL_DO = %s, DOMAIN = %s:\n", temporal_do_level, domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any TEMPORAL_DO × DOMAIN combination.\n") -} - diff --git a/.history/eohi1/mixed anova - domain means_20251004194541.r b/.history/eohi1/mixed anova - domain means_20251004194541.r deleted file mode 100644 index ec09360..0000000 --- a/.history/eohi1/mixed anova - domain means_20251004194541.r +++ /dev/null @@ -1,578 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - 
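# Defensive sketch: fail fast if the identifier columns used throughout this
# script are missing from the import, rather than erroring deep inside the
# dplyr pipeline below.
stopifnot(all(c("pID", "ResponseId", "TEMPORAL_DO") %in% names(data)))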
required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")

# Define domain mapping
domain_mapping <- data.frame(
  variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
               "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
  time = c(rep("Past", 4), rep("Future", 4)),
  domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
  stringsAsFactors = FALSE
)

long_data <- data %>%
  select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>%
  pivot_longer(
    cols = all_of(required_vars),
    names_to = "variable",
    values_to = "MEAN_DIFFERENCE"
  ) %>%
  left_join(domain_mapping, by = "variable") %>%
  # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping)
  mutate(
    TIME = factor(time, levels = c("Past", "Future")),
    DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")),
    pID = as.factor(pID),
    TEMPORAL_DO = as.factor(TEMPORAL_DO)
  ) %>%
  # Select final columns and remove any rows with missing values
  select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>%
  filter(!is.na(MEAN_DIFFERENCE))

# All downstream code refers to long_data_clean, so define it explicitly;
# missing values were already removed in the pipeline above.
long_data_clean <- long_data

# Overall descriptive statistics by TIME and DOMAIN
desc_stats <- long_data %>%
  group_by(TIME, DOMAIN) %>%
  summarise(
    n = n(),
    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
    median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5),
    q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5),
    q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5),
    min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5),
    max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5),
    .groups = 'drop'
  )

print(desc_stats)

# Descriptive statistics by between-subjects factors
desc_stats_by_temporal <- long_data %>%
  group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
  summarise(
    n = n(),
    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
    .groups = 'drop'
  )

print(desc_stats_by_temporal)

# ASSUMPTION TESTING

# 1. Missing values check (rows with missing MEAN_DIFFERENCE were already
# filtered out above, so pct_missing should be 0 throughout)
missing_summary <- long_data %>%
  group_by(TIME, DOMAIN) %>%
  summarise(
    n_total = n(),
    n_missing = sum(is.na(MEAN_DIFFERENCE)),
    pct_missing = round(100 * n_missing / n_total, 2),
    .groups = 'drop'
  )

print(missing_summary)

# 2. Outlier detection
outlier_summary <- long_data_clean %>%
  group_by(TIME, DOMAIN) %>%
  summarise(
    n = n(),
    mean = mean(MEAN_DIFFERENCE),
    sd = sd(MEAN_DIFFERENCE),
    q1 = quantile(MEAN_DIFFERENCE, 0.25),
    median = median(MEAN_DIFFERENCE),
    q3 = quantile(MEAN_DIFFERENCE, 0.75),
    iqr = q3 - q1,
    lower_bound = q1 - 1.5 * iqr,
    upper_bound = q3 + 1.5 * iqr,
    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
    .groups = 'drop'
  )

print(outlier_summary)

# 3. Anderson-Darling normality test (streamlined)
normality_results <- long_data_clean %>%
  group_by(TIME, DOMAIN) %>%
  summarise(
    n = n(),
    ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic,
    ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value,
    .groups = 'drop'
  )

print(normality_results)

# 4.
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For 
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - 
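-          # cohen.d() from effsize returns a list: $estimate is the
-          # standardized mean difference ((mean1 - mean2) / pooled SD) and
-          # $magnitude is the negligible/small/medium/large label reported below.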
cat(sprintf("Comparison: %s\n", contrast_name))
-        cat(sprintf("  n1 = %d, n2 = %d\n", length(data1), length(data2)))
-        cat(sprintf("  Cohen's d: %.5f\n", domain_cohens_d$estimate))
-        cat(sprintf("  Effect size interpretation: %s\n", domain_cohens_d$magnitude))
-        cat(sprintf("  p-value: %.5f\n", significant_domain$p.value[i]))
-        cat("\n")
-      }
-    }
-  }
-}
-
-# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS
-
-# Function to calculate Cohen's d, from the raw data, for every significant
-# pairwise comparison in an emmeans contrast table
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if(nrow(significant_pairs) > 0) {
-    cat("Significant pairwise comparisons (p < 0.05):\n")
-    print(significant_pairs)
-
-    cat("\nCohen's d calculated from raw data:\n")
-
-    for(i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- as.character(comparison$contrast)
-
-      # Parse the contrast (e.g. "Past - Future") into its two levels
-      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-      if(length(contrast_parts) == 2) {
-        level1 <- trimws(contrast_parts[1])
-        level2 <- trimws(contrast_parts[2])
-
-        # Get raw data for both conditions, within the moderator level if present
-        if(group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][
-            data[[group1_var]] == level1 &
-            data[[group2_var]] == group2_level]
-
-          data2 <- data[[response_var]][
-            data[[group1_var]] == level2 &
-            data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if(length(data1) > 0 && length(data2) > 0) {
-          # Calculate Cohen's d using the effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          cat(sprintf("Comparison: %s", contrast_name))
-          if(group2_var %in% colnames(comparison)) {
-            cat(sprintf(" | %s", group2_level))
-          }
-          cat(sprintf("\n  n1 = %d, n2 = %d\n", length(data1), length(data2)))
-          cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-          cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-          cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-          cat("\n")
-        }
-      }
-    }
-  } else {
-    cat("No significant pairwise comparisons found.\n")
-  }
-}
-
-# 1. TEMPORAL_DO × TIME INTERACTION
-
-# Get simple effects of TIME within each TEMPORAL_DO.
-# (temporal_time_simple is not created anywhere in this script; it is
-# reconstructed here from the aov_model emmeans, following the intent of the
-# comment above - an assumption, not the original code.)
-temporal_time_simple <- pairs(emmeans(aov_model, ~ TIME | TEMPORAL_DO), adjust = "bonferroni")
-temporal_time_simple_df <- as.data.frame(temporal_time_simple)
-calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
-# 2. 
TIME × DOMAIN INTERACTION
-
-# Get simple effects of DOMAIN within each TIME.
-# (time_domain_simple is likewise never created in this script; reconstructed
-# here from the aov_model emmeans - an assumption, not the original code.)
-time_domain_simple <- pairs(emmeans(aov_model, ~ DOMAIN | TIME), adjust = "bonferroni")
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE")
-
-# INTERACTION PLOT
-
-# Colour-blind-safe palette for DOMAIN (only the first four values are used)
-cbp1 <- c("#648FFF", "#DC267F", "#FFB000", "#FE6100", "#785EF0")
-
-# Define TIME levels (Past, Future order)
-time_levels <- c("Past", "Future")
-
-# Create estimated marginal means for DOMAIN × TIME
-emm_full <- emmeans(aov_model, ~ DOMAIN * TIME)
-
-# Prepare the emmeans data frame for plotting
-emmeans_data2 <- emm_full %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")),
-    TIME = factor(TIME, levels = time_levels)
-  )
-
-iPlot <- long_data_clean %>%
-  dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>%
-  mutate(
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")),
-    TIME = factor(TIME, levels = time_levels)
-  )
-
-# Plot without TEMPORAL_DO facet
-interaction_plot2 <- ggplot() +
-  geom_point(
-    data = iPlot,
-    aes(x = TIME, y = MEAN_DIFFERENCE, color = DOMAIN),
-    position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.2),
-    alpha = 0.3, shape = 16
-  ) +
-  geom_rect(
-    data = emmeans_data2,
-    aes(
-      xmin = as.numeric(TIME) - 0.08 + (as.numeric(DOMAIN) - 2.5) * 0.15,
-      xmax = as.numeric(TIME) + 0.08 + (as.numeric(DOMAIN) - 2.5) * 0.15,
-      ymin = ci_lower, ymax = ci_upper,
-      fill = DOMAIN
-    ),
-    color = "black", alpha = 0.5
-  ) +
-  geom_segment(
-    data = emmeans_data2,
-    aes(
-      x = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.15,
-      xend = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.15,
-      y = ci_lower, yend = ci_upper
-    ),
-    color = "black"
-  ) +
-  geom_point(
-    data = emmeans_data2,
-    aes(
-      x = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.15,
-      y = plot_mean,
-      color = DOMAIN,
-      shape = DOMAIN
-    ),
-    size = 2.5, stroke = 0.8, fill = "black"
-  ) +
-  labs(
-    x = "TIME", y = "Mean Difference",
-    title = "DOMAIN × TIME Interaction", subtitle = ""
-  ) +
-  scale_color_manual(name = "DOMAIN", values = cbp1) +
-  scale_fill_manual(name = "DOMAIN", values = cbp1) +
-  scale_shape_manual(name = "DOMAIN", values = c(21, 22, 23, 24)) +
-  theme_minimal() +
-  theme(
-    axis.text.x = element_text(angle = 0, hjust = 0.5),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    plot.subtitle = element_text(size = 12, hjust = 0.5)
-  )
-
-print(interaction_plot2)
diff --git a/.history/eohi1/mixed anova - domain means_20251006125951.r b/.history/eohi1/mixed anova - domain means_20251006125951.r
deleted file mode 100644
index ec09360..0000000
--- a/.history/eohi1/mixed anova - domain means_20251006125951.r
+++ /dev/null
@@ -1,578 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-library(tidyverse)
-library(ez)
-library(car)
-library(nortest)    # For normality tests
-library(emmeans)    # For post-hoc comparisons
-library(purrr)      # For map functions
-library(effsize)    # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-library(ggplot2)    # For plotting
-
-options(scipen = 999)
-
-options(contrasts = 
c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. 
Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- 
quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = 
%.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- 
trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOT - -# Define color palette for DOMAIN (4 levels) -cbp1 <- c("#648FFF", "#DC267F", "#FFB000", "#FE6100", "#785EF0") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") -# Create estimated marginal means for DOMAIN x TIME -emm_full <- emmeans(aov_model, ~ DOMAIN * TIME) - -# Prepare emmeans data frame as before -emmeans_data2 <- emm_full %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -iPlot <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Plot without TEMPORAL_DO facet -interaction_plot2 <- ggplot() + - geom_point( - data = iPlot, - aes(x = TIME, y = MEAN_DIFFERENCE, color = DOMAIN), - position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.2), - alpha = 0.3, shape = 16 - ) + - geom_rect( - data = emmeans_data2, - aes( - xmin = as.numeric(TIME) - 0.08 + (as.numeric(DOMAIN) - 2.5) * 0.15, - xmax = as.numeric(TIME) + 0.08 + (as.numeric(DOMAIN) - 2.5) * 0.15, - ymin = ci_lower, ymax = ci_upper, - fill = DOMAIN - ), - color = "black", alpha = 0.5 - ) + - geom_segment( - data = emmeans_data2, - aes( - x = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.15, - xend = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.15, - y = ci_lower, yend = ci_upper - ), - color = "black" - ) + - geom_point( - data = emmeans_data2, - aes( - x = as.numeric(TIME) + (as.numeric(DOMAIN) - 2.5) * 0.15, - y = plot_mean, - color = DOMAIN, - shape = DOMAIN - ), - size = 2.5, stroke = 0.8, fill = "black" - ) + - labs( - x = "TIME", y = "Mean Difference", - title = "DOMAIN × TIME Interaction", subtitle = "" - ) + - scale_color_manual(name = "DOMAIN", values = cbp1) + - scale_fill_manual(name = "DOMAIN", values = cbp1) + - scale_shape_manual(name = "DOMAIN", values = c(21, 22, 23, 24)) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 0, hjust = 0.5), - plot.title = element_text(size = 14, hjust = 0.5), - plot.subtitle = element_text(size = 12, hjust = 0.5) - ) - -print(interaction_plot2) diff --git a/.history/eohi1/mixed anova - domain means_20251006131233.r b/.history/eohi1/mixed anova - domain means_20251006131233.r deleted file mode 100644 index 0232633..0000000 --- a/.history/eohi1/mixed anova - domain means_20251006131233.r +++ /dev/null @@ -1,666 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = 
c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. 
Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- 
quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = 
%.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- 
trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - -# ============================================================ -# PLOT 1: TEMPORAL_DO × TIME INTERACTION -# ============================================================ - -# Create estimated marginal means for TEMPORAL_DO × TIME -emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -# Prepare emmeans data frame -emmeans_temporal_time <- emm_temporal_time %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting -iPlot_temporal <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Create TEMPORAL_DO × TIME interaction plot -interaction_plot_temporal <- ggplot() + - geom_point( - data = iPlot_temporal, - aes(x = TEMPORAL_DO, y = MEAN_DIFFERENCE, color = TIME), - position = position_jitterdodge(dodge.width = 0.5, jitter.width = 0.15), - alpha = 0.3, shape = 16 - ) + - geom_rect( - data = emmeans_temporal_time, - aes( - xmin = as.numeric(TEMPORAL_DO) - 0.15 + (as.numeric(TIME) - 1.5) * 0.25, - xmax = as.numeric(TEMPORAL_DO) + 0.15 + (as.numeric(TIME) - 1.5) * 0.25, - ymin = ci_lower, ymax = ci_upper, - fill = TIME - ), - color = "black", alpha = 0.5 - ) + - geom_segment( - data = emmeans_temporal_time, - aes( - x = as.numeric(TEMPORAL_DO) + (as.numeric(TIME) - 1.5) * 0.25, - xend = as.numeric(TEMPORAL_DO) + (as.numeric(TIME) - 1.5) * 0.25, - y = ci_lower, yend = ci_upper - ), - color = "black" - ) + - geom_point( - data = emmeans_temporal_time, - aes( - x = as.numeric(TEMPORAL_DO) + (as.numeric(TIME) - 1.5) * 0.25, - y = plot_mean, - color = TIME, - shape = TIME - ), - size = 2.5, stroke = 0.8, fill = "black" - ) + - labs( - x = "TEMPORAL_DO", y = "Mean Difference", - title = "TEMPORAL_DO × TIME Interaction" - ) + - scale_color_manual(name = "TIME", values = time_colors) + - scale_fill_manual(name = "TIME", values = time_colors) + - scale_shape_manual(name = "TIME", values = c(21, 22)) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 0, hjust = 0.5), - plot.title = element_text(size = 14, hjust = 0.5) - ) - -print(interaction_plot_temporal) - -# ============================================================ -# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis) -# ============================================================ - -# Create estimated marginal means for TIME × DOMAIN -emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_time_domain <- emm_time_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - 
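-  # Relevel so the plot keeps the same DOMAIN and TIME ordering used
-  # throughout the script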
mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting -iPlot_domain <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Create TIME × DOMAIN interaction plot -interaction_plot_domain <- ggplot() + - geom_point( - data = iPlot_domain, - aes(x = DOMAIN, y = MEAN_DIFFERENCE, color = TIME), - position = position_jitterdodge(dodge.width = 0.5, jitter.width = 0.15), - alpha = 0.3, shape = 16 - ) + - geom_rect( - data = emmeans_time_domain, - aes( - xmin = as.numeric(DOMAIN) - 0.15 + (as.numeric(TIME) - 1.5) * 0.25, - xmax = as.numeric(DOMAIN) + 0.15 + (as.numeric(TIME) - 1.5) * 0.25, - ymin = ci_lower, ymax = ci_upper, - fill = TIME - ), - color = "black", alpha = 0.5 - ) + - geom_segment( - data = emmeans_time_domain, - aes( - x = as.numeric(DOMAIN) + (as.numeric(TIME) - 1.5) * 0.25, - xend = as.numeric(DOMAIN) + (as.numeric(TIME) - 1.5) * 0.25, - y = ci_lower, yend = ci_upper - ), - color = "black" - ) + - geom_point( - data = emmeans_time_domain, - aes( - x = as.numeric(DOMAIN) + (as.numeric(TIME) - 1.5) * 0.25, - y = plot_mean, - color = TIME, - shape = TIME - ), - size = 2.5, stroke = 0.8, fill = "black" - ) + - labs( - x = "DOMAIN", y = "Mean Difference", - title = "TIME × DOMAIN Interaction" - ) + - scale_color_manual(name = "TIME", values = time_colors) + - scale_fill_manual(name = "TIME", values = time_colors) + - scale_shape_manual(name = "TIME", values = c(21, 22)) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 45, hjust = 1), - plot.title = element_text(size = 14, hjust = 0.5) - ) - -print(interaction_plot_domain) diff --git a/.history/eohi1/mixed anova - domain means_20251006131245.r b/.history/eohi1/mixed anova - domain means_20251006131245.r deleted file mode 100644 index 0232633..0000000 --- a/.history/eohi1/mixed anova - domain means_20251006131245.r +++ /dev/null @@ -1,666 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - 
names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
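# NOTE (editorial): the outlier and normality blocks above reference
# `long_data_clean`, which this snapshot never defines; the later snapshot
# (mixed anova - domain means_20251006142637.r) adds the missing line right
# after the missing-values check:
long_data_clean <- long_data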
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For 
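# NOTE (editorial, on bootstrap_hartley_critical() above): when `data` arrives
# as a tibble (as it does from pick() below), data[rows, response_var] keeps a
# one-column tibble rather than a numeric vector, which can trip up sample()
# and var() inside the bootstrap loop. A safer extraction (sketch):
#   group_data_list <- map(groups, ~ {
#     v <- data[[response_var]][data[[group_var]] == .x]
#     v[!is.na(v)]
#   })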
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - 
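# NOTE (editorial): DOMAIN is a within-subjects factor, so data1 and data2 are
# paired by participant, while cohen.d() as called here treats them as
# independent samples. effsize::cohen.d() accepts paired = TRUE if a paired d
# is preferred; the vectors would first need aligning by pID. Left as written.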
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
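# NOTE (editorial): `time_domain_simple`, used just below, is never created in
# this snapshot either; a minimal sketch of the presumed intent (assumption):
time_domain_simple <- pairs(emmeans(aov_model, ~ DOMAIN | TIME),
                            adjust = "bonferroni")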
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - -# ============================================================ -# PLOT 1: TEMPORAL_DO × TIME INTERACTION -# ============================================================ - -# Create estimated marginal means for TEMPORAL_DO × TIME -emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -# Prepare emmeans data frame -emmeans_temporal_time <- emm_temporal_time %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting -iPlot_temporal <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Create TEMPORAL_DO × TIME interaction plot -interaction_plot_temporal <- ggplot() + - geom_point( - data = iPlot_temporal, - aes(x = TEMPORAL_DO, y = MEAN_DIFFERENCE, color = TIME), - position = position_jitterdodge(dodge.width = 0.5, jitter.width = 0.15), - alpha = 0.3, shape = 16 - ) + - geom_rect( - data = emmeans_temporal_time, - aes( - xmin = as.numeric(TEMPORAL_DO) - 0.15 + (as.numeric(TIME) - 1.5) * 0.25, - xmax = as.numeric(TEMPORAL_DO) + 0.15 + (as.numeric(TIME) - 1.5) * 0.25, - ymin = ci_lower, ymax = ci_upper, - fill = TIME - ), - color = "black", alpha = 0.5 - ) + - geom_segment( - data = emmeans_temporal_time, - aes( - x = as.numeric(TEMPORAL_DO) + (as.numeric(TIME) - 1.5) * 0.25, - xend = as.numeric(TEMPORAL_DO) + (as.numeric(TIME) - 1.5) * 0.25, - y = ci_lower, yend = ci_upper - ), - color = "black" - ) + - geom_point( - data = emmeans_temporal_time, - aes( - x = as.numeric(TEMPORAL_DO) + (as.numeric(TIME) - 1.5) * 0.25, - y = plot_mean, - color = TIME, - shape = TIME - ), - size = 2.5, stroke = 0.8, fill = "black" - ) + - labs( - x = "TEMPORAL_DO", y = "Mean Difference", - title = "TEMPORAL_DO × TIME Interaction" - ) + - scale_color_manual(name = "TIME", values = time_colors) + - scale_fill_manual(name = "TIME", values = time_colors) + - scale_shape_manual(name = "TIME", values = c(21, 22)) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 0, hjust = 0.5), - plot.title = element_text(size = 14, hjust = 0.5) - ) - -print(interaction_plot_temporal) - -# ============================================================ -# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis) -# ============================================================ - -# Create estimated marginal means for TIME × DOMAIN -emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_time_domain <- emm_time_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - 
mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting -iPlot_domain <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Create TIME × DOMAIN interaction plot -interaction_plot_domain <- ggplot() + - geom_point( - data = iPlot_domain, - aes(x = DOMAIN, y = MEAN_DIFFERENCE, color = TIME), - position = position_jitterdodge(dodge.width = 0.5, jitter.width = 0.15), - alpha = 0.3, shape = 16 - ) + - geom_rect( - data = emmeans_time_domain, - aes( - xmin = as.numeric(DOMAIN) - 0.15 + (as.numeric(TIME) - 1.5) * 0.25, - xmax = as.numeric(DOMAIN) + 0.15 + (as.numeric(TIME) - 1.5) * 0.25, - ymin = ci_lower, ymax = ci_upper, - fill = TIME - ), - color = "black", alpha = 0.5 - ) + - geom_segment( - data = emmeans_time_domain, - aes( - x = as.numeric(DOMAIN) + (as.numeric(TIME) - 1.5) * 0.25, - xend = as.numeric(DOMAIN) + (as.numeric(TIME) - 1.5) * 0.25, - y = ci_lower, yend = ci_upper - ), - color = "black" - ) + - geom_point( - data = emmeans_time_domain, - aes( - x = as.numeric(DOMAIN) + (as.numeric(TIME) - 1.5) * 0.25, - y = plot_mean, - color = TIME, - shape = TIME - ), - size = 2.5, stroke = 0.8, fill = "black" - ) + - labs( - x = "DOMAIN", y = "Mean Difference", - title = "TIME × DOMAIN Interaction" - ) + - scale_color_manual(name = "TIME", values = time_colors) + - scale_fill_manual(name = "TIME", values = time_colors) + - scale_shape_manual(name = "TIME", values = c(21, 22)) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 45, hjust = 1), - plot.title = element_text(size = 14, hjust = 0.5) - ) - -print(interaction_plot_domain) diff --git a/.history/eohi1/mixed anova - domain means_20251006142529.r b/.history/eohi1/mixed anova - domain means_20251006142529.r deleted file mode 100644 index 0232633..0000000 --- a/.history/eohi1/mixed anova - domain means_20251006142529.r +++ /dev/null @@ -1,666 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - 
names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
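# NOTE (editorial): car::leveneTest() defaults to center = median (the
# Brown-Forsythe variant), so the homogeneity tests below are median-centered;
# pass center = mean if the classical Levene test is intended. Calls left as
# written.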
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For 
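# NOTE (editorial, on bootstrap_hartley_critical() above): resampling each
# observed group as-is gives a reference distribution centered near the
# observed F-max ratio itself, not a null distribution under equal variances,
# so observed_ratio > critical_95 will rarely flag heterogeneity. A null-style
# variant would resample from pooled, group-centered values instead (not
# implemented here).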
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - 
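# NOTE (editorial): the $magnitude label printed below uses effsize's default
# Cohen (1988) cutoffs: |d| < 0.2 negligible, < 0.5 small, < 0.8 medium,
# otherwise large.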
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - -# ============================================================ -# PLOT 1: TEMPORAL_DO × TIME INTERACTION -# ============================================================ - -# Create estimated marginal means for TEMPORAL_DO × TIME -emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -# Prepare emmeans data frame -emmeans_temporal_time <- emm_temporal_time %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting -iPlot_temporal <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Create TEMPORAL_DO × TIME interaction plot -interaction_plot_temporal <- ggplot() + - geom_point( - data = iPlot_temporal, - aes(x = TEMPORAL_DO, y = MEAN_DIFFERENCE, color = TIME), - position = position_jitterdodge(dodge.width = 0.5, jitter.width = 0.15), - alpha = 0.3, shape = 16 - ) + - geom_rect( - data = emmeans_temporal_time, - aes( - xmin = as.numeric(TEMPORAL_DO) - 0.15 + (as.numeric(TIME) - 1.5) * 0.25, - xmax = as.numeric(TEMPORAL_DO) + 0.15 + (as.numeric(TIME) - 1.5) * 0.25, - ymin = ci_lower, ymax = ci_upper, - fill = TIME - ), - color = "black", alpha = 0.5 - ) + - geom_segment( - data = emmeans_temporal_time, - aes( - x = as.numeric(TEMPORAL_DO) + (as.numeric(TIME) - 1.5) * 0.25, - xend = as.numeric(TEMPORAL_DO) + (as.numeric(TIME) - 1.5) * 0.25, - y = ci_lower, yend = ci_upper - ), - color = "black" - ) + - geom_point( - data = emmeans_temporal_time, - aes( - x = as.numeric(TEMPORAL_DO) + (as.numeric(TIME) - 1.5) * 0.25, - y = plot_mean, - color = TIME, - shape = TIME - ), - size = 2.5, stroke = 0.8, fill = "black" - ) + - labs( - x = "TEMPORAL_DO", y = "Mean Difference", - title = "TEMPORAL_DO × TIME Interaction" - ) + - scale_color_manual(name = "TIME", values = time_colors) + - scale_fill_manual(name = "TIME", values = time_colors) + - scale_shape_manual(name = "TIME", values = c(21, 22)) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 0, hjust = 0.5), - plot.title = element_text(size = 14, hjust = 0.5) - ) - -print(interaction_plot_temporal) - -# ============================================================ -# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis) -# ============================================================ - -# Create estimated marginal means for TIME × DOMAIN -emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_time_domain <- emm_time_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - 
mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting -iPlot_domain <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Create TIME × DOMAIN interaction plot -interaction_plot_domain <- ggplot() + - geom_point( - data = iPlot_domain, - aes(x = DOMAIN, y = MEAN_DIFFERENCE, color = TIME), - position = position_jitterdodge(dodge.width = 0.5, jitter.width = 0.15), - alpha = 0.3, shape = 16 - ) + - geom_rect( - data = emmeans_time_domain, - aes( - xmin = as.numeric(DOMAIN) - 0.15 + (as.numeric(TIME) - 1.5) * 0.25, - xmax = as.numeric(DOMAIN) + 0.15 + (as.numeric(TIME) - 1.5) * 0.25, - ymin = ci_lower, ymax = ci_upper, - fill = TIME - ), - color = "black", alpha = 0.5 - ) + - geom_segment( - data = emmeans_time_domain, - aes( - x = as.numeric(DOMAIN) + (as.numeric(TIME) - 1.5) * 0.25, - xend = as.numeric(DOMAIN) + (as.numeric(TIME) - 1.5) * 0.25, - y = ci_lower, yend = ci_upper - ), - color = "black" - ) + - geom_point( - data = emmeans_time_domain, - aes( - x = as.numeric(DOMAIN) + (as.numeric(TIME) - 1.5) * 0.25, - y = plot_mean, - color = TIME, - shape = TIME - ), - size = 2.5, stroke = 0.8, fill = "black" - ) + - labs( - x = "DOMAIN", y = "Mean Difference", - title = "TIME × DOMAIN Interaction" - ) + - scale_color_manual(name = "TIME", values = time_colors) + - scale_fill_manual(name = "TIME", values = time_colors) + - scale_shape_manual(name = "TIME", values = c(21, 22)) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 45, hjust = 1), - plot.title = element_text(size = 14, hjust = 0.5) - ) - -print(interaction_plot_domain) diff --git a/.history/eohi1/mixed anova - domain means_20251006142637.r b/.history/eohi1/mixed anova - domain means_20251006142637.r deleted file mode 100644 index e840311..0000000 --- a/.history/eohi1/mixed anova - domain means_20251006142637.r +++ /dev/null @@ -1,669 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - 
names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
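# NOTE (editorial): nortest::ad.test() requires n >= 8 and errors on smaller
# samples, so the normality checks above assume every TIME × DOMAIN cell has
# at least 8 observations.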
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For 
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
  print(mixed_anova_model$`Sphericity Corrections`)

  # Extract and display corrected degrees of freedom
  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
  anova_table <- mixed_anova_model$ANOVA

  corrected_df <- data.frame(
    Effect = sphericity_corr$Effect,
    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
    GG_epsilon = sphericity_corr$GGe,
    HF_epsilon = sphericity_corr$HFe
  )

  print(corrected_df)

  # Between-subjects effects (no sphericity corrections needed)
  between_effects <- c("TEMPORAL_DO")
  for(effect in between_effects) {
    if(effect %in% anova_table$Effect) {
      f_value <- anova_table$F[anova_table$Effect == effect]
      dfn <- anova_table$DFn[anova_table$Effect == effect]
      dfd <- anova_table$DFd[anova_table$Effect == effect]
      p_value <- anova_table$p[anova_table$Effect == effect]

      cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value))
    }
  }

  # Within-subjects effects (sphericity corrections where applicable)

  # TIME main effect (2 levels, sphericity automatically satisfied)
  if("TIME" %in% anova_table$Effect) {
    f_value <- anova_table$F[anova_table$Effect == "TIME"]
    dfn <- anova_table$DFn[anova_table$Effect == "TIME"]
    dfd <- anova_table$DFd[anova_table$Effect == "TIME"]
    p_value <- anova_table$p[anova_table$Effect == "TIME"]
    cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value))
  }

  # DOMAIN main effect (4 levels, needs sphericity correction; uncorrected
  # values shown here, corrected values in the loop below)
  if("DOMAIN" %in% anova_table$Effect) {
    f_value <- anova_table$F[anova_table$Effect == "DOMAIN"]
    dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"]
    dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"]
    p_value <- anova_table$p[anova_table$Effect == "DOMAIN"]
    cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f (uncorrected)\n", dfn, dfd, f_value, p_value))
  }

  # Corrected results for every effect Mauchly's test applies to
  # (DOMAIN and the interactions involving it)
  for(i in seq_len(nrow(corrected_df))) {
    effect <- corrected_df$Effect[i]
    f_value <- anova_table$F[match(effect, anova_table$Effect)]

    cat(sprintf("\n%s:\n", effect))
    cat(sprintf("  Original:     F(%d, %d) = %.3f\n",
                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
  }
} else {
  # cat() so the leading newline is rendered rather than printed literally
  cat("\nNote: Sphericity corrections not needed (sphericity assumption met)\n")
}

# COHEN'S D FOR MAIN EFFECTS

# Create aov model for emmeans
aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
                 data = long_data_clean)

# Main Effect of TIME
time_emmeans <- emmeans(aov_model, ~ TIME)
print(time_emmeans)

time_main_contrast <- pairs(time_emmeans, adjust = "bonferroni")

time_main_df <- as.data.frame(time_main_contrast)
print(time_main_df)

# Calculate Cohen's d for TIME main effect
if(nrow(time_main_df) > 0) {
  cat("\nCohen's d for TIME main effect:\n")
  time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"]
  time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"]

  time_cohens_d <- cohen.d(time_past_data, time_future_data)
  cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data)))
  cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate))
  cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude))
  cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1]))
}
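# Added note, not in the original script: TIME is a within-subjects factor, so
# the cohen.d() call above treats the paired Past/Future scores as two
# independent samples. A paired alternative first averages over DOMAIN within
# each participant; the sketch below assumes every pID has both TIME levels.
time_wide <- long_data_clean %>%
  group_by(pID, TIME) %>%
  summarise(m = mean(MEAN_DIFFERENCE), .groups = "drop") %>%
  pivot_wider(names_from = TIME, values_from = m)
time_cohens_d_paired <- cohen.d(time_wide$Past, time_wide$Future, paired = TRUE)
print(time_cohens_d_paired)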
# Main Effect of DOMAIN (significant: p < 0.001)
domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
print(domain_emmeans)
domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni")
domain_main_df <- as.data.frame(domain_main_contrast)
print(domain_main_df)

# Calculate Cohen's d for significant DOMAIN contrasts
significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ]
if(nrow(significant_domain) > 0) {
  cat("\nCohen's d for significant DOMAIN contrasts:\n")
  for(i in seq_len(nrow(significant_domain))) {
    contrast_name <- as.character(significant_domain$contrast[i])
    contrast_parts <- strsplit(contrast_name, " - ")[[1]]
    if(length(contrast_parts) == 2) {
      level1 <- trimws(contrast_parts[1])
      level2 <- trimws(contrast_parts[2])

      data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1]
      data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2]

      if(length(data1) > 0 && length(data2) > 0) {
        domain_cohens_d <- cohen.d(data1, data2)
        cat(sprintf("Comparison: %s\n", contrast_name))
        cat(sprintf("  n1 = %d, n2 = %d\n", length(data1), length(data2)))
        cat(sprintf("  Cohen's d: %.5f\n", domain_cohens_d$estimate))
        cat(sprintf("  Effect size interpretation: %s\n", domain_cohens_d$magnitude))
        cat(sprintf("  p-value: %.5f\n", significant_domain$p.value[i]))
        cat("\n")
      }
    }
  }
}

# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS

# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO
temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO)
temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni")
print(temporal_time_simple)

# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME
time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME)
time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni")
print(time_domain_simple)

# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS

# Function to calculate Cohen's d for pairwise comparisons
calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]

  if(nrow(significant_pairs) > 0) {
    cat("Significant pairwise comparisons (p < 0.05):\n")
    print(significant_pairs)

    cat("\nCohen's d calculated from raw data:\n")

    for(i in seq_len(nrow(significant_pairs))) {
      comparison <- significant_pairs[i, ]
      contrast_name <- as.character(comparison$contrast)

      # Parse the contrast
      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
      if(length(contrast_parts) == 2) {
        level1 <- trimws(contrast_parts[1])
        level2 <- trimws(contrast_parts[2])

        # Get raw data for both conditions
        if(group2_var %in% colnames(comparison)) {
          group2_level <- as.character(comparison[[group2_var]])
          data1 <- data[[response_var]][
            data[[group1_var]] == level1 &
            data[[group2_var]] == group2_level]

          data2 <- data[[response_var]][
            data[[group1_var]] == level2 &
            data[[group2_var]] == group2_level]
        } else {
          data1 <- data[[response_var]][data[[group1_var]] == level1]
          data2 <- data[[response_var]][data[[group1_var]] == level2]
        }

        if(length(data1) > 0 && length(data2) > 0) {
          # Calculate Cohen's d using effsize package
          cohens_d_result <- cohen.d(data1, data2)

          cat(sprintf("Comparison: %s", contrast_name))
          if(group2_var %in% colnames(comparison)) {
            cat(sprintf(" | %s", group2_level))
          }
          cat(sprintf("\n  n1 = %d, n2 = %d\n", length(data1), length(data2)))
          cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
          cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
          cat(sprintf("  p-value: %.5f\n", comparison$p.value))
          cat("\n")
        }
      }
    }
  } else {
    cat("No significant pairwise comparisons found.\n")
  }
}

# 1. TEMPORAL_DO × TIME INTERACTION

# Get simple effects of TIME within each TEMPORAL_DO
temporal_time_simple_df <- as.data.frame(temporal_time_simple)
calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE")
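# Added aside, not in the original script: an interaction contrast tests
# directly whether the Past-Future simple effect differs between the two
# TEMPORAL_DO groups, complementing the per-group simple effects above.
emm_tt <- emmeans(aov_model, ~ TIME * TEMPORAL_DO)
print(contrast(emm_tt, interaction = "pairwise"))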
# 2. TIME × DOMAIN INTERACTION

# Get simple effects of DOMAIN within each TIME
time_domain_simple_df <- as.data.frame(time_domain_simple)
calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE")

# INTERACTION PLOTS

# Define color palettes
time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F")
domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F",
                   "Values" = "#FFB000", "Life" = "#FE6100")

# Define TIME levels (Past, Future order)
time_levels <- c("Past", "Future")

# ============================================================
# PLOT 1: TEMPORAL_DO × TIME INTERACTION
# ============================================================

# Create estimated marginal means for TEMPORAL_DO × TIME
emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)

# Prepare emmeans data frame
emmeans_temporal_time <- emm_temporal_time %>%
  as.data.frame() %>%
  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
  rename(
    ci_lower = lower.CL,
    ci_upper = upper.CL,
    plot_mean = emmean
  ) %>%
  mutate(
    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
    TIME = factor(TIME, levels = time_levels)
  )

# Prepare raw data for plotting
iPlot_temporal <- long_data_clean %>%
  dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>%
  mutate(
    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
    TIME = factor(TIME, levels = time_levels)
  )

# Create TEMPORAL_DO × TIME interaction plot
interaction_plot_temporal <- ggplot() +
  geom_point(
    data = iPlot_temporal,
    aes(x = TEMPORAL_DO, y = MEAN_DIFFERENCE, color = TIME),
    position = position_jitterdodge(dodge.width = 0.5, jitter.width = 0.15),
    alpha = 0.3, shape = 16
  ) +
  geom_rect(
    data = emmeans_temporal_time,
    aes(
      xmin = as.numeric(TEMPORAL_DO) - 0.15 + (as.numeric(TIME) - 1.5) * 0.25,
      xmax = as.numeric(TEMPORAL_DO) + 0.15 + (as.numeric(TIME) - 1.5) * 0.25,
      ymin = ci_lower, ymax = ci_upper,
      fill = TIME
    ),
    color = "black", alpha = 0.5
  ) +
  geom_segment(
    data = emmeans_temporal_time,
    aes(
      x = as.numeric(TEMPORAL_DO) + (as.numeric(TIME) - 1.5) * 0.25,
      xend = as.numeric(TEMPORAL_DO) + (as.numeric(TIME) - 1.5) * 0.25,
      y = ci_lower, yend = ci_upper
    ),
    color = "black"
  ) +
  geom_point(
    data = emmeans_temporal_time,
    aes(
      x = as.numeric(TEMPORAL_DO) + (as.numeric(TIME) - 1.5) * 0.25,
      y = plot_mean,
      color = TIME,
      shape = TIME
    ),
    size = 2.5, stroke = 0.8, fill = "black"
  ) +
  labs(
    x = "TEMPORAL_DO", y = "Mean Difference",
    title = "TEMPORAL_DO × TIME Interaction"
  ) +
  scale_color_manual(name = "TIME", values = time_colors) +
  scale_fill_manual(name = "TIME", values = time_colors) +
  scale_shape_manual(name = "TIME", values = c(21, 22)) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5),
    plot.title = element_text(size = 14, hjust = 0.5)
  )

print(interaction_plot_temporal)
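# Added aside, not in the original script: the rect/segment/point layering
# above can be approximated more compactly with geom_pointrange and
# position_dodge; a minimal sketch using the objects defined above:
compact_temporal_plot <- ggplot(
  emmeans_temporal_time,
  aes(x = TEMPORAL_DO, y = plot_mean, ymin = ci_lower, ymax = ci_upper,
      color = TIME, shape = TIME)
) +
  geom_pointrange(position = position_dodge(width = 0.5)) +
  scale_color_manual(values = time_colors) +
  theme_minimal()
print(compact_temporal_plot)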
# ============================================================
# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis)
# ============================================================

# Create estimated marginal means for TIME × DOMAIN
emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN)

# Prepare emmeans data frame
emmeans_time_domain <- emm_time_domain %>%
  as.data.frame() %>%
  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
  rename(
    ci_lower = lower.CL,
    ci_upper = upper.CL,
    plot_mean = emmean
  ) %>%
  mutate(
    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")),
    TIME = factor(TIME, levels = time_levels)
  )

# Prepare raw data for plotting
iPlot_domain <- long_data_clean %>%
  dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>%
  mutate(
    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")),
    TIME = factor(TIME, levels = time_levels)
  )

# Create TIME × DOMAIN interaction plot
interaction_plot_domain <- ggplot() +
  geom_point(
    data = iPlot_domain,
    aes(x = DOMAIN, y = MEAN_DIFFERENCE, color = TIME),
    position = position_jitterdodge(dodge.width = 0.5, jitter.width = 0.15),
    alpha = 0.3, shape = 16
  ) +
  geom_rect(
    data = emmeans_time_domain,
    aes(
      xmin = as.numeric(DOMAIN) - 0.15 + (as.numeric(TIME) - 1.5) * 0.25,
      xmax = as.numeric(DOMAIN) + 0.15 + (as.numeric(TIME) - 1.5) * 0.25,
      ymin = ci_lower, ymax = ci_upper,
      fill = TIME
    ),
    color = "black", alpha = 0.5
  ) +
  geom_segment(
    data = emmeans_time_domain,
    aes(
      x = as.numeric(DOMAIN) + (as.numeric(TIME) - 1.5) * 0.25,
      xend = as.numeric(DOMAIN) + (as.numeric(TIME) - 1.5) * 0.25,
      y = ci_lower, yend = ci_upper
    ),
    color = "black"
  ) +
  geom_point(
    data = emmeans_time_domain,
    aes(
      x = as.numeric(DOMAIN) + (as.numeric(TIME) - 1.5) * 0.25,
      y = plot_mean,
      color = TIME,
      shape = TIME
    ),
    size = 2.5, stroke = 0.8, fill = "black"
  ) +
  labs(
    x = "DOMAIN", y = "Mean Difference",
    title = "TIME × DOMAIN Interaction"
  ) +
  scale_color_manual(name = "TIME", values = time_colors) +
  scale_fill_manual(name = "TIME", values = time_colors) +
  scale_shape_manual(name = "TIME", values = c(21, 22)) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(size = 14, hjust = 0.5)
  )

print(interaction_plot_domain)

diff --git a/.history/eohi1/mixed anova - domain means_20251006142646.r b/.history/eohi1/mixed anova - domain means_20251006142646.r
deleted file mode 100644
index 6011f6e..0000000
--- a/.history/eohi1/mixed anova - domain means_20251006142646.r
+++ /dev/null
@@ -1,681 +0,0 @@
# Mixed ANOVA Analysis for Domain Means
# EOHI Experiment Data Analysis - Domain Level Analysis
# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life

library(tidyverse)
library(ez)
library(car)
library(nortest)    # For normality tests
library(emmeans)    # For post-hoc comparisons
library(purrr)      # For map functions
library(effsize)    # For Cohen's d calculations
library(effectsize) # For effect size calculations
library(ggplot2)    # For plotting

options(scipen = 999)

options(contrasts = c("contr.sum", "contr.poly"))

setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")

# Read the data
data <- read.csv("exp1.csv")

required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")

# Define domain mapping
domain_mapping <- data.frame(
  variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
               "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
  time = c(rep("Past", 4), rep("Future", 4)),
  domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
  stringsAsFactors = FALSE
)

long_data <- data %>%
  select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>%
  pivot_longer(
    cols = all_of(required_vars),
names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
Homogeneity of variance (Levene's test)

# Test homogeneity across TIME within each DOMAIN
homogeneity_time <- long_data_clean %>%
  group_by(DOMAIN) %>%
  summarise(
    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
    .groups = 'drop'
  )

print(homogeneity_time)

# Test homogeneity across DOMAIN within each TIME
homogeneity_domain <- long_data_clean %>%
  group_by(TIME) %>%
  summarise(
    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
    .groups = 'drop'
  )

print(homogeneity_domain)

# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES

# Function to calculate Hartley's F-max ratio
calculate_hartley_ratio <- function(variances) {
  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
}

# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO)
# within each combination of within-subjects factors (TIME × DOMAIN)

print(unique(long_data_clean$TEMPORAL_DO))
print(table(long_data_clean$TEMPORAL_DO))

observed_temporal_ratios <- long_data_clean %>%
  group_by(TIME, DOMAIN) %>%
  summarise(
    # Variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
    # F-max ratio
    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
    .groups = 'drop'
  ) %>%
  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)

print(observed_temporal_ratios)

# Bootstrap function for Hartley's F-max test
bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
  # Get unique groups
  groups <- unique(data[[group_var]])

  # Calculate observed variances for each group
  observed_vars <- data %>%
    dplyr::group_by(!!rlang::sym(group_var)) %>%
    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
    dplyr::pull(var)

  # Handle invalid variances
  if(any(observed_vars <= 0 | is.na(observed_vars))) {
    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
  }

  # Calculate observed F-max ratio
  observed_ratio <- max(observed_vars) / min(observed_vars)

  # Pre-allocate storage for bootstrap ratios
  bootstrap_ratios <- numeric(n_iter)

  # Get group data once; [[ ]] extraction returns a plain vector whether
  # `data` is a base data.frame or a tibble (as produced by pick())
  group_data_list <- map(groups, ~ {
    group_data <- data[[response_var]][data[[group_var]] == .x]
    group_data[!is.na(group_data)]
  })

  # Bootstrap with pre-allocated storage
  for(i in 1:n_iter) {
    # Resample each group independently and recompute the variance ratio
    sample_vars <- map_dbl(group_data_list, ~ {
      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
      var(bootstrap_sample, na.rm = TRUE)
    })
    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
  }

  # Remove invalid ratios
  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]

  if(length(valid_ratios) == 0) {
    stop("No valid bootstrap ratios generated")
  }

  # Critical value (95th percentile of the bootstrap distribution)
  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)

  # Return only essential information
  return(list(
    observed_ratio = observed_ratio,
    critical_95 = critical_95,
    n_valid_iterations = length(valid_ratios)
  ))
}

# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
set.seed(123) # For
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
anova_table$DFn[anova_table$Effect == "TIME"]
    dfd <- anova_table$DFd[anova_table$Effect == "TIME"]
    p_value <- anova_table$p[anova_table$Effect == "TIME"]
    cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value))
  }

  # DOMAIN main effect (4 levels, needs sphericity correction; uncorrected
  # values shown here, corrected values in the loop below)
  if("DOMAIN" %in% anova_table$Effect) {
    f_value <- anova_table$F[anova_table$Effect == "DOMAIN"]
    dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"]
    dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"]
    p_value <- anova_table$p[anova_table$Effect == "DOMAIN"]
    cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f (uncorrected)\n", dfn, dfd, f_value, p_value))
  }

  # Corrected results for every effect Mauchly's test applies to
  # (DOMAIN and the interactions involving it)
  for(i in seq_len(nrow(corrected_df))) {
    effect <- corrected_df$Effect[i]
    f_value <- anova_table$F[match(effect, anova_table$Effect)]

    cat(sprintf("\n%s:\n", effect))
    cat(sprintf("  Original:     F(%d, %d) = %.3f\n",
                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
  }
} else {
  # cat() so the leading newline is rendered rather than printed literally
  cat("\nNote: Sphericity corrections not needed (sphericity assumption met)\n")
}

# COHEN'S D FOR MAIN EFFECTS

# Create aov model for emmeans
aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
                 data = long_data_clean)

# Main Effect of TIME
time_emmeans <- emmeans(aov_model, ~ TIME)
print(time_emmeans)

time_main_contrast <- pairs(time_emmeans, adjust = "bonferroni")

time_main_df <- as.data.frame(time_main_contrast)
print(time_main_df)

# Calculate Cohen's d for TIME main effect
if(nrow(time_main_df) > 0) {
  cat("\nCohen's d for TIME main effect:\n")
  time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"]
  time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"]

  time_cohens_d <- cohen.d(time_past_data, time_future_data)
  cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data)))
  cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate))
  cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude))
  cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1]))
}

# Main Effect of DOMAIN (significant: p < 0.001)
domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
print(domain_emmeans)
domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni")
domain_main_df <- as.data.frame(domain_main_contrast)
print(domain_main_df)

# Calculate Cohen's d for significant DOMAIN contrasts
significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ]
if(nrow(significant_domain) > 0) {
  cat("\nCohen's d for significant DOMAIN contrasts:\n")
  for(i in seq_len(nrow(significant_domain))) {
    contrast_name <- as.character(significant_domain$contrast[i])
    contrast_parts <- strsplit(contrast_name, " - ")[[1]]
    if(length(contrast_parts) == 2) {
      level1 <- trimws(contrast_parts[1])
      level2 <- trimws(contrast_parts[2])

      data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1]
      data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2]

      if(length(data1) > 0 && length(data2) > 0) {
        domain_cohens_d <- cohen.d(data1, data2)
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - -# ============================================================ -# PLOT 1: TEMPORAL_DO × TIME INTERACTION -# ============================================================ - -# Create estimated marginal means for TEMPORAL_DO × TIME -emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -# Prepare emmeans data frame -emmeans_temporal_time <- emm_temporal_time %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting -iPlot_temporal <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Create TEMPORAL_DO × TIME interaction plot -interaction_plot_temporal <- ggplot() + - geom_point( - data = iPlot_temporal, - aes(x = TEMPORAL_DO, y = MEAN_DIFFERENCE, color = TIME), - position = position_jitterdodge(dodge.width = 0.5, jitter.width = 0.15), - alpha = 0.3, shape = 16 - ) + - geom_rect( - data = emmeans_temporal_time, - aes( - xmin = as.numeric(TEMPORAL_DO) - 0.15 + (as.numeric(TIME) - 1.5) * 0.25, - xmax = as.numeric(TEMPORAL_DO) + 0.15 + (as.numeric(TIME) - 1.5) * 0.25, - ymin = ci_lower, ymax = ci_upper, - fill = TIME - ), - color = "black", alpha = 0.5 - ) + - geom_segment( - data = emmeans_temporal_time, - aes( - x = as.numeric(TEMPORAL_DO) + (as.numeric(TIME) - 1.5) * 0.25, - xend = as.numeric(TEMPORAL_DO) + (as.numeric(TIME) - 1.5) * 0.25, - y = ci_lower, yend = ci_upper - ), - color = "black" - ) + - geom_point( - data = emmeans_temporal_time, - aes( - x = as.numeric(TEMPORAL_DO) + (as.numeric(TIME) - 1.5) * 0.25, - y = plot_mean, - color = TIME, - shape = TIME - ), - size = 2.5, stroke = 0.8, fill = "black" - ) + - labs( - x = "TEMPORAL_DO", y = "Mean Difference", - title = "TEMPORAL_DO × TIME Interaction" - ) + - scale_color_manual(name = "TIME", values = time_colors) + - scale_fill_manual(name = "TIME", values = time_colors) + - scale_shape_manual(name = "TIME", values = c(21, 22)) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 0, hjust = 0.5), - plot.title = element_text(size = 14, hjust = 0.5) - ) - -print(interaction_plot_temporal) - -# ============================================================ -# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis) -# ============================================================ - -# Create estimated marginal means for TIME × DOMAIN -emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_time_domain <- emm_time_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - 
mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting -iPlot_domain <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Create TIME × DOMAIN interaction plot -interaction_plot_domain <- ggplot() + - geom_point( - data = iPlot_domain, - aes(x = DOMAIN, y = MEAN_DIFFERENCE, color = TIME), - position = position_jitterdodge(dodge.width = 0.5, jitter.width = 0.15), - alpha = 0.3, shape = 16 - ) + - geom_rect( - data = emmeans_time_domain, - aes( - xmin = as.numeric(DOMAIN) - 0.15 + (as.numeric(TIME) - 1.5) * 0.25, - xmax = as.numeric(DOMAIN) + 0.15 + (as.numeric(TIME) - 1.5) * 0.25, - ymin = ci_lower, ymax = ci_upper, - fill = TIME - ), - color = "black", alpha = 0.5 - ) + - geom_segment( - data = emmeans_time_domain, - aes( - x = as.numeric(DOMAIN) + (as.numeric(TIME) - 1.5) * 0.25, - xend = as.numeric(DOMAIN) + (as.numeric(TIME) - 1.5) * 0.25, - y = ci_lower, yend = ci_upper - ), - color = "black" - ) + - geom_point( - data = emmeans_time_domain, - aes( - x = as.numeric(DOMAIN) + (as.numeric(TIME) - 1.5) * 0.25, - y = plot_mean, - color = TIME, - shape = TIME - ), - size = 2.5, stroke = 0.8, fill = "black" - ) + - labs( - x = "DOMAIN", y = "Mean Difference", - title = "TIME × DOMAIN Interaction" - ) + - scale_color_manual(name = "TIME", values = time_colors) + - scale_fill_manual(name = "TIME", values = time_colors) + - scale_shape_manual(name = "TIME", values = c(21, 22)) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 45, hjust = 1), - plot.title = element_text(size = 14, hjust = 0.5) - ) - -print(interaction_plot_domain) diff --git a/.history/eohi1/mixed anova - domain means_20251006142658.r b/.history/eohi1/mixed anova - domain means_20251006142658.r deleted file mode 100644 index 6011f6e..0000000 --- a/.history/eohi1/mixed anova - domain means_20251006142658.r +++ /dev/null @@ -1,681 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - 
names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
Homogeneity of variance (Levene's test)

# Test homogeneity across TIME within each DOMAIN
homogeneity_time <- long_data_clean %>%
  group_by(DOMAIN) %>%
  summarise(
    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
    .groups = 'drop'
  )

print(homogeneity_time)

# Test homogeneity across DOMAIN within each TIME
homogeneity_domain <- long_data_clean %>%
  group_by(TIME) %>%
  summarise(
    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
    .groups = 'drop'
  )

print(homogeneity_domain)

# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES

# Function to calculate Hartley's F-max ratio
calculate_hartley_ratio <- function(variances) {
  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
}

# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO)
# within each combination of within-subjects factors (TIME × DOMAIN)

print(unique(long_data_clean$TEMPORAL_DO))
print(table(long_data_clean$TEMPORAL_DO))

observed_temporal_ratios <- long_data_clean %>%
  group_by(TIME, DOMAIN) %>%
  summarise(
    # Variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
    # F-max ratio
    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
    .groups = 'drop'
  ) %>%
  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)

print(observed_temporal_ratios)

# Bootstrap function for Hartley's F-max test
bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
  # Get unique groups
  groups <- unique(data[[group_var]])

  # Calculate observed variances for each group
  observed_vars <- data %>%
    dplyr::group_by(!!rlang::sym(group_var)) %>%
    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
    dplyr::pull(var)

  # Handle invalid variances
  if(any(observed_vars <= 0 | is.na(observed_vars))) {
    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
  }

  # Calculate observed F-max ratio
  observed_ratio <- max(observed_vars) / min(observed_vars)

  # Pre-allocate storage for bootstrap ratios
  bootstrap_ratios <- numeric(n_iter)

  # Get group data once; [[ ]] extraction returns a plain vector whether
  # `data` is a base data.frame or a tibble (as produced by pick())
  group_data_list <- map(groups, ~ {
    group_data <- data[[response_var]][data[[group_var]] == .x]
    group_data[!is.na(group_data)]
  })

  # Bootstrap with pre-allocated storage
  for(i in 1:n_iter) {
    # Resample each group independently and recompute the variance ratio
    sample_vars <- map_dbl(group_data_list, ~ {
      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
      var(bootstrap_sample, na.rm = TRUE)
    })
    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
  }

  # Remove invalid ratios
  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]

  if(length(valid_ratios) == 0) {
    stop("No valid bootstrap ratios generated")
  }

  # Critical value (95th percentile of the bootstrap distribution)
  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)

  # Return only essential information
  return(list(
    observed_ratio = observed_ratio,
    critical_95 = critical_95,
    n_valid_iterations = length(valid_ratios)
  ))
}

# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
set.seed(123) # For
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
anova_table$DFn[anova_table$Effect == "TIME"]
    dfd <- anova_table$DFd[anova_table$Effect == "TIME"]
    p_value <- anova_table$p[anova_table$Effect == "TIME"]
    cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value))
  }

  # DOMAIN main effect (4 levels, needs sphericity correction; uncorrected
  # values shown here, corrected values in the loop below)
  if("DOMAIN" %in% anova_table$Effect) {
    f_value <- anova_table$F[anova_table$Effect == "DOMAIN"]
    dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"]
    dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"]
    p_value <- anova_table$p[anova_table$Effect == "DOMAIN"]
    cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f (uncorrected)\n", dfn, dfd, f_value, p_value))
  }

  # Corrected results for every effect Mauchly's test applies to
  # (DOMAIN and the interactions involving it)
  for(i in seq_len(nrow(corrected_df))) {
    effect <- corrected_df$Effect[i]
    f_value <- anova_table$F[match(effect, anova_table$Effect)]

    cat(sprintf("\n%s:\n", effect))
    cat(sprintf("  Original:     F(%d, %d) = %.3f\n",
                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
  }
} else {
  # cat() so the leading newline is rendered rather than printed literally
  cat("\nNote: Sphericity corrections not needed (sphericity assumption met)\n")
}

# COHEN'S D FOR MAIN EFFECTS

# Create aov model for emmeans
aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
                 data = long_data_clean)

# Main Effect of TIME
time_emmeans <- emmeans(aov_model, ~ TIME)
print(time_emmeans)

time_main_contrast <- pairs(time_emmeans, adjust = "bonferroni")

time_main_df <- as.data.frame(time_main_contrast)
print(time_main_df)

# Calculate Cohen's d for TIME main effect
if(nrow(time_main_df) > 0) {
  cat("\nCohen's d for TIME main effect:\n")
  time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"]
  time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"]

  time_cohens_d <- cohen.d(time_past_data, time_future_data)
  cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data)))
  cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate))
  cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude))
  cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1]))
}

# Main Effect of DOMAIN (significant: p < 0.001)
domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
print(domain_emmeans)
domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni")
domain_main_df <- as.data.frame(domain_main_contrast)
print(domain_main_df)

# Calculate Cohen's d for significant DOMAIN contrasts
significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ]
if(nrow(significant_domain) > 0) {
  cat("\nCohen's d for significant DOMAIN contrasts:\n")
  for(i in seq_len(nrow(significant_domain))) {
    contrast_name <- as.character(significant_domain$contrast[i])
    contrast_parts <- strsplit(contrast_name, " - ")[[1]]
    if(length(contrast_parts) == 2) {
      level1 <- trimws(contrast_parts[1])
      level2 <- trimws(contrast_parts[2])

      data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1]
      data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2]

      if(length(data1) > 0 && length(data2) > 0) {
        domain_cohens_d <- cohen.d(data1, data2)
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - -# ============================================================ -# PLOT 1: TEMPORAL_DO × TIME INTERACTION -# ============================================================ - -# Create estimated marginal means for TEMPORAL_DO × TIME -emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -# Prepare emmeans data frame -emmeans_temporal_time <- emm_temporal_time %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting -iPlot_temporal <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Create TEMPORAL_DO × TIME interaction plot -interaction_plot_temporal <- ggplot() + - geom_point( - data = iPlot_temporal, - aes(x = TEMPORAL_DO, y = MEAN_DIFFERENCE, color = TIME), - position = position_jitterdodge(dodge.width = 0.5, jitter.width = 0.15), - alpha = 0.3, shape = 16 - ) + - geom_rect( - data = emmeans_temporal_time, - aes( - xmin = as.numeric(TEMPORAL_DO) - 0.15 + (as.numeric(TIME) - 1.5) * 0.25, - xmax = as.numeric(TEMPORAL_DO) + 0.15 + (as.numeric(TIME) - 1.5) * 0.25, - ymin = ci_lower, ymax = ci_upper, - fill = TIME - ), - color = "black", alpha = 0.5 - ) + - geom_segment( - data = emmeans_temporal_time, - aes( - x = as.numeric(TEMPORAL_DO) + (as.numeric(TIME) - 1.5) * 0.25, - xend = as.numeric(TEMPORAL_DO) + (as.numeric(TIME) - 1.5) * 0.25, - y = ci_lower, yend = ci_upper - ), - color = "black" - ) + - geom_point( - data = emmeans_temporal_time, - aes( - x = as.numeric(TEMPORAL_DO) + (as.numeric(TIME) - 1.5) * 0.25, - y = plot_mean, - color = TIME, - shape = TIME - ), - size = 2.5, stroke = 0.8, fill = "black" - ) + - labs( - x = "TEMPORAL_DO", y = "Mean Difference", - title = "TEMPORAL_DO × TIME Interaction" - ) + - scale_color_manual(name = "TIME", values = time_colors) + - scale_fill_manual(name = "TIME", values = time_colors) + - scale_shape_manual(name = "TIME", values = c(21, 22)) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 0, hjust = 0.5), - plot.title = element_text(size = 14, hjust = 0.5) - ) - -print(interaction_plot_temporal) - -# ============================================================ -# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis) -# ============================================================ - -# Create estimated marginal means for TIME × DOMAIN -emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_time_domain <- emm_time_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - 
mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting -iPlot_domain <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Create TIME × DOMAIN interaction plot -interaction_plot_domain <- ggplot() + - geom_point( - data = iPlot_domain, - aes(x = DOMAIN, y = MEAN_DIFFERENCE, color = TIME), - position = position_jitterdodge(dodge.width = 0.5, jitter.width = 0.15), - alpha = 0.3, shape = 16 - ) + - geom_rect( - data = emmeans_time_domain, - aes( - xmin = as.numeric(DOMAIN) - 0.15 + (as.numeric(TIME) - 1.5) * 0.25, - xmax = as.numeric(DOMAIN) + 0.15 + (as.numeric(TIME) - 1.5) * 0.25, - ymin = ci_lower, ymax = ci_upper, - fill = TIME - ), - color = "black", alpha = 0.5 - ) + - geom_segment( - data = emmeans_time_domain, - aes( - x = as.numeric(DOMAIN) + (as.numeric(TIME) - 1.5) * 0.25, - xend = as.numeric(DOMAIN) + (as.numeric(TIME) - 1.5) * 0.25, - y = ci_lower, yend = ci_upper - ), - color = "black" - ) + - geom_point( - data = emmeans_time_domain, - aes( - x = as.numeric(DOMAIN) + (as.numeric(TIME) - 1.5) * 0.25, - y = plot_mean, - color = TIME, - shape = TIME - ), - size = 2.5, stroke = 0.8, fill = "black" - ) + - labs( - x = "DOMAIN", y = "Mean Difference", - title = "TIME × DOMAIN Interaction" - ) + - scale_color_manual(name = "TIME", values = time_colors) + - scale_fill_manual(name = "TIME", values = time_colors) + - scale_shape_manual(name = "TIME", values = c(21, 22)) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 45, hjust = 1), - plot.title = element_text(size = 14, hjust = 0.5) - ) - -print(interaction_plot_domain) diff --git a/.history/eohi1/mixed anova - domain means_20251006142703.r b/.history/eohi1/mixed anova - domain means_20251006142703.r deleted file mode 100644 index 6011f6e..0000000 --- a/.history/eohi1/mixed anova - domain means_20251006142703.r +++ /dev/null @@ -1,681 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - 
names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For 
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - 
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - -# ============================================================ -# PLOT 1: TEMPORAL_DO × TIME INTERACTION -# ============================================================ - -# Create estimated marginal means for TEMPORAL_DO × TIME -emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -# Prepare emmeans data frame -emmeans_temporal_time <- emm_temporal_time %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting -iPlot_temporal <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Create TEMPORAL_DO × TIME interaction plot -interaction_plot_temporal <- ggplot() + - geom_point( - data = iPlot_temporal, - aes(x = TEMPORAL_DO, y = MEAN_DIFFERENCE, color = TIME), - position = position_jitterdodge(dodge.width = 0.5, jitter.width = 0.15), - alpha = 0.3, shape = 16 - ) + - geom_rect( - data = emmeans_temporal_time, - aes( - xmin = as.numeric(TEMPORAL_DO) - 0.15 + (as.numeric(TIME) - 1.5) * 0.25, - xmax = as.numeric(TEMPORAL_DO) + 0.15 + (as.numeric(TIME) - 1.5) * 0.25, - ymin = ci_lower, ymax = ci_upper, - fill = TIME - ), - color = "black", alpha = 0.5 - ) + - geom_segment( - data = emmeans_temporal_time, - aes( - x = as.numeric(TEMPORAL_DO) + (as.numeric(TIME) - 1.5) * 0.25, - xend = as.numeric(TEMPORAL_DO) + (as.numeric(TIME) - 1.5) * 0.25, - y = ci_lower, yend = ci_upper - ), - color = "black" - ) + - geom_point( - data = emmeans_temporal_time, - aes( - x = as.numeric(TEMPORAL_DO) + (as.numeric(TIME) - 1.5) * 0.25, - y = plot_mean, - color = TIME, - shape = TIME - ), - size = 2.5, stroke = 0.8, fill = "black" - ) + - labs( - x = "TEMPORAL_DO", y = "Mean Difference", - title = "TEMPORAL_DO × TIME Interaction" - ) + - scale_color_manual(name = "TIME", values = time_colors) + - scale_fill_manual(name = "TIME", values = time_colors) + - scale_shape_manual(name = "TIME", values = c(21, 22)) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 0, hjust = 0.5), - plot.title = element_text(size = 14, hjust = 0.5) - ) - -print(interaction_plot_temporal) - -# ============================================================ -# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis) -# ============================================================ - -# Create estimated marginal means for TIME × DOMAIN -emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_time_domain <- emm_time_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - 
mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting -iPlot_domain <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Create TIME × DOMAIN interaction plot -interaction_plot_domain <- ggplot() + - geom_point( - data = iPlot_domain, - aes(x = DOMAIN, y = MEAN_DIFFERENCE, color = TIME), - position = position_jitterdodge(dodge.width = 0.5, jitter.width = 0.15), - alpha = 0.3, shape = 16 - ) + - geom_rect( - data = emmeans_time_domain, - aes( - xmin = as.numeric(DOMAIN) - 0.15 + (as.numeric(TIME) - 1.5) * 0.25, - xmax = as.numeric(DOMAIN) + 0.15 + (as.numeric(TIME) - 1.5) * 0.25, - ymin = ci_lower, ymax = ci_upper, - fill = TIME - ), - color = "black", alpha = 0.5 - ) + - geom_segment( - data = emmeans_time_domain, - aes( - x = as.numeric(DOMAIN) + (as.numeric(TIME) - 1.5) * 0.25, - xend = as.numeric(DOMAIN) + (as.numeric(TIME) - 1.5) * 0.25, - y = ci_lower, yend = ci_upper - ), - color = "black" - ) + - geom_point( - data = emmeans_time_domain, - aes( - x = as.numeric(DOMAIN) + (as.numeric(TIME) - 1.5) * 0.25, - y = plot_mean, - color = TIME, - shape = TIME - ), - size = 2.5, stroke = 0.8, fill = "black" - ) + - labs( - x = "DOMAIN", y = "Mean Difference", - title = "TIME × DOMAIN Interaction" - ) + - scale_color_manual(name = "TIME", values = time_colors) + - scale_fill_manual(name = "TIME", values = time_colors) + - scale_shape_manual(name = "TIME", values = c(21, 22)) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 45, hjust = 1), - plot.title = element_text(size = 14, hjust = 0.5) - ) - -print(interaction_plot_domain) diff --git a/.history/eohi1/mixed anova - domain means_20251006152026.r b/.history/eohi1/mixed anova - domain means_20251006152026.r deleted file mode 100644 index 4eb3a79..0000000 --- a/.history/eohi1/mixed anova - domain means_20251006152026.r +++ /dev/null @@ -1,671 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - 
names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For 
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - 
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - -# ============================================================ -# PLOT 1: TEMPORAL_DO × TIME INTERACTION -# ============================================================ - -# Create estimated marginal means for TEMPORAL_DO × TIME -emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -# Prepare emmeans data frame -emmeans_temporal_time <- emm_temporal_time %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting -iPlot_temporal <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Create TEMPORAL_DO × TIME interaction plot -interaction_plot_temporal <- ggplot() + - geom_violin( - data = iPlot_temporal, - aes(x = TEMPORAL_DO, y = MEAN_DIFFERENCE, fill = TIME), - position = position_dodge(width = 0.7), - alpha = 0.5, - color = "black", - trim = FALSE - ) + - geom_point( - data = emmeans_temporal_time, - aes( - x = as.numeric(TEMPORAL_DO) + (as.numeric(TIME) - 1.5) * 0.35, - y = plot_mean, - color = TIME - ), - size = 3, shape = 18 - ) + - geom_errorbar( - data = emmeans_temporal_time, - aes( - x = as.numeric(TEMPORAL_DO) + (as.numeric(TIME) - 1.5) * 0.35, - ymin = ci_lower, ymax = ci_upper, - color = TIME - ), - width = 0.1, linewidth = 0.8 - ) + - labs( - x = "TEMPORAL_DO", y = "Mean Difference", - title = "TEMPORAL_DO × TIME Interaction" - ) + - scale_color_manual(name = "TIME", values = time_colors) + - scale_fill_manual(name = "TIME", values = time_colors) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 0, hjust = 0.5), - plot.title = element_text(size = 14, hjust = 0.5) - ) - -print(interaction_plot_temporal) - -# ============================================================ -# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis) -# ============================================================ - -# Create estimated marginal means for TIME × DOMAIN -emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_time_domain <- emm_time_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting -iPlot_domain <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Create TIME × 
-interaction_plot_domain <- ggplot() +
-  geom_point(
-    data = iPlot_domain,
-    aes(x = DOMAIN, y = MEAN_DIFFERENCE, color = TIME),
-    position = position_jitterdodge(dodge.width = 0.5, jitter.width = 0.15),
-    alpha = 0.3, shape = 16
-  ) +
-  geom_rect(
-    data = emmeans_time_domain,
-    aes(
-      xmin = as.numeric(DOMAIN) - 0.15 + (as.numeric(TIME) - 1.5) * 0.25,
-      xmax = as.numeric(DOMAIN) + 0.15 + (as.numeric(TIME) - 1.5) * 0.25,
-      ymin = ci_lower, ymax = ci_upper,
-      fill = TIME
-    ),
-    color = "black", alpha = 0.5
-  ) +
-  geom_segment(
-    data = emmeans_time_domain,
-    aes(
-      x = as.numeric(DOMAIN) + (as.numeric(TIME) - 1.5) * 0.25,
-      xend = as.numeric(DOMAIN) + (as.numeric(TIME) - 1.5) * 0.25,
-      y = ci_lower, yend = ci_upper
-    ),
-    color = "black"
-  ) +
-  geom_point(
-    data = emmeans_time_domain,
-    aes(
-      x = as.numeric(DOMAIN) + (as.numeric(TIME) - 1.5) * 0.25,
-      y = plot_mean,
-      color = TIME,
-      shape = TIME
-    ),
-    size = 2.5, stroke = 0.8, fill = "black"
-  ) +
-  labs(
-    x = "DOMAIN", y = "Mean Difference",
-    title = "TIME × DOMAIN Interaction"
-  ) +
-  scale_color_manual(name = "TIME", values = time_colors) +
-  scale_fill_manual(name = "TIME", values = time_colors) +
-  scale_shape_manual(name = "TIME", values = c(21, 22)) +
-  theme_minimal() +
-  theme(
-    axis.text.x = element_text(angle = 45, hjust = 1),
-    plot.title = element_text(size = 14, hjust = 0.5)
-  )
-
-print(interaction_plot_domain)
diff --git a/.history/eohi1/mixed anova - domain means_20251006153456.r b/.history/eohi1/mixed anova - domain means_20251006153456.r
deleted file mode 100644
index 0dcf572..0000000
--- a/.history/eohi1/mixed anova - domain means_20251006153456.r
+++ /dev/null
@@ -1,661 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-library(tidyverse)
-library(ez)
-library(car)
-library(nortest) # For normality tests
-library(emmeans) # For post-hoc comparisons
-library(purrr) # For map functions
-library(effsize) # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-library(ggplot2) # For plotting
-
-options(scipen = 999)
-
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
-
-# Define domain mapping
-domain_mapping <- data.frame(
-  variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-               "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
-  time = c(rep("Past", 4), rep("Future", 4)),
-  domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
-  stringsAsFactors = FALSE
-)
-
-long_data <- data %>%
-  select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>%
-  pivot_longer(
-    cols = all_of(required_vars),
-    names_to = "variable",
-    values_to = "MEAN_DIFFERENCE"
-  ) %>%
-  left_join(domain_mapping, by = "variable") %>%
-  # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping)
-  mutate(
-    TIME = factor(time, levels = c("Past", "Future")),
-    DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")),
-    pID = as.factor(pID),
-    TEMPORAL_DO = as.factor(TEMPORAL_DO)
-  ) %>%
-  # Select final columns and remove any rows with missing values
-  select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>%
-  filter(!is.na(MEAN_DIFFERENCE))
-
-# Overall descriptive statistics by TIME and DOMAIN
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For 
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
anova_table$DFn[anova_table$Effect == "TIME"]
-    dfd <- anova_table$DFd[anova_table$Effect == "TIME"]
-    p_value <- anova_table$p[anova_table$Effect == "TIME"]
-    cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value))
-  }
-
-  # DOMAIN main effect (4 levels; uncorrected p, GG/HF-corrected values printed below)
-  if("DOMAIN" %in% anova_table$Effect) {
-    f_value <- anova_table$F[anova_table$Effect == "DOMAIN"]
-    dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"]
-    dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"]
-    p_value <- anova_table$p[anova_table$Effect == "DOMAIN"]
-    cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value))
-  }
-
-  # Interactions with sphericity corrections
-  for(i in seq_len(nrow(corrected_df))) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf(" Original: F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  cat("\nNote: Sphericity corrections not needed (sphericity assumption met)\n")
-}
-
-# COHEN'S D FOR MAIN EFFECTS
-
-# Create aov model for emmeans
-aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                 data = long_data_clean)
-
-# Main Effect of TIME
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print(time_emmeans)
-
-time_main_contrast <- pairs(time_emmeans, adjust = "bonferroni")
-
-time_main_df <- as.data.frame(time_main_contrast)
-print(time_main_df)
-
-# Calculate Cohen's d for TIME main effect
-if(nrow(time_main_df) > 0) {
-  cat("\nCohen's d for TIME main effect:\n")
-  time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"]
-  time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"]
-
-  time_cohens_d <- cohen.d(time_past_data, time_future_data)
-  cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data)))
-  cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate))
-  cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude))
-  cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1]))
-}
-
-# Main Effect of DOMAIN (significant: p < 0.001)
-domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
-print(domain_emmeans)
-domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni")
-domain_main_df <- as.data.frame(domain_main_contrast)
-print(domain_main_df)
-
-# Calculate Cohen's d for significant DOMAIN contrasts
-significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ]
-if(nrow(significant_domain) > 0) {
-  cat("\nCohen's d for significant DOMAIN contrasts:\n")
-  for(i in seq_len(nrow(significant_domain))) {
-    contrast_name <- as.character(significant_domain$contrast[i])
-    contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-    if(length(contrast_parts) == 2) {
-      level1 <- trimws(contrast_parts[1])
-      level2 <- trimws(contrast_parts[2])
-
-      data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1]
-      data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2]
-
-      if(length(data1) > 0 && length(data2) > 0) {
-        domain_cohens_d <- cohen.d(data1, data2)
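# EDITOR'S NOTE (aside, not part of the deleted snapshot): plain Cohen's d
# is upwardly biased in smaller cells, and effsize::cohen.d() already exposes
# a hedges.correction flag. A minimal sketch of a bias-corrected Hedges' g
# for the same contrast, reusing the data1/data2 vectors in scope:
domain_hedges_g <- cohen.d(data1, data2, hedges.correction = TRUE)
cat(sprintf(" Hedges' g (bias-corrected): %.5f\n", domain_hedges_g$estimate))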
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - -# ============================================================ -# PLOT 1: TEMPORAL_DO × TIME INTERACTION -# ============================================================ - -# Create estimated marginal means for TEMPORAL_DO × TIME -emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -# Prepare emmeans data frame -emmeans_temporal_time <- emm_temporal_time %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting -iPlot_temporal <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Create TEMPORAL_DO × TIME interaction plot -interaction_plot_temporal <- ggplot() + - geom_violin( - data = iPlot_temporal, - aes(x = TEMPORAL_DO, y = MEAN_DIFFERENCE, fill = TIME), - position = position_dodge(width = 0.7), - alpha = 0.5, - color = "black", - trim = FALSE - ) + - geom_point( - data = emmeans_temporal_time, - aes( - x = as.numeric(TEMPORAL_DO) + (as.numeric(TIME) - 1.5) * 0.35, - y = plot_mean, - color = TIME - ), - size = 3, shape = 18 - ) + - geom_errorbar( - data = emmeans_temporal_time, - aes( - x = as.numeric(TEMPORAL_DO) + (as.numeric(TIME) - 1.5) * 0.35, - ymin = ci_lower, ymax = ci_upper, - color = TIME - ), - width = 0.1, linewidth = 0.8 - ) + - labs( - x = "TEMPORAL_DO", y = "Mean Difference", - title = "TEMPORAL_DO × TIME Interaction" - ) + - scale_color_manual(name = "TIME", values = time_colors) + - scale_fill_manual(name = "TIME", values = time_colors) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 0, hjust = 0.5), - plot.title = element_text(size = 14, hjust = 0.5) - ) - -print(interaction_plot_temporal) - -# ============================================================ -# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis) -# ============================================================ - -# Create estimated marginal means for TIME × DOMAIN -emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_time_domain <- emm_time_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting -iPlot_domain <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Create TIME × 
DOMAIN interaction plot -interaction_plot_domain <- ggplot() + - geom_violin( - data = iPlot_domain, - aes(x = DOMAIN, y = MEAN_DIFFERENCE, fill = TIME), - position = position_dodge(width = 0.7), - alpha = 0.5, - color = "black", - trim = FALSE - ) + - geom_point( - data = emmeans_time_domain, - aes( - x = as.numeric(DOMAIN) + (as.numeric(TIME) - 1.5) * 0.35, - y = plot_mean, - color = TIME - ), - size = 3, shape = 18 - ) + - geom_errorbar( - data = emmeans_time_domain, - aes( - x = as.numeric(DOMAIN) + (as.numeric(TIME) - 1.5) * 0.35, - ymin = ci_lower, ymax = ci_upper, - color = TIME - ), - width = 0.1, linewidth = 0.8 - ) + - labs( - x = "DOMAIN", y = "Mean Difference", - title = "TIME × DOMAIN Interaction" - ) + - scale_color_manual(name = "TIME", values = time_colors) + - scale_fill_manual(name = "TIME", values = time_colors) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 45, hjust = 1), - plot.title = element_text(size = 14, hjust = 0.5) - ) - -print(interaction_plot_domain) diff --git a/.history/eohi1/mixed anova - domain means_20251006155838.r b/.history/eohi1/mixed anova - domain means_20251006155838.r deleted file mode 100644 index 856ed60..0000000 --- a/.history/eohi1/mixed anova - domain means_20251006155838.r +++ /dev/null @@ -1,682 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - 
sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For 
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
anova_table$DFn[anova_table$Effect == "TIME"]
-    dfd <- anova_table$DFd[anova_table$Effect == "TIME"]
-    p_value <- anova_table$p[anova_table$Effect == "TIME"]
-    cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value))
-  }
-
-  # DOMAIN main effect (4 levels; uncorrected p, GG/HF-corrected values printed below)
-  if("DOMAIN" %in% anova_table$Effect) {
-    f_value <- anova_table$F[anova_table$Effect == "DOMAIN"]
-    dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"]
-    dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"]
-    p_value <- anova_table$p[anova_table$Effect == "DOMAIN"]
-    cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value))
-  }
-
-  # Interactions with sphericity corrections
-  for(i in seq_len(nrow(corrected_df))) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf(" Original: F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  cat("\nNote: Sphericity corrections not needed (sphericity assumption met)\n")
-}
-
-# COHEN'S D FOR MAIN EFFECTS
-
-# Create aov model for emmeans
-aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                 data = long_data_clean)
-
-# Main Effect of TIME
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print(time_emmeans)
-
-time_main_contrast <- pairs(time_emmeans, adjust = "bonferroni")
-
-time_main_df <- as.data.frame(time_main_contrast)
-print(time_main_df)
-
-# Calculate Cohen's d for TIME main effect
-if(nrow(time_main_df) > 0) {
-  cat("\nCohen's d for TIME main effect:\n")
-  time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"]
-  time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"]
-
-  time_cohens_d <- cohen.d(time_past_data, time_future_data)
-  cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data)))
-  cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate))
-  cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude))
-  cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1]))
-}
-
-# Main Effect of DOMAIN (significant: p < 0.001)
-domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
-print(domain_emmeans)
-domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni")
-domain_main_df <- as.data.frame(domain_main_contrast)
-print(domain_main_df)
-
-# Calculate Cohen's d for significant DOMAIN contrasts
-significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ]
-if(nrow(significant_domain) > 0) {
-  cat("\nCohen's d for significant DOMAIN contrasts:\n")
-  for(i in seq_len(nrow(significant_domain))) {
-    contrast_name <- as.character(significant_domain$contrast[i])
-    contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-    if(length(contrast_parts) == 2) {
-      level1 <- trimws(contrast_parts[1])
-      level2 <- trimws(contrast_parts[2])
-
-      data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1]
-      data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2]
-
-      if(length(data1) > 0 && length(data2) > 0) {
-        domain_cohens_d <- cohen.d(data1, data2)
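# EDITOR'S NOTE (aside, not part of the deleted snapshot; commented out
# because this point sits inside the contrast loop and the code belongs
# right after the ANOVA table is printed): ezANOVA(detailed = TRUE) already
# returns generalized eta squared in the `ges` column, and partial eta
# squared follows from the SSn/SSd columns of the same table. A sketch:
#   anova_es <- mixed_anova_model$ANOVA
#   anova_es$pes <- anova_es$SSn / (anova_es$SSn + anova_es$SSd)  # partial eta squared
#   print(anova_es[, c("Effect", "ges", "pes")])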
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - -# ============================================================ -# PLOT 1: TEMPORAL_DO × TIME INTERACTION -# ============================================================ - -# Create estimated marginal means for TEMPORAL_DO × TIME -emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -# Prepare emmeans data frame -emmeans_temporal_time <- emm_temporal_time %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_temporal <- 0.5 -iPlot_temporal <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal, - x_dodged = x_pos + time_offset - ) - -emmeans_temporal_time <- emmeans_temporal_time %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal, - x_dodged = x_pos + time_offset - ) - -# Create TEMPORAL_DO × TIME interaction plot -interaction_plot_temporal <- ggplot() + - geom_violin( - data = iPlot_temporal, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_temporal / 2 - ) + - geom_errorbar( - data = emmeans_temporal_time, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_temporal_time, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_fill_manual(name = "TIME", values = time_colors) + - scale_shape_manual(name = "TIME", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal) - -# ============================================================ -# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis) -# ============================================================ - -# Create 
estimated marginal means for TIME × DOMAIN -emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_time_domain <- emm_time_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting -iPlot_domain <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Create TIME × DOMAIN interaction plot -interaction_plot_domain <- ggplot() + - geom_violin( - data = iPlot_domain, - aes(x = DOMAIN, y = MEAN_DIFFERENCE, fill = TIME), - position = position_dodge(width = 0.7), - alpha = 0.5, - color = "black", - trim = FALSE - ) + - geom_point( - data = emmeans_time_domain, - aes( - x = as.numeric(DOMAIN) + (as.numeric(TIME) - 1.5) * 0.35, - y = plot_mean, - color = TIME - ), - size = 3, shape = 18 - ) + - geom_errorbar( - data = emmeans_time_domain, - aes( - x = as.numeric(DOMAIN) + (as.numeric(TIME) - 1.5) * 0.35, - ymin = ci_lower, ymax = ci_upper, - color = TIME - ), - width = 0.1, linewidth = 0.8 - ) + - labs( - x = "DOMAIN", y = "Mean Difference", - title = "TIME × DOMAIN Interaction" - ) + - scale_color_manual(name = "TIME", values = time_colors) + - scale_fill_manual(name = "TIME", values = time_colors) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 45, hjust = 1), - plot.title = element_text(size = 14, hjust = 0.5) - ) - -print(interaction_plot_domain) diff --git a/.history/eohi1/mixed anova - domain means_20251006155859.r b/.history/eohi1/mixed anova - domain means_20251006155859.r deleted file mode 100644 index c4ac77e..0000000 --- a/.history/eohi1/mixed anova - domain means_20251006155859.r +++ /dev/null @@ -1,704 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - 
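# EDITOR'S NOTE (aside, not part of the deleted snapshot; kept entirely in
# comments because this point falls between two stages of a pipeline): after
# the long reshape earlier in this script, each participant should contribute
# exactly 8 rows (2 TIME levels x 4 DOMAIN levels). A quick sanity check, as
# a sketch to run once long_data exists:
#   count(long_data, pID) %>% filter(n != 8)   # expect zero rows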
left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For 
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
-
-  # Within-subjects effects (sphericity corrections where applicable)
-
-  # TIME main effect (2 levels, sphericity automatically satisfied)
-  if("TIME" %in% anova_table$Effect) {
-    f_value <- anova_table$F[anova_table$Effect == "TIME"]
-    dfn <- anova_table$DFn[anova_table$Effect == "TIME"]
-    dfd <- anova_table$DFd[anova_table$Effect == "TIME"]
-    p_value <- anova_table$p[anova_table$Effect == "TIME"]
-    cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value))
-  }
-
-  # DOMAIN main effect (4 levels, needs sphericity correction)
-  if("DOMAIN" %in% anova_table$Effect) {
-    f_value <- anova_table$F[anova_table$Effect == "DOMAIN"]
-    dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"]
-    dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"]
-    p_value <- anova_table$p[anova_table$Effect == "DOMAIN"]
-    cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value))
-  }
-
-  # Interactions with sphericity corrections
-  for(i in seq_len(nrow(corrected_df))) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original: F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  cat("\nNote: Sphericity corrections not needed (sphericity assumption met)\n")
-}
-
-# COHEN'S D FOR MAIN EFFECTS
-
-# Create aov model for emmeans
-aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                 data = long_data_clean)
-
-# Main Effect of TIME
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print(time_emmeans)
-
-time_main_contrast <- pairs(time_emmeans, adjust = "bonferroni")
-
-time_main_df <- as.data.frame(time_main_contrast)
-print(time_main_df)
-
-# Calculate Cohen's d for TIME main effect
-if(nrow(time_main_df) > 0) {
-  cat("\nCohen's d for TIME main effect:\n")
-  time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"]
-  time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"]
-
-  time_cohens_d <- cohen.d(time_past_data, time_future_data)
-  cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data)))
-  cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate))
-  cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude))
-  cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1]))
-}
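One caveat worth flagging: TIME is a within-subjects factor, so the independent-samples cohen.d() call above ignores the pairing of Past and Future scores within participants. effsize's paired variant is a defensible alternative when the two vectors are aligned by participant and domain; a sketch with invented toy vectors, assuming effsize is loaded as in this script:

past_scores   <- c(1.2, 0.8, 1.5, 1.1)   # hypothetical aligned Past scores
future_scores <- c(1.6, 1.1, 1.9, 1.4)   # hypothetical aligned Future scores
cohen.d(past_scores, future_scores, paired = TRUE)$estimate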
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
-
-# 2. TIME × DOMAIN INTERACTION
-
-# Get simple effects of DOMAIN within each TIME
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE")
-
-# INTERACTION PLOTS
-
-# Define color palettes
-time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F")
-domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F",
-                   "Values" = "#FFB000", "Life" = "#FE6100")
-
-# Define TIME levels (Past, Future order)
-time_levels <- c("Past", "Future")
-
-# ============================================================
-# PLOT 1: TEMPORAL_DO × TIME INTERACTION
-# ============================================================
-
-# Create estimated marginal means for TEMPORAL_DO × TIME
-emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)
-
-# Prepare emmeans data frame
-emmeans_temporal_time <- emm_temporal_time %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    TIME = factor(TIME, levels = time_levels)
-  )
-
-# Prepare raw data for plotting with position offsets
-dodge_width_temporal <- 0.5
-iPlot_temporal <- long_data_clean %>%
-  dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    TIME = factor(TIME, levels = time_levels),
-    x_pos = as.numeric(TEMPORAL_DO),
-    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal,
-    x_dodged = x_pos + time_offset
-  )
-
-emmeans_temporal_time <- emmeans_temporal_time %>%
-  mutate(
-    x_pos = as.numeric(TEMPORAL_DO),
-    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal,
-    x_dodged = x_pos + time_offset
-  )
-
-# Create TEMPORAL_DO × TIME interaction plot
-interaction_plot_temporal <- ggplot() +
-  geom_violin(
-    data = iPlot_temporal,
-    aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)),
-    alpha = 0.4,
-    color = NA,
-    trim = FALSE,
-    scale = "width",
-    width = dodge_width_temporal / 2
-  ) +
-  geom_errorbar(
-    data = emmeans_temporal_time,
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper),
-    width = 0.08,
-    linewidth = 0.8,
-    color = "black"
-  ) +
-  geom_point(
-    data = emmeans_temporal_time,
-    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
-    size = 4,
-    stroke = 1,
-    color = "black"
-  ) +
-  labs(
-    x = "Order",
-    y = "Mean absolute difference from present",
-    title = "TEMPORAL_DO × TIME Interaction"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2),
-    labels = c("Past First", "Future First"),
-    limits = c(0.5, 2.5)
-  ) +
-  scale_fill_manual(name = "TIME", values = time_colors) +
-  scale_shape_manual(name = "TIME", values = c(21, 22)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
-  )
-
-print(interaction_plot_temporal)
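A note on the plotting approach: because the violins and the emmeans markers come from different data frames, the script dodges positions by hand, converting factors to numeric x values and adding a fixed per-TIME offset so both layers land on identical coordinates. The ggplot2-native alternative, position_dodge(), achieves the same within a single layer; a minimal sketch using the same variables as above:

ggplot(iPlot_temporal,
       aes(x = TEMPORAL_DO, y = MEAN_DIFFERENCE, fill = TIME)) +
  geom_violin(position = position_dodge(width = 0.5), alpha = 0.4)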
-
-# ============================================================
-# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis)
-# ============================================================
-
-# Create estimated marginal means for TIME × DOMAIN
-emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN)
-
-# Prepare emmeans data frame
-emmeans_time_domain <- emm_time_domain %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")),
-    TIME = factor(TIME, levels = time_levels)
-  )
-
-# Prepare raw data for plotting with position offsets
-dodge_width_domain <- 0.5
-iPlot_domain <- long_data_clean %>%
-  dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>%
-  mutate(
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")),
-    TIME = factor(TIME, levels = time_levels),
-    x_pos = as.numeric(DOMAIN),
-    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain,
-    x_dodged = x_pos + time_offset
-  )
-
-emmeans_time_domain <- emmeans_time_domain %>%
-  mutate(
-    x_pos = as.numeric(DOMAIN),
-    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain,
-    x_dodged = x_pos + time_offset
-  )
-
-# Create TIME × DOMAIN interaction plot
-interaction_plot_domain <- ggplot() +
-  geom_violin(
-    data = iPlot_domain,
-    aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)),
-    alpha = 0.4,
-    color = NA,
-    trim = FALSE,
-    scale = "width",
-    width = dodge_width_domain / 2
-  ) +
-  geom_errorbar(
-    data = emmeans_time_domain,
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper),
-    width = 0.08,
-    linewidth = 0.8,
-    color = "black"
-  ) +
-  geom_point(
-    data = emmeans_time_domain,
-    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
-    size = 4,
-    stroke = 1,
-    color = "black"
-  ) +
-  labs(
-    x = "Domain",
-    y = "Mean absolute difference from present",
-    title = "TIME × DOMAIN Interaction"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2, 3, 4),
-    labels = c("Preferences", "Personality", "Values", "Life"),
-    limits = c(0.5, 4.5)
-  ) +
-  scale_fill_manual(name = "TIME", values = time_colors) +
-  scale_shape_manual(name = "TIME", values = c(21, 22)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.text.x = element_text(angle = 45, hjust = 1),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
-  )
-
-print(interaction_plot_domain)
diff --git a/.history/eohi1/mixed anova - domain means_20251006155915.r b/.history/eohi1/mixed anova - domain means_20251006155915.r
deleted file mode 100644
index c4ac77e..0000000
--- a/.history/eohi1/mixed anova - domain means_20251006155915.r
+++ /dev/null
@@ -1,704 +0,0 @@
"contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. 
Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- 
quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = 
%.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- 
trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - -# ============================================================ -# PLOT 1: TEMPORAL_DO × TIME INTERACTION -# ============================================================ - -# Create estimated marginal means for TEMPORAL_DO × TIME -emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -# Prepare emmeans data frame -emmeans_temporal_time <- emm_temporal_time %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_temporal <- 0.5 -iPlot_temporal <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal, - x_dodged = x_pos + time_offset - ) - -emmeans_temporal_time <- emmeans_temporal_time %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal, - x_dodged = x_pos + time_offset - ) - -# Create TEMPORAL_DO × TIME interaction plot -interaction_plot_temporal <- ggplot() + - geom_violin( - data = iPlot_temporal, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_temporal / 2 - ) + - geom_errorbar( - data = emmeans_temporal_time, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_temporal_time, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_fill_manual(name = "TIME", values = time_colors) + - scale_shape_manual(name = "TIME", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal) - -# ============================================================ -# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis) -# ============================================================ - -# Create 
diff --git a/.history/eohi1/mixed anova - domain means_20251006155938.r b/.history/eohi1/mixed anova - domain means_20251006155938.r
deleted file mode 100644
index c4ac77e..0000000
--- a/.history/eohi1/mixed anova - domain means_20251006155938.r
+++ /dev/null
@@ -1,704 +0,0 @@
"contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. 
Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- 
quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = 
%.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- 
trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - -# ============================================================ -# PLOT 1: TEMPORAL_DO × TIME INTERACTION -# ============================================================ - -# Create estimated marginal means for TEMPORAL_DO × TIME -emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -# Prepare emmeans data frame -emmeans_temporal_time <- emm_temporal_time %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_temporal <- 0.5 -iPlot_temporal <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal, - x_dodged = x_pos + time_offset - ) - -emmeans_temporal_time <- emmeans_temporal_time %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal, - x_dodged = x_pos + time_offset - ) - -# Create TEMPORAL_DO × TIME interaction plot -interaction_plot_temporal <- ggplot() + - geom_violin( - data = iPlot_temporal, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_temporal / 2 - ) + - geom_errorbar( - data = emmeans_temporal_time, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_temporal_time, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_fill_manual(name = "TIME", values = time_colors) + - scale_shape_manual(name = "TIME", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal) - -# ============================================================ -# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis) -# ============================================================ - -# Create 
estimated marginal means for TIME × DOMAIN -emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_time_domain <- emm_time_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_domain <- 0.5 -iPlot_domain <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain <- emmeans_time_domain %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot -interaction_plot_domain <- ggplot() + - geom_violin( - data = iPlot_domain, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_domain / 2 - ) + - geom_errorbar( - data = emmeans_time_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_time_domain, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_fill_manual(name = "TIME", values = time_colors) + - scale_shape_manual(name = "TIME", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_domain) diff --git a/.history/eohi1/mixed anova - domain means_20251006162736.r b/.history/eohi1/mixed anova - domain means_20251006162736.r deleted file mode 100644 index 7305661..0000000 --- a/.history/eohi1/mixed anova - domain means_20251006162736.r +++ /dev/null @@ -1,763 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", 
"contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. 
Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- 
quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = 
%.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- 
trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
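# Minimal sketch of the contrast-label parsing relied on above: pairs() labels
# each contrast as "level1 - level2", so strsplit() plus trimws() recover the
# two levels. The label below is a hypothetical example.
contrast_name  <- "Preferences - Personality"
contrast_parts <- strsplit(contrast_name, " - ")[[1]]
level1 <- trimws(contrast_parts[1])   # "Preferences"
level2 <- trimws(contrast_parts[2])   # "Personality"
# Caveat: this assumes no factor level itself contains " - "; such a level
# would be split incorrectly.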
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - -# ============================================================ -# PLOT 1: TEMPORAL_DO × TIME INTERACTION -# ============================================================ - -# Create estimated marginal means for TEMPORAL_DO × TIME -emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -# Prepare emmeans data frame -emmeans_temporal_time <- emm_temporal_time %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_temporal <- 0.5 -iPlot_temporal <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal, - x_dodged = x_pos + time_offset - ) - -emmeans_temporal_time <- emmeans_temporal_time %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal, - x_dodged = x_pos + time_offset - ) - -# Create TEMPORAL_DO × TIME interaction plot -interaction_plot_temporal <- ggplot() + - geom_violin( - data = iPlot_temporal, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_temporal / 2 - ) + - geom_errorbar( - data = emmeans_temporal_time, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_temporal_time, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_fill_manual(name = "TIME", values = time_colors) + - scale_shape_manual(name = "TIME", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal) - -# ============================================================ -# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis) -# ============================================================ - -# Create 
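# Optional: if the figure should be written to disk rather than only printed,
# ggsave() can export a named plot object; the filename and dimensions below
# are examples only.
# ggsave("interaction_plot_temporal.png", plot = interaction_plot_temporal,
#        width = 8, height = 5, dpi = 300)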
estimated marginal means for TIME × DOMAIN -emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_time_domain <- emm_time_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_domain <- 0.5 -iPlot_domain <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain <- emmeans_time_domain %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot -interaction_plot_domain <- ggplot() + - geom_violin( - data = iPlot_domain, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_domain / 2 - ) + - geom_errorbar( - data = emmeans_time_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_time_domain, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_fill_manual(name = "TIME", values = time_colors) + - scale_shape_manual(name = "TIME", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_domain) - -# ============================================================ -# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only) -# ============================================================ - -# Reuse emmeans data from Plot 1 -emmeans_temporal_time_simple <- emmeans_temporal_time %>% - mutate( - TEMPORAL_DO_label = factor( - TEMPORAL_DO, - levels = c("01PAST", "02FUT"), - labels = c("Past First", "Future First") - ), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.3, - x_dodged = x_pos + time_offset - ) - -# Create simple emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 
1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "TIME", values = time_colors) + - scale_fill_manual(name = "TIME", values = time_colors) + - scale_shape_manual(name = "TIME", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) diff --git a/.history/eohi1/mixed anova - domain means_20251006162743.r b/.history/eohi1/mixed anova - domain means_20251006162743.r deleted file mode 100644 index 7305661..0000000 --- a/.history/eohi1/mixed anova - domain means_20251006162743.r +++ /dev/null @@ -1,763 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = 
round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
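# Worked example of the Tukey fences computed in the outlier check above
# (hypothetical quartiles):
q1 <- 0.50; q3 <- 1.10
iqr <- q3 - q1                   # 0.60
lower_bound <- q1 - 1.5 * iqr    # 0.50 - 0.90 = -0.40
upper_bound <- q3 + 1.5 * iqr    # 1.10 + 0.90 =  2.00
# values outside [-0.40, 2.00] would be counted as outliers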
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For 
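# Self-contained toy illustration of the bootstrap critical value computed by
# bootstrap_hartley_critical() above: resample each group with replacement,
# recompute the max/min variance ratio, and take the 95th percentile of the
# resampled ratios as the data-driven critical value. Group sizes and SDs
# here are hypothetical.
set.seed(1)
g1 <- rnorm(50, sd = 1.0)
g2 <- rnorm(50, sd = 1.3)
observed <- max(var(g1), var(g2)) / min(var(g1), var(g2))
boot_ratios <- replicate(1000, {
  v1 <- var(sample(g1, replace = TRUE))
  v2 <- var(sample(g2, replace = TRUE))
  max(v1, v2) / min(v1, v2)
})
critical_95 <- quantile(boot_ratios, 0.95)
observed > critical_95   # TRUE would flag heterogeneous variances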
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - 
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - -# ============================================================ -# PLOT 1: TEMPORAL_DO × TIME INTERACTION -# ============================================================ - -# Create estimated marginal means for TEMPORAL_DO × TIME -emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -# Prepare emmeans data frame -emmeans_temporal_time <- emm_temporal_time %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_temporal <- 0.5 -iPlot_temporal <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal, - x_dodged = x_pos + time_offset - ) - -emmeans_temporal_time <- emmeans_temporal_time %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal, - x_dodged = x_pos + time_offset - ) - -# Create TEMPORAL_DO × TIME interaction plot -interaction_plot_temporal <- ggplot() + - geom_violin( - data = iPlot_temporal, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_temporal / 2 - ) + - geom_errorbar( - data = emmeans_temporal_time, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_temporal_time, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_fill_manual(name = "TIME", values = time_colors) + - scale_shape_manual(name = "TIME", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal) - -# ============================================================ -# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis) -# ============================================================ - -# Create 
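# Note: the lower.CL / upper.CL columns used for the error bars above are 95%
# confidence limits by default; a different level can be requested when
# summarizing the emmGrid. Sketch assuming the fitted aov_model from above:
# summary(emmeans(aov_model, ~ TEMPORAL_DO * TIME), level = 0.90)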
estimated marginal means for TIME × DOMAIN -emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_time_domain <- emm_time_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_domain <- 0.5 -iPlot_domain <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain <- emmeans_time_domain %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot -interaction_plot_domain <- ggplot() + - geom_violin( - data = iPlot_domain, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_domain / 2 - ) + - geom_errorbar( - data = emmeans_time_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_time_domain, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_fill_manual(name = "TIME", values = time_colors) + - scale_shape_manual(name = "TIME", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_domain) - -# ============================================================ -# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only) -# ============================================================ - -# Reuse emmeans data from Plot 1 -emmeans_temporal_time_simple <- emmeans_temporal_time %>% - mutate( - TEMPORAL_DO_label = factor( - TEMPORAL_DO, - levels = c("01PAST", "02FUT"), - labels = c("Past First", "Future First") - ), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.3, - x_dodged = x_pos + time_offset - ) - -# Create simple emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 
1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "TIME", values = time_colors) + - scale_fill_manual(name = "TIME", values = time_colors) + - scale_shape_manual(name = "TIME", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) diff --git a/.history/eohi1/mixed anova - domain means_20251006162748.r b/.history/eohi1/mixed anova - domain means_20251006162748.r deleted file mode 100644 index 7305661..0000000 --- a/.history/eohi1/mixed anova - domain means_20251006162748.r +++ /dev/null @@ -1,763 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = 
-
-# Overall descriptive statistics by TIME and DOMAIN
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5),
-    min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print(desc_stats)
-
-# Descriptive statistics by between-subjects factors
-desc_stats_by_temporal <- long_data %>%
-  group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print(desc_stats_by_temporal)
-
-# ASSUMPTION TESTING
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(MEAN_DIFFERENCE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print(missing_summary)
-
-# Create clean dataset (long_data is already filtered for NA values)
-long_data_clean <- long_data
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    median = median(MEAN_DIFFERENCE),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test (streamlined)
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic,
-    ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value,
-    .groups = 'drop'
-  )
-
-print(normality_results)
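The outlier step above only counts Tukey-fence violations per cell; a sketch (not in the original script) to list the flagged rows themselves, using the same 1.5 × IQR rule on the same data:

outlier_rows <- long_data_clean %>%
  group_by(TIME, DOMAIN) %>%
  mutate(
    q1 = quantile(MEAN_DIFFERENCE, 0.25),
    q3 = quantile(MEAN_DIFFERENCE, 0.75),
    iqr = q3 - q1
  ) %>%
  ungroup() %>%
  filter(MEAN_DIFFERENCE < q1 - 1.5 * iqr | MEAN_DIFFERENCE > q3 + 1.5 * iqr) %>%
  select(pID, TIME, DOMAIN, MEAN_DIFFERENCE)
print(outlier_rows)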
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print(homogeneity_domain)
-
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# For mixed ANOVA: test homogeneity across the BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-print(unique(long_data_clean$TEMPORAL_DO))
-print(table(long_data_clean$TEMPORAL_DO))
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # Calculate F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# More efficient bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!rlang::sym(group_var)) %>%
-    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Extract each group's response vector once
-  # ([[ indexing keeps this working for both data.frames and the tibble that
-  #  pick() supplies below, where data[rows, col] would return a one-column tibble)
-  group_data_list <- map(groups, ~ {
-    group_data <- data[[response_var]][data[[group_var]] == .x]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
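A minimal self-contained run of the bootstrap routine on simulated data, added here only to show the expected input shape and return value (the names toy, grp, and y are illustrative, not from the study):

set.seed(42)
toy <- data.frame(
  grp = rep(c("a", "b"), each = 50),
  y   = c(rnorm(50, sd = 1.0), rnorm(50, sd = 1.5))
)
bootstrap_hartley_critical(toy, "grp", "y", n_iter = 500)
# returns a list: observed_ratio, critical_95 (bootstrap 95th percentile),
# and n_valid_iterations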
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-set.seed(123)  # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# MIXED ANOVA ANALYSIS
-
-# Check for complete cases
-complete_cases <- sum(complete.cases(long_data_clean))
-print(complete_cases)
-
-# Check if design is balanced (every participant must fill every cell exactly once)
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance == 1)) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN combinations")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = MEAN_DIFFERENCE,
-                             wid = pID,
-                             between = TEMPORAL_DO,
-                             within = .(TIME, DOMAIN),
-                             type = 3,
-                             detailed = TRUE)
-
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL  # Reset row numbers to be sequential
-print(anova_output)
-
-# Show Mauchly's test for sphericity
-print(mixed_anova_model$Mauchly)
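One defensive check worth making explicit (an added sketch, not in the original script): Type III sums of squares are only meaningful under sum-to-zero contrasts, which the options(contrasts = ...) call near the top sets globally. If the script is sourced into a session where that option was reset, the type = 3 results would silently shift.

# Guard: ezANOVA's type = 3 assumes sum-to-zero contrasts are in effect
stopifnot(identical(getOption("contrasts")[[1]], "contr.sum"))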
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  # Between-subjects effects (no sphericity corrections needed)
-  between_effects <- c("TEMPORAL_DO")
-  for(effect in between_effects) {
-    if(effect %in% anova_table$Effect) {
-      f_value <- anova_table$F[anova_table$Effect == effect]
-      dfn <- anova_table$DFn[anova_table$Effect == effect]
-      dfd <- anova_table$DFd[anova_table$Effect == effect]
-      p_value <- anova_table$p[anova_table$Effect == effect]
-
-      cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value))
-    }
-  }
-
-  # Within-subjects effects (sphericity corrections where applicable)
-
-  # TIME main effect (2 levels, sphericity automatically satisfied)
-  if("TIME" %in% anova_table$Effect) {
-    f_value <- anova_table$F[anova_table$Effect == "TIME"]
-    dfn <- anova_table$DFn[anova_table$Effect == "TIME"]
-    dfd <- anova_table$DFd[anova_table$Effect == "TIME"]
-    p_value <- anova_table$p[anova_table$Effect == "TIME"]
-    cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value))
-  }
-
-  # DOMAIN main effect (4 levels; uncorrected here, corrected p-values printed below)
-  if("DOMAIN" %in% anova_table$Effect) {
-    f_value <- anova_table$F[anova_table$Effect == "DOMAIN"]
-    dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"]
-    dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"]
-    p_value <- anova_table$p[anova_table$Effect == "DOMAIN"]
-    cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value))
-  }
-
-  # DOMAIN and interaction effects with sphericity corrections
-  for(i in seq_len(nrow(corrected_df))) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original:     F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  cat("\nNote: Sphericity corrections not needed (sphericity assumption met)\n")
-}
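For reference, the corrections above simply scale both degrees of freedom by epsilon before re-evaluating the F tail; a small sketch that reproduces a corrected p-value by hand (the example numbers in the comment are placeholders, not results from this dataset):

gg_p_by_hand <- function(f_value, df1, df2, epsilon) {
  # Greenhouse-Geisser: shrink numerator and denominator df by epsilon,
  # then take the upper tail of the F distribution
  pf(f_value, epsilon * df1, epsilon * df2, lower.tail = FALSE)
}
# e.g. gg_p_by_hand(f_value = 12.3, df1 = 3, df2 = 850, epsilon = 0.92)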
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
-
-# 2. TIME × DOMAIN INTERACTION
-
-# Get simple effects of DOMAIN within each TIME
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE")
-
-# INTERACTION PLOTS
-
-# Define color palettes
-time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F")
-domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F",
-                   "Values" = "#FFB000", "Life" = "#FE6100")
-
-# Define TIME levels (Past, Future order)
-time_levels <- c("Past", "Future")
-
-# ============================================================
-# PLOT 1: TEMPORAL_DO × TIME INTERACTION
-# ============================================================
-
-# Create estimated marginal means for TEMPORAL_DO × TIME
-emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)
-
-# Prepare emmeans data frame
-emmeans_temporal_time <- emm_temporal_time %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    TIME = factor(TIME, levels = time_levels)
-  )
-
-# Prepare raw data for plotting with position offsets
-dodge_width_temporal <- 0.5
-iPlot_temporal <- long_data_clean %>%
-  dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    TIME = factor(TIME, levels = time_levels),
-    x_pos = as.numeric(TEMPORAL_DO),
-    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal,
-    x_dodged = x_pos + time_offset
-  )
-
-emmeans_temporal_time <- emmeans_temporal_time %>%
-  mutate(
-    x_pos = as.numeric(TEMPORAL_DO),
-    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal,
-    x_dodged = x_pos + time_offset
-  )
-
-# Create TEMPORAL_DO × TIME interaction plot
-interaction_plot_temporal <- ggplot() +
-  geom_violin(
-    data = iPlot_temporal,
-    aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)),
-    alpha = 0.4,
-    color = NA,
-    trim = FALSE,
-    scale = "width",
-    width = dodge_width_temporal / 2
-  ) +
-  geom_errorbar(
-    data = emmeans_temporal_time,
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper),
-    width = 0.08,
-    linewidth = 0.8,
-    color = "black"
-  ) +
-  geom_point(
-    data = emmeans_temporal_time,
-    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
-    size = 4,
-    stroke = 1,
-    color = "black"
-  ) +
-  labs(
-    x = "Order",
-    y = "Mean absolute difference from present",
-    title = "TEMPORAL_DO × TIME Interaction"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2),
-    labels = c("Past First", "Future First"),
-    limits = c(0.5, 2.5)
-  ) +
-  scale_fill_manual(name = "TIME", values = time_colors) +
-  scale_shape_manual(name = "TIME", values = c(21, 22)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
-  )
-
-print(interaction_plot_temporal)
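The plots are only print()-ed; if image files are wanted, ggsave() is the usual route. A sketch (the file name, size, and dpi are illustrative choices, not taken from the original script):

ggsave("interaction_temporal.png", interaction_plot_temporal,
       width = 7, height = 5, dpi = 300)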
-
-# ============================================================
-# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis)
-# ============================================================
-
-# Create estimated marginal means for TIME × DOMAIN
-emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN)
-
-# Prepare emmeans data frame
-emmeans_time_domain <- emm_time_domain %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")),
-    TIME = factor(TIME, levels = time_levels)
-  )
-
-# Prepare raw data for plotting with position offsets
-dodge_width_domain <- 0.5
-iPlot_domain <- long_data_clean %>%
-  dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>%
-  mutate(
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")),
-    TIME = factor(TIME, levels = time_levels),
-    x_pos = as.numeric(DOMAIN),
-    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain,
-    x_dodged = x_pos + time_offset
-  )
-
-emmeans_time_domain <- emmeans_time_domain %>%
-  mutate(
-    x_pos = as.numeric(DOMAIN),
-    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain,
-    x_dodged = x_pos + time_offset
-  )
-
-# Create TIME × DOMAIN interaction plot
-interaction_plot_domain <- ggplot() +
-  geom_violin(
-    data = iPlot_domain,
-    aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)),
-    alpha = 0.4,
-    color = NA,
-    trim = FALSE,
-    scale = "width",
-    width = dodge_width_domain / 2
-  ) +
-  geom_errorbar(
-    data = emmeans_time_domain,
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper),
-    width = 0.08,
-    linewidth = 0.8,
-    color = "black"
-  ) +
-  geom_point(
-    data = emmeans_time_domain,
-    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
-    size = 4,
-    stroke = 1,
-    color = "black"
-  ) +
-  labs(
-    x = "Domain",
-    y = "Mean absolute difference from present",
-    title = "TIME × DOMAIN Interaction"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2, 3, 4),
-    labels = c("Preferences", "Personality", "Values", "Life"),
-    limits = c(0.5, 4.5)
-  ) +
-  scale_fill_manual(name = "TIME", values = time_colors) +
-  scale_shape_manual(name = "TIME", values = c(21, 22)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.text.x = element_text(angle = 45, hjust = 1),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
-  )
-
-print(interaction_plot_domain)
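The manual x_pos/time_offset arithmetic above guarantees that violins and emmeans markers share x positions. For comparison, a sketch of the same layout using a discrete axis and position_dodge(), assuming the iPlot_domain and emmeans_time_domain frames built above; the hand-rolled version mainly buys finer control over violin width:

ggplot(iPlot_domain, aes(x = DOMAIN, y = MEAN_DIFFERENCE, fill = TIME)) +
  geom_violin(alpha = 0.4, color = NA, position = position_dodge(width = 0.9)) +
  geom_point(data = emmeans_time_domain,
             aes(x = DOMAIN, y = plot_mean, group = TIME),
             position = position_dodge(width = 0.9),
             shape = 21, size = 4, color = "black")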
-
-# ============================================================
-# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only)
-# ============================================================
-
-# Reuse emmeans data from Plot 1
-emmeans_temporal_time_simple <- emmeans_temporal_time %>%
-  mutate(
-    TEMPORAL_DO_label = factor(
-      TEMPORAL_DO,
-      levels = c("01PAST", "02FUT"),
-      labels = c("Past First", "Future First")
-    ),
-    x_pos = as.numeric(TEMPORAL_DO),
-    time_offset = (as.numeric(TIME) - 1.5) * 0.3,
-    x_dodged = x_pos + time_offset
-  )
-
-# Create simple emmeans-only plot
-interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) +
-  geom_errorbar(
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME),
-    width = 0.1,
-    linewidth = 1,
-    alpha = 0.8
-  ) +
-  geom_point(
-    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
-    size = 5,
-    stroke = 1.2,
-    color = "black"
-  ) +
-  labs(
-    x = "Order",
-    y = "Mean absolute difference from present",
-    title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2),
-    labels = c("Past First", "Future First"),
-    limits = c(0.5, 2.5)
-  ) +
-  scale_color_manual(name = "TIME", values = time_colors) +
-  scale_fill_manual(name = "TIME", values = time_colors) +
-  scale_shape_manual(name = "TIME", values = c(21, 22)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
-  )
-
-print(interaction_plot_emmeans_only)
diff --git a/.history/eohi1/mixed anova - domain means_20251006162833.r b/.history/eohi1/mixed anova - domain means_20251006162833.r
deleted file mode 100644
index da083dc..0000000
--- a/.history/eohi1/mixed anova - domain means_20251006162833.r
+++ /dev/null
round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For 
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - 
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - -# ============================================================ -# PLOT 1: TEMPORAL_DO × TIME INTERACTION -# ============================================================ - -# Create estimated marginal means for TEMPORAL_DO × TIME -emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -# Prepare emmeans data frame -emmeans_temporal_time <- emm_temporal_time %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_temporal <- 0.5 -iPlot_temporal <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal, - x_dodged = x_pos + time_offset - ) - -emmeans_temporal_time <- emmeans_temporal_time %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal, - x_dodged = x_pos + time_offset - ) - -# Create TEMPORAL_DO × TIME interaction plot -interaction_plot_temporal <- ggplot() + - geom_violin( - data = iPlot_temporal, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_temporal / 2 - ) + - geom_errorbar( - data = emmeans_temporal_time, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_temporal_time, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_fill_manual(name = "TIME", values = time_colors) + - scale_shape_manual(name = "TIME", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal) - -# ============================================================ -# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis) -# ============================================================ - -# Create 
estimated marginal means for TIME × DOMAIN -emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_time_domain <- emm_time_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_domain <- 0.5 -iPlot_domain <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain <- emmeans_time_domain %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot -interaction_plot_domain <- ggplot() + - geom_violin( - data = iPlot_domain, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_domain / 2 - ) + - geom_errorbar( - data = emmeans_time_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_time_domain, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_fill_manual(name = "TIME", values = time_colors) + - scale_shape_manual(name = "TIME", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_domain) - -# ============================================================ -# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only) -# ============================================================ - -# Create fresh emmeans data for Plot 3 -emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.3, - x_dodged = x_pos + time_offset - ) - -# Create simple emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) + 
- geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "TIME", values = time_colors) + - scale_fill_manual(name = "TIME", values = time_colors) + - scale_shape_manual(name = "TIME", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) diff --git a/.history/eohi1/mixed anova - domain means_20251006162839.r b/.history/eohi1/mixed anova - domain means_20251006162839.r deleted file mode 100644 index da083dc..0000000 --- a/.history/eohi1/mixed anova - domain means_20251006162839.r +++ /dev/null @@ -1,769 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data 
%>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For 
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# MIXED ANOVA ANALYSIS
-
-# Check for complete cases
-complete_cases <- sum(complete.cases(long_data_clean))
-print(complete_cases)
-
-# Check if design is balanced (this verifies at most one observation
-# per participant per TIME × DOMAIN cell; a 0 count means a missing cell)
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance %in% c(0, 1))) {
-  print("Design check passed: at most one observation per participant per TIME × DOMAIN cell")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = MEAN_DIFFERENCE,
-                             wid = pID,
-                             between = TEMPORAL_DO,
-                             within = .(TIME, DOMAIN),
-                             type = 3,
-                             detailed = TRUE)
-
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL # Reset row numbers to be sequential
-print(anova_output)
-
-# Show Mauchly's test for sphericity
-print(mixed_anova_model$Mauchly)
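Before reading the corrected table produced below, it may help to see what an epsilon correction does to the degrees of freedom: both df are multiplied by epsilon and the F statistic is re-evaluated against the shrunken df. A hand-worked sketch with assumed numbers (the epsilon and n here are hypothetical; the real values come from the ezANOVA output):

# Hand-worked Greenhouse-Geisser df correction, with an assumed epsilon
k <- 4                     # DOMAIN has 4 levels
n <- 30                    # hypothetical number of subjects, for illustration
dfn <- k - 1               # 3
dfd <- (k - 1) * (n - 1)   # 87
gg_e <- 0.85               # assumed epsilon, illustration only
c(GG_DFn = dfn * gg_e, GG_DFd = dfd * gg_e)  # corrected df used to re-test F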
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  # Between-subjects effects (no sphericity corrections needed)
-  between_effects <- c("TEMPORAL_DO")
-  for(effect in between_effects) {
-    if(effect %in% anova_table$Effect) {
-      f_value <- anova_table$F[anova_table$Effect == effect]
-      dfn <- anova_table$DFn[anova_table$Effect == effect]
-      dfd <- anova_table$DFd[anova_table$Effect == effect]
-      p_value <- anova_table$p[anova_table$Effect == effect]
-
-      cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value))
-    }
-  }
-
-  # Within-subjects effects (sphericity corrections where applicable)
-
-  # TIME main effect (2 levels, sphericity automatically satisfied)
-  if("TIME" %in% anova_table$Effect) {
-    f_value <- anova_table$F[anova_table$Effect == "TIME"]
-    dfn <- anova_table$DFn[anova_table$Effect == "TIME"]
-    dfd <- anova_table$DFd[anova_table$Effect == "TIME"]
-    p_value <- anova_table$p[anova_table$Effect == "TIME"]
-    cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value))
-  }
-
-  # DOMAIN main effect (4 levels, needs sphericity correction)
-  if("DOMAIN" %in% anova_table$Effect) {
-    f_value <- anova_table$F[anova_table$Effect == "DOMAIN"]
-    dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"]
-    dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"]
-    p_value <- anova_table$p[anova_table$Effect == "DOMAIN"]
-    cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value))
-  }
-
-  # Interactions with sphericity corrections
-  for(i in seq_len(nrow(corrected_df))) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original:     F(%d, %d) = %.3f\n",
-        corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-        corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-        corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  # cat() rather than print() so the leading newline is interpreted
-  cat("\nNote: Sphericity corrections not needed (sphericity assumption met)\n")
-}
-
-# COHEN'S D FOR MAIN EFFECTS
-
-# Create aov model for emmeans
-aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                 data = long_data_clean)
-
-# Main Effect of TIME
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print(time_emmeans)
-
-time_main_contrast <- pairs(time_emmeans, adjust = "bonferroni")
-
-time_main_df <- as.data.frame(time_main_contrast)
-print(time_main_df)
-
-# Calculate Cohen's d for TIME main effect
-# Note: cohen.d() below treats the Past and Future scores as independent
-# samples, even though TIME is a within-subjects factor
-if(nrow(time_main_df) > 0) {
-  cat("\nCohen's d for TIME main effect:\n")
-  time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"]
-  time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"]
-
-  time_cohens_d <- cohen.d(time_past_data, time_future_data)
-  cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data)))
-  cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate))
-  cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude))
-  cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1]))
-}
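Since TIME is within-subjects, a paired standardized difference may be the more defensible effect size; effsize::cohen.d supports this via paired = TRUE. A hedged sketch, assuming each participant's Past and Future domain means can be averaged and matched by pID (the averaging step is an assumption about how one would collapse the four domains):

# Hedged alternative: paired Cohen's d for the within-subjects TIME contrast
past_by_id   <- tapply(long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"],
                       droplevels(long_data_clean$pID[long_data_clean$TIME == "Past"]), mean)
future_by_id <- tapply(long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"],
                       droplevels(long_data_clean$pID[long_data_clean$TIME == "Future"]), mean)
ids <- intersect(names(past_by_id), names(future_by_id))  # align participants
cohen.d(past_by_id[ids], future_by_id[ids], paired = TRUE)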
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
-
-# 2. TIME × DOMAIN INTERACTION
-
-# Get simple effects of DOMAIN within each TIME
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE")
-
-# INTERACTION PLOTS
-
-# Define color palettes
-time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F")
-domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F",
-                   "Values" = "#FFB000", "Life" = "#FE6100")
-
-# Define TIME levels (Past, Future order)
-time_levels <- c("Past", "Future")
-
-# ============================================================
-# PLOT 1: TEMPORAL_DO × TIME INTERACTION
-# ============================================================
-
-# Create estimated marginal means for TEMPORAL_DO × TIME
-emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)
-
-# Prepare emmeans data frame
-emmeans_temporal_time <- emm_temporal_time %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    TIME = factor(TIME, levels = time_levels)
-  )
-
-# Prepare raw data for plotting with position offsets
-dodge_width_temporal <- 0.5
-iPlot_temporal <- long_data_clean %>%
-  dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    TIME = factor(TIME, levels = time_levels),
-    x_pos = as.numeric(TEMPORAL_DO),
-    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal,
-    x_dodged = x_pos + time_offset
-  )
-
-emmeans_temporal_time <- emmeans_temporal_time %>%
-  mutate(
-    x_pos = as.numeric(TEMPORAL_DO),
-    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal,
-    x_dodged = x_pos + time_offset
-  )
-
-# Create TEMPORAL_DO × TIME interaction plot
-interaction_plot_temporal <- ggplot() +
-  geom_violin(
-    data = iPlot_temporal,
-    aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)),
-    alpha = 0.4,
-    color = NA,
-    trim = FALSE,
-    scale = "width",
-    width = dodge_width_temporal / 2
-  ) +
-  geom_errorbar(
-    data = emmeans_temporal_time,
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper),
-    width = 0.08,
-    linewidth = 0.8,
-    color = "black"
-  ) +
-  geom_point(
-    data = emmeans_temporal_time,
-    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
-    size = 4,
-    stroke = 1,
-    color = "black"
-  ) +
-  labs(
-    x = "Order",
-    y = "Mean absolute difference from present",
-    title = "TEMPORAL_DO × TIME Interaction"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2),
-    labels = c("Past First", "Future First"),
-    limits = c(0.5, 2.5)
-  ) +
-  scale_fill_manual(name = "TIME", values = time_colors) +
-  scale_shape_manual(name = "TIME", values = c(21, 22)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
-  )
-
-print(interaction_plot_temporal)
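None of the plots in this script are written to disk; if image files are wanted, ggsave can export the object just printed. The file name and dimensions below are arbitrary placeholders:

# Optional export of the plot above; path and size are illustrative only
ggsave("interaction_temporal_time.png", plot = interaction_plot_temporal,
       width = 7, height = 5, dpi = 300)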
-
-# ============================================================
-# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis)
-# ============================================================
-
-# Create estimated marginal means for TIME × DOMAIN
-emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN)
-
-# Prepare emmeans data frame
-emmeans_time_domain <- emm_time_domain %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")),
-    TIME = factor(TIME, levels = time_levels)
-  )
-
-# Prepare raw data for plotting with position offsets
-dodge_width_domain <- 0.5
-iPlot_domain <- long_data_clean %>%
-  dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>%
-  mutate(
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")),
-    TIME = factor(TIME, levels = time_levels),
-    x_pos = as.numeric(DOMAIN),
-    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain,
-    x_dodged = x_pos + time_offset
-  )
-
-emmeans_time_domain <- emmeans_time_domain %>%
-  mutate(
-    x_pos = as.numeric(DOMAIN),
-    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain,
-    x_dodged = x_pos + time_offset
-  )
-
-# Create TIME × DOMAIN interaction plot
-interaction_plot_domain <- ggplot() +
-  geom_violin(
-    data = iPlot_domain,
-    aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)),
-    alpha = 0.4,
-    color = NA,
-    trim = FALSE,
-    scale = "width",
-    width = dodge_width_domain / 2
-  ) +
-  geom_errorbar(
-    data = emmeans_time_domain,
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper),
-    width = 0.08,
-    linewidth = 0.8,
-    color = "black"
-  ) +
-  geom_point(
-    data = emmeans_time_domain,
-    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
-    size = 4,
-    stroke = 1,
-    color = "black"
-  ) +
-  labs(
-    x = "Domain",
-    y = "Mean absolute difference from present",
-    title = "TIME × DOMAIN Interaction"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2, 3, 4),
-    labels = c("Preferences", "Personality", "Values", "Life"),
-    limits = c(0.5, 4.5)
-  ) +
-  scale_fill_manual(name = "TIME", values = time_colors) +
-  scale_shape_manual(name = "TIME", values = c(21, 22)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.text.x = element_text(angle = 45, hjust = 1),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
-  )
-
-print(interaction_plot_domain)
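The manual x_pos/time_offset/x_dodged arithmetic above reproduces by hand what ggplot2's position_dodge does automatically; doing it manually lets the violins and the emmeans layers share exact x coordinates. For comparison, a minimal sketch of the more idiomatic dodged form (this changes nothing in the script; it is shown only as an alternative design):

# Hedged alternative: let ggplot2 dodge by TIME instead of computing x_dodged by hand
ggplot(iPlot_domain, aes(x = DOMAIN, y = MEAN_DIFFERENCE, fill = TIME)) +
  geom_violin(position = position_dodge(width = 0.5), alpha = 0.4, color = NA)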
-
-# ============================================================
-# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only)
-# ============================================================
-
-# Create fresh emmeans data for Plot 3
-emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)
-
-emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    TIME = factor(TIME, levels = time_levels),
-    x_pos = as.numeric(TEMPORAL_DO),
-    time_offset = (as.numeric(TIME) - 1.5) * 0.3,
-    x_dodged = x_pos + time_offset
-  )
-
-# Create simple emmeans-only plot
-interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) +
-  geom_errorbar(
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME),
-    width = 0.1,
-    linewidth = 1,
-    alpha = 0.8
-  ) +
-  geom_point(
-    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
-    size = 5,
-    stroke = 1.2,
-    color = "black"
-  ) +
-  labs(
-    x = "Order",
-    y = "Mean absolute difference from present",
-    title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2),
-    labels = c("Past First", "Future First"),
-    limits = c(0.5, 2.5)
-  ) +
-  scale_color_manual(name = "TIME", values = time_colors) +
-  scale_fill_manual(name = "TIME", values = time_colors) +
-  scale_shape_manual(name = "TIME", values = c(21, 22)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
-  )
-
-print(interaction_plot_emmeans_only)
diff --git a/.history/eohi1/mixed anova - domain means_20251006162940.r b/.history/eohi1/mixed anova - domain means_20251006162940.r
deleted file mode 100644
index da083dc..0000000
--- a/.history/eohi1/mixed anova - domain means_20251006162940.r
+++ /dev/null
@@ -1,769 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-library(tidyverse)
-library(ez)
-library(car)
-library(nortest)    # For normality tests
-library(emmeans)    # For post-hoc comparisons
-library(purrr)      # For map functions
-library(effsize)    # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-library(ggplot2)    # For plotting
-
-options(scipen = 999)
-
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
-
-# Define domain mapping
-domain_mapping <- data.frame(
-  variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-               "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
-  time = c(rep("Past", 4), rep("Future", 4)),
-  domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
-  stringsAsFactors = FALSE
-)
-
-long_data <- data %>%
-  select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>%
-  pivot_longer(
-    cols = all_of(required_vars),
-    names_to = "variable",
-    values_to = "MEAN_DIFFERENCE"
-  ) %>%
-  left_join(domain_mapping, by = "variable") %>%
-  # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping)
-  mutate(
-    TIME = factor(time, levels = c("Past", "Future")),
-    DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")),
-    pID = as.factor(pID),
-    TEMPORAL_DO = as.factor(TEMPORAL_DO)
-  ) %>%
-  # Select final columns and remove any rows with missing values
-  select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>%
-  filter(!is.na(MEAN_DIFFERENCE))
-
-# Overall descriptive statistics by TIME and DOMAIN
-desc_stats <- long_data
%>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For 
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - 
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - -# ============================================================ -# PLOT 1: TEMPORAL_DO × TIME INTERACTION -# ============================================================ - -# Create estimated marginal means for TEMPORAL_DO × TIME -emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -# Prepare emmeans data frame -emmeans_temporal_time <- emm_temporal_time %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_temporal <- 0.5 -iPlot_temporal <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal, - x_dodged = x_pos + time_offset - ) - -emmeans_temporal_time <- emmeans_temporal_time %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal, - x_dodged = x_pos + time_offset - ) - -# Create TEMPORAL_DO × TIME interaction plot -interaction_plot_temporal <- ggplot() + - geom_violin( - data = iPlot_temporal, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_temporal / 2 - ) + - geom_errorbar( - data = emmeans_temporal_time, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_temporal_time, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_fill_manual(name = "TIME", values = time_colors) + - scale_shape_manual(name = "TIME", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal) - -# ============================================================ -# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis) -# ============================================================ - -# Create 
estimated marginal means for TIME × DOMAIN -emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_time_domain <- emm_time_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_domain <- 0.5 -iPlot_domain <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain <- emmeans_time_domain %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot -interaction_plot_domain <- ggplot() + - geom_violin( - data = iPlot_domain, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_domain / 2 - ) + - geom_errorbar( - data = emmeans_time_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_time_domain, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_fill_manual(name = "TIME", values = time_colors) + - scale_shape_manual(name = "TIME", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_domain) - -# ============================================================ -# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only) -# ============================================================ - -# Create fresh emmeans data for Plot 3 -emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.3, - x_dodged = x_pos + time_offset - ) - -# Create simple emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) + 
- geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "TIME", values = time_colors) + - scale_fill_manual(name = "TIME", values = time_colors) + - scale_shape_manual(name = "TIME", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) diff --git a/.history/eohi1/mixed anova - domain means_20251006181905.r b/.history/eohi1/mixed anova - domain means_20251006181905.r deleted file mode 100644 index faf5d80..0000000 --- a/.history/eohi1/mixed anova - domain means_20251006181905.r +++ /dev/null @@ -1,769 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data 
%>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For 
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - 
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
-
-# Get simple effects of DOMAIN within each TIME
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE")
-
-# INTERACTION PLOTS
-
-# Define color palettes
-time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F")
-domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F",
-                   "Values" = "#FFB000", "Life" = "#FE6100")
-
-# Define TIME levels (Past, Future order)
-time_levels <- c("Past", "Future")
-
-# ============================================================
-# PLOT 1: TEMPORAL_DO × TIME INTERACTION
-# ============================================================
-
-# Create estimated marginal means for TEMPORAL_DO × TIME
-emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)
-
-# Prepare emmeans data frame
-emmeans_temporal_time <- emm_temporal_time %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    TIME = factor(TIME, levels = time_levels)
-  )
-
-# Prepare raw data for plotting with position offsets
-dodge_width_temporal <- 0.5
-iPlot_temporal <- long_data_clean %>%
-  dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    TIME = factor(TIME, levels = time_levels),
-    x_pos = as.numeric(TEMPORAL_DO),
-    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal,
-    x_dodged = x_pos + time_offset
-  )
-
-emmeans_temporal_time <- emmeans_temporal_time %>%
-  mutate(
-    x_pos = as.numeric(TEMPORAL_DO),
-    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal,
-    x_dodged = x_pos + time_offset
-  )
-
-# Create TEMPORAL_DO × TIME interaction plot
-interaction_plot_temporal <- ggplot() +
-  geom_violin(
-    data = iPlot_temporal,
-    aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)),
-    alpha = 0.4,
-    color = NA,
-    trim = FALSE,
-    scale = "width",
-    width = dodge_width_temporal / 2
-  ) +
-  geom_errorbar(
-    data = emmeans_temporal_time,
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper),
-    width = 0.08,
-    linewidth = 0.8,
-    color = "black"
-  ) +
-  geom_point(
-    data = emmeans_temporal_time,
-    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
-    size = 4,
-    stroke = 1,
-    color = "black"
-  ) +
-  labs(
-    x = "Order",
-    y = "Mean absolute difference from present",
-    title = "TEMPORAL_DO × TIME Interaction"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2),
-    labels = c("Past First", "Future First"),
-    limits = c(0.5, 2.5)
-  ) +
-  scale_fill_manual(name = "Temporal Direction", values = time_colors) +
-  scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
-  )
-
-print(interaction_plot_temporal)
-
-# ============================================================
-# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis)
-# ============================================================
-
-# Create estimated marginal means for TIME × DOMAIN
-emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN)
-
-# Prepare emmeans data frame
-emmeans_time_domain <- emm_time_domain %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")),
-    TIME = factor(TIME, levels = time_levels)
-  )
-
-# Prepare raw data for plotting with position offsets
-dodge_width_domain <- 0.5
-iPlot_domain <- long_data_clean %>%
-  dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>%
-  mutate(
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")),
-    TIME = factor(TIME, levels = time_levels),
-    x_pos = as.numeric(DOMAIN),
-    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain,
-    x_dodged = x_pos + time_offset
-  )
-
-emmeans_time_domain <- emmeans_time_domain %>%
-  mutate(
-    x_pos = as.numeric(DOMAIN),
-    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain,
-    x_dodged = x_pos + time_offset
-  )
-
-# Create TIME × DOMAIN interaction plot
-interaction_plot_domain <- ggplot() +
-  geom_violin(
-    data = iPlot_domain,
-    aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)),
-    alpha = 0.4,
-    color = NA,
-    trim = FALSE,
-    scale = "width",
-    width = dodge_width_domain / 2
-  ) +
-  geom_errorbar(
-    data = emmeans_time_domain,
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper),
-    width = 0.08,
-    linewidth = 0.8,
-    color = "black"
-  ) +
-  geom_point(
-    data = emmeans_time_domain,
-    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
-    size = 4,
-    stroke = 1,
-    color = "black"
-  ) +
-  labs(
-    x = "Domain",
-    y = "Mean absolute difference from present",
-    title = "TIME × DOMAIN Interaction"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2, 3, 4),
-    labels = c("Preferences", "Personality", "Values", "Life"),
-    limits = c(0.5, 4.5)
-  ) +
-  scale_fill_manual(name = "TIME", values = time_colors) +
-  scale_shape_manual(name = "TIME", values = c(21, 22)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.text.x = element_text(angle = 45, hjust = 1),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
-  )
-
-print(interaction_plot_domain)
-
-# ============================================================
-# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only)
-# ============================================================
-
-# Create fresh emmeans data for Plot 3
-emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)
-
-emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    TIME = factor(TIME, levels = time_levels),
-    x_pos = as.numeric(TEMPORAL_DO),
-    time_offset = (as.numeric(TIME) - 1.5) * 0.3,
-    x_dodged = x_pos + time_offset
-  )
-
-# Create simple emmeans-only plot
-interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) +
-  geom_errorbar(
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME),
-    width = 0.1,
-    linewidth = 1,
-    alpha = 0.8
-  ) +
-  geom_point(
-    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
-    size = 5,
-    stroke = 1.2,
-    color = "black"
-  ) +
-  labs(
-    x = "Order",
-    y = "Mean absolute difference from present",
-    title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2),
-    labels = c("Past First", "Future First"),
-    limits = c(0.5, 2.5)
-  ) +
-  scale_color_manual(name = "TIME", values = time_colors) +
-  scale_fill_manual(name = "TIME", values = time_colors) +
-  scale_shape_manual(name = "TIME", values = c(21, 22)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
-  )
-
-print(interaction_plot_emmeans_only)
diff --git a/.history/eohi1/mixed anova - domain means_20251006181908.r b/.history/eohi1/mixed anova - domain means_20251006181908.r
deleted file mode 100644
index faf5d80..0000000
diff --git a/.history/eohi1/mixed anova - domain means_20251006181914.r b/.history/eohi1/mixed anova - domain means_20251006181914.r
deleted file mode 100644
index b31d839..0000000
Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For 
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - 
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - -# ============================================================ -# PLOT 1: TEMPORAL_DO × TIME INTERACTION -# ============================================================ - -# Create estimated marginal means for TEMPORAL_DO × TIME -emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -# Prepare emmeans data frame -emmeans_temporal_time <- emm_temporal_time %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_temporal <- 0.5 -iPlot_temporal <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal, - x_dodged = x_pos + time_offset - ) - -emmeans_temporal_time <- emmeans_temporal_time %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal, - x_dodged = x_pos + time_offset - ) - -# Create TEMPORAL_DO × TIME interaction plot -interaction_plot_temporal <- ggplot() + - geom_violin( - data = iPlot_temporal, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_temporal / 2 - ) + - geom_errorbar( - data = emmeans_temporal_time, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_temporal_time, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal) - -# ============================================================ -# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis) -# 
============================================================ - -# Create estimated marginal means for TIME × DOMAIN -emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_time_domain <- emm_time_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_domain <- 0.5 -iPlot_domain <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain <- emmeans_time_domain %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot -interaction_plot_domain <- ggplot() + - geom_violin( - data = iPlot_domain, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_domain / 2 - ) + - geom_errorbar( - data = emmeans_time_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_time_domain, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_domain) - -# ============================================================ -# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only) -# ============================================================ - -# Create fresh emmeans data for Plot 3 -emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.3, - x_dodged = x_pos + time_offset - ) - -# 
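(Plot 3, an emmeans-only view, follows.)
-# Aside (optional; the filenames here are placeholders): the rendered plots
-# can be written to disk with ggsave():
-ggsave("interaction_temporal_time.png", interaction_plot_temporal,
-       width = 7, height = 5, dpi = 300)
-ggsave("interaction_time_domain.png", interaction_plot_domain,
-       width = 8, height = 5, dpi = 300)
-# 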
Create simple emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "TIME", values = time_colors) + - scale_fill_manual(name = "TIME", values = time_colors) + - scale_shape_manual(name = "TIME", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) diff --git a/.history/eohi1/mixed anova - domain means_20251006181923.r b/.history/eohi1/mixed anova - domain means_20251006181923.r deleted file mode 100644 index 214d37f..0000000 --- a/.history/eohi1/mixed anova - domain means_20251006181923.r +++ /dev/null @@ -1,769 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - 
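-  # keep one row per pID x TIME x DOMAIN with an observed domain mean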
filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
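Homogeneity of variance is tested next; a defensive variant first.
-# Aside (sketch): the Levene calls below run inside summarise(), which leans
-# on the formula finding MEAN_DIFFERENCE/TIME in the data mask; passing
-# data = explicitly per group is sturdier:
-levene_by_domain <- long_data_clean %>%
-  split(.$DOMAIN) %>%
-  map_dfr(function(d) {
-    lt <- car::leveneTest(MEAN_DIFFERENCE ~ TIME, data = d)
-    tibble(levene_F = lt$`F value`[1], levene_p = lt$`Pr(>F)`[1])
-  }, .id = "DOMAIN")
-print(levene_by_domain)
-# 4. 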
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For 
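reproducibility.
-# Aside (usage sketch): the helper can be tried on a single TIME x DOMAIN
-# cell before mapping it over all cells; as.data.frame() keeps base
-# drop-to-vector indexing inside the function (pick() hands it a tibble,
-# where data[rows, col] stays a one-column tibble):
-one_cell <- long_data_clean %>%
-  filter(TIME == "Past", DOMAIN == "Life") %>%
-  as.data.frame()
-str(bootstrap_hartley_critical(one_cell, "TEMPORAL_DO", "MEAN_DIFFERENCE",
-                               n_iter = 1000))
-# Note: set.seed() above also pins down these bootstrap draws'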
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - 
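-          # report this contrast with its d estimate, magnitude label, and p-value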
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - -# ============================================================ -# PLOT 1: TEMPORAL_DO × TIME INTERACTION -# ============================================================ - -# Create estimated marginal means for TEMPORAL_DO × TIME -emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -# Prepare emmeans data frame -emmeans_temporal_time <- emm_temporal_time %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_temporal <- 0.5 -iPlot_temporal <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal, - x_dodged = x_pos + time_offset - ) - -emmeans_temporal_time <- emmeans_temporal_time %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal, - x_dodged = x_pos + time_offset - ) - -# Create TEMPORAL_DO × TIME interaction plot -interaction_plot_temporal <- ggplot() + - geom_violin( - data = iPlot_temporal, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_temporal / 2 - ) + - geom_errorbar( - data = emmeans_temporal_time, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_temporal_time, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal) - -# ============================================================ -# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis) -# 
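============================================================
-# Aside (sketch): domain_colors is defined above but never used; one place
-# it fits is a per-domain summary of the descriptives computed earlier:
-ggplot(desc_stats, aes(x = DOMAIN, y = mean, fill = DOMAIN)) +
-  geom_col(width = 0.6) +
-  facet_wrap(~ TIME) +
-  scale_fill_manual(values = domain_colors) +
-  theme_minimal(base_size = 13)
-# 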
============================================================ - -# Create estimated marginal means for TIME × DOMAIN -emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_time_domain <- emm_time_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_domain <- 0.5 -iPlot_domain <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain <- emmeans_time_domain %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot -interaction_plot_domain <- ggplot() + - geom_violin( - data = iPlot_domain, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_domain / 2 - ) + - geom_errorbar( - data = emmeans_time_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_time_domain, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_domain) - -# ============================================================ -# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only) -# ============================================================ - -# Create fresh emmeans data for Plot 3 -emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.3, - x_dodged = x_pos + time_offset - ) - -# 
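(Plot 3 follows.)
-# Aside (assumes the patchwork package, which is not loaded above, is
-# installed): the two interaction plots can be placed side by side:
-library(patchwork)
-print(interaction_plot_temporal + interaction_plot_domain +
-        plot_layout(ncol = 2))
-# 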
Create simple emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) diff --git a/.history/eohi1/mixed anova - domain means_20251006181934.r b/.history/eohi1/mixed anova - domain means_20251006181934.r deleted file mode 100644 index 214d37f..0000000 --- a/.history/eohi1/mixed anova - domain means_20251006181934.r +++ /dev/null @@ -1,769 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, 
TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
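Homogeneity of variance comes next; first, a visual normality aside.
-# Aside (sketch): QQ plots per TIME x DOMAIN cell complement the
-# Anderson-Darling tests above:
-ggplot(long_data_clean, aes(sample = MEAN_DIFFERENCE)) +
-  stat_qq() +
-  stat_qq_line() +
-  facet_grid(TIME ~ DOMAIN)
-# 4. 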
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For 
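reproducibility.
-# Aside (cross-check sketch): car::leveneTest() across TEMPORAL_DO within
-# each TIME x DOMAIN cell complements the bootstrap F-max criterion:
-levene_between <- long_data_clean %>%
-  split(list(.$TIME, .$DOMAIN)) %>%
-  map_dfr(function(d) {
-    lt <- car::leveneTest(MEAN_DIFFERENCE ~ TEMPORAL_DO, data = d)
-    tibble(levene_F = lt$`F value`[1], levene_p = lt$`Pr(>F)`[1])
-  }, .id = "cell")
-print(levene_between)
-# The fixed seed likewise keeps the bootstrap draws'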
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - 
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - -# ============================================================ -# PLOT 1: TEMPORAL_DO × TIME INTERACTION -# ============================================================ - -# Create estimated marginal means for TEMPORAL_DO × TIME -emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -# Prepare emmeans data frame -emmeans_temporal_time <- emm_temporal_time %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_temporal <- 0.5 -iPlot_temporal <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal, - x_dodged = x_pos + time_offset - ) - -emmeans_temporal_time <- emmeans_temporal_time %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal, - x_dodged = x_pos + time_offset - ) - -# Create TEMPORAL_DO × TIME interaction plot -interaction_plot_temporal <- ggplot() + - geom_violin( - data = iPlot_temporal, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_temporal / 2 - ) + - geom_errorbar( - data = emmeans_temporal_time, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_temporal_time, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal) - -# ============================================================ -# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis) -# 
============================================================ - -# Create estimated marginal means for TIME × DOMAIN -emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_time_domain <- emm_time_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_domain <- 0.5 -iPlot_domain <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain <- emmeans_time_domain %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot -interaction_plot_domain <- ggplot() + - geom_violin( - data = iPlot_domain, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_domain / 2 - ) + - geom_errorbar( - data = emmeans_time_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_time_domain, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_domain) - -# ============================================================ -# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only) -# ============================================================ - -# Create fresh emmeans data for Plot 3 -emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.3, - x_dodged = x_pos + time_offset - ) - -# 
Create simple emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) diff --git a/.history/eohi1/mixed anova - domain means_20251006181935.r b/.history/eohi1/mixed anova - domain means_20251006181935.r deleted file mode 100644 index 214d37f..0000000 --- a/.history/eohi1/mixed anova - domain means_20251006181935.r +++ /dev/null @@ -1,769 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, 
TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print(homogeneity_domain)
-
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-
-# Function to calculate Hartley's F-max ratio (kept for reference; the observed
-# ratios below are computed inline)
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × DOMAIN)
-
-print(unique(long_data_clean$TEMPORAL_DO))
-print(table(long_data_clean$TEMPORAL_DO))
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # Calculate F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# Bootstrap function for Hartley's F-max critical values
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!rlang::sym(group_var)) %>%
-    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Replace zero or missing variances with a tiny positive value
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Extract each group's non-missing response values once
-  group_data_list <- map(groups, ~ {
-    group_data <- data[data[[group_var]] == .x, response_var]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
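-
-# A minimal usage sketch of the helper above (illustrative only; `toy`, `g`,
-# and `y` are made-up names, not objects from this analysis), showing the
-# shape of the returned list:
-#   toy <- data.frame(g = rep(c("a", "b"), each = 50), y = rnorm(100))
-#   str(bootstrap_hartley_critical(toy, "g", "y", n_iter = 200))
-#   # $observed_ratio     : observed F-max (larger variance / smaller variance)
-#   # $critical_95        : 95th percentile of the bootstrap F-max distribution
-#   # $n_valid_iterations : bootstrap draws that produced a finite ratio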
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# MIXED ANOVA ANALYSIS
-
-# Check for complete cases
-complete_cases <- sum(complete.cases(long_data_clean))
-print(complete_cases)
-
-# Check that every participant contributes exactly one observation per TIME × DOMAIN cell
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance == 1)) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN combinations")
-} else {
-  print("Warning: Design is unbalanced (missing or duplicated participant cells)")
-  print(summary(as.vector(design_balance)))
-}
-
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = MEAN_DIFFERENCE,
-                             wid = pID,
-                             between = TEMPORAL_DO,
-                             within = .(TIME, DOMAIN),
-                             type = 3,
-                             detailed = TRUE)
-
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL # Reset row numbers to be sequential
-print(anova_output)
-
-# Show Mauchly's test for sphericity
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  # Between-subjects effects (no sphericity corrections needed)
-  between_effects <- c("TEMPORAL_DO")
-  for(effect in between_effects) {
-    if(effect %in% anova_table$Effect) {
-      f_value <- anova_table$F[anova_table$Effect == effect]
-      dfn <- anova_table$DFn[anova_table$Effect == effect]
-      dfd <- anova_table$DFd[anova_table$Effect == effect]
-      p_value <- anova_table$p[anova_table$Effect == effect]
-
-      cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value))
-    }
-  }
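-
-  # A quick hand-check (a sketch, not part of the reported output): each
-  # GG-corrected p-value is the F tail probability with both degrees of
-  # freedom shrunk by the GG epsilon, e.g. for the first corrected effect:
-  #   i1 <- 1
-  #   pf(anova_table$F[match(sphericity_corr$Effect[i1], anova_table$Effect)],
-  #      corrected_df$GG_DFn[i1], corrected_df$GG_DFd[i1], lower.tail = FALSE)
-  #   # should agree with sphericity_corr$`p[GG]`[i1]; HF works the same way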
-
-  # Within-subjects effects (sphericity corrections where applicable)
-
-  # TIME main effect (2 levels, sphericity automatically satisfied)
-  if("TIME" %in% anova_table$Effect) {
-    f_value <- anova_table$F[anova_table$Effect == "TIME"]
-    dfn <- anova_table$DFn[anova_table$Effect == "TIME"]
-    dfd <- anova_table$DFd[anova_table$Effect == "TIME"]
-    p_value <- anova_table$p[anova_table$Effect == "TIME"]
-    cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value))
-  }
-
-  # DOMAIN main effect (4 levels, needs sphericity correction)
-  if("DOMAIN" %in% anova_table$Effect) {
-    f_value <- anova_table$F[anova_table$Effect == "DOMAIN"]
-    dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"]
-    dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"]
-    p_value <- anova_table$p[anova_table$Effect == "DOMAIN"]
-    cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value))
-  }
-
-  # Interactions with sphericity corrections
-  for(i in seq_len(nrow(corrected_df))) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original: F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  cat("\nNote: Sphericity corrections not needed (sphericity assumption met)\n")
-}
-
-# COHEN'S D FOR MAIN EFFECTS
-
-# Create aov model for emmeans
-aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                 data = long_data_clean)
-
-# Main Effect of TIME
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print(time_emmeans)
-
-time_main_contrast <- pairs(time_emmeans, adjust = "bonferroni")
-
-time_main_df <- as.data.frame(time_main_contrast)
-print(time_main_df)
-
-# Calculate Cohen's d for TIME main effect
-if(nrow(time_main_df) > 0) {
-  cat("\nCohen's d for TIME main effect:\n")
-  time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"]
-  time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"]
-
-  time_cohens_d <- cohen.d(time_past_data, time_future_data)
-  cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data)))
-  cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate))
-  cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude))
-  cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1]))
-}
-
-# Main Effect of DOMAIN (significant: p < 0.001)
-domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
-print(domain_emmeans)
-domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni")
-domain_main_df <- as.data.frame(domain_main_contrast)
-print(domain_main_df)
-
-# Calculate Cohen's d for significant DOMAIN contrasts
-significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ]
-if(nrow(significant_domain) > 0) {
-  cat("\nCohen's d for significant DOMAIN contrasts:\n")
-  for(i in seq_len(nrow(significant_domain))) {
-    contrast_name <- as.character(significant_domain$contrast[i])
-    contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-    if(length(contrast_parts) == 2) {
-      level1 <- trimws(contrast_parts[1])
-      level2 <- trimws(contrast_parts[2])
-
-      data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1]
-      data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2]
-
-      if(length(data1) > 0 && length(data2) > 0) {
-        domain_cohens_d <- cohen.d(data1, data2)
- 
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - -# ============================================================ -# PLOT 1: TEMPORAL_DO × TIME INTERACTION -# ============================================================ - -# Create estimated marginal means for TEMPORAL_DO × TIME -emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -# Prepare emmeans data frame -emmeans_temporal_time <- emm_temporal_time %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_temporal <- 0.5 -iPlot_temporal <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal, - x_dodged = x_pos + time_offset - ) - -emmeans_temporal_time <- emmeans_temporal_time %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal, - x_dodged = x_pos + time_offset - ) - -# Create TEMPORAL_DO × TIME interaction plot -interaction_plot_temporal <- ggplot() + - geom_violin( - data = iPlot_temporal, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_temporal / 2 - ) + - geom_errorbar( - data = emmeans_temporal_time, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_temporal_time, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal) - -# ============================================================ -# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis) -# 
============================================================ - -# Create estimated marginal means for TIME × DOMAIN -emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_time_domain <- emm_time_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_domain <- 0.5 -iPlot_domain <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain <- emmeans_time_domain %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot -interaction_plot_domain <- ggplot() + - geom_violin( - data = iPlot_domain, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_domain / 2 - ) + - geom_errorbar( - data = emmeans_time_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_time_domain, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_domain) - -# ============================================================ -# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only) -# ============================================================ - -# Create fresh emmeans data for Plot 3 -emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.3, - x_dodged = x_pos + time_offset - ) - -# 
Create simple emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) diff --git a/.history/eohi1/mixed anova - domain means_20251006183415.r b/.history/eohi1/mixed anova - domain means_20251006183415.r deleted file mode 100644 index 214d37f..0000000 --- a/.history/eohi1/mixed anova - domain means_20251006183415.r +++ /dev/null @@ -1,769 +0,0 @@
[769 deleted lines omitted — byte-identical (blob index 214d37f) to the preceding snapshot above]
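(A sketch, for orientation only, of the usual way an editor-history folder like .history/ is untracked while staying on disk — assuming a standard git workflow; these commands are not recorded in the patch itself:

    git rm -r --cached .history   # drop the snapshots from the index only
    git commit                    # records deletion diffs like the ones above

with .history/ also listed in .gitignore so future snapshots stay untracked.)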
diff --git a/.history/eohi1/mixed anova - domain means_20251006223538.r b/.history/eohi1/mixed anova - domain means_20251006223538.r deleted file mode 100644 index 214d37f..0000000 --- a/.history/eohi1/mixed anova - domain means_20251006223538.r +++ /dev/null @@ -1,769 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, 
TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For 
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-set.seed(123)  # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# MIXED ANOVA ANALYSIS
-
-# Check for complete cases
-complete_cases <- sum(complete.cases(long_data_clean))
-print(complete_cases)
-
-# Check if design is balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance %in% c(0, 1))) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN combinations")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = MEAN_DIFFERENCE,
-                             wid = pID,
-                             between = TEMPORAL_DO,
-                             within = .(TIME, DOMAIN),
-                             type = 3,
-                             detailed = TRUE)
-
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL  # Reset row numbers to be sequential
-print(anova_output)
-
-# Show Mauchly's test for sphericity
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
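# The corrected df above are simply the uncorrected df scaled by the relevant
# epsilon estimate. A hypothetical worked example (all values invented for
# illustration, not taken from this analysis):
ex_dfn <- 3; ex_dfd <- 897   # uncorrected df for a 4-level within-subjects effect
ex_gg  <- 0.85               # hypothetical Greenhouse-Geisser epsilon
c(GG_DFn = ex_dfn * ex_gg, GG_DFd = ex_dfd * ex_gg)  # F would be tested on F(2.55, 762.45)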
-  # Between-subjects effects (no sphericity corrections needed)
-  between_effects <- c("TEMPORAL_DO")
-  for(effect in between_effects) {
-    if(effect %in% anova_table$Effect) {
-      f_value <- anova_table$F[anova_table$Effect == effect]
-      dfn <- anova_table$DFn[anova_table$Effect == effect]
-      dfd <- anova_table$DFd[anova_table$Effect == effect]
-      p_value <- anova_table$p[anova_table$Effect == effect]
-      cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value))
-    }
-  }
-
-  # Within-subjects effects (sphericity corrections where applicable)
-
-  # TIME main effect (2 levels, sphericity automatically satisfied)
-  if("TIME" %in% anova_table$Effect) {
-    f_value <- anova_table$F[anova_table$Effect == "TIME"]
-    dfn <- anova_table$DFn[anova_table$Effect == "TIME"]
-    dfd <- anova_table$DFd[anova_table$Effect == "TIME"]
-    p_value <- anova_table$p[anova_table$Effect == "TIME"]
-    cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value))
-  }
-
-  # DOMAIN main effect (4 levels, needs sphericity correction)
-  if("DOMAIN" %in% anova_table$Effect) {
-    f_value <- anova_table$F[anova_table$Effect == "DOMAIN"]
-    dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"]
-    dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"]
-    p_value <- anova_table$p[anova_table$Effect == "DOMAIN"]
-    cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value))
-  }
-
-  # Interactions with sphericity corrections
-  for(i in seq_len(nrow(corrected_df))) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original: F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  cat("\nNote: Sphericity corrections not needed (sphericity assumption met)\n")  # cat() so \n prints as a newline
-}
-
-# COHEN'S D FOR MAIN EFFECTS
-
-# Create aov model for emmeans
-aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                 data = long_data_clean)
-
-# Main Effect of TIME
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print(time_emmeans)
-
-time_main_contrast <- pairs(time_emmeans, adjust = "bonferroni")  # lower-case, consistent with the other calls
-
-time_main_df <- as.data.frame(time_main_contrast)
-print(time_main_df)
-
-# Calculate Cohen's d for TIME main effect
-if(nrow(time_main_df) > 0) {
-  cat("\nCohen's d for TIME main effect:\n")
-  time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"]
-  time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"]
-
-  time_cohens_d <- cohen.d(time_past_data, time_future_data)
-  cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data)))
-  cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate))
-  cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude))
-  cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1]))
-}
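# cohen.d() here is the effsize implementation (pooled-SD d for two vectors).
# A minimal sketch of the same computation by hand, on simulated data, for
# cross-checking (vectors and means are illustrative only):
set.seed(7)
a <- rnorm(150, 1.00, 0.5)   # illustrative "Past" values
b <- rnorm(150, 0.85, 0.5)   # illustrative "Future" values

sp <- sqrt(((length(a) - 1) * var(a) + (length(b) - 1) * var(b)) /
             (length(a) + length(b) - 2))     # pooled SD
d_hand <- (mean(a) - mean(b)) / sp
d_pkg  <- effsize::cohen.d(a, b)$estimate     # should agree with d_hand under effsize defaults
round(c(by_hand = d_hand, effsize = d_pkg), 5)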
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
-
-# Get simple effects of DOMAIN within each TIME
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE")
-
-# INTERACTION PLOTS
-
-# Define color palettes
-time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F")
-domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F",
-                   "Values" = "#FFB000", "Life" = "#FE6100")
-
-# Define TIME levels (Past, Future order)
-time_levels <- c("Past", "Future")
-
-# ============================================================
-# PLOT 1: TEMPORAL_DO × TIME INTERACTION
-# ============================================================
-
-# Create estimated marginal means for TEMPORAL_DO × TIME
-emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)
-
-# Prepare emmeans data frame
-emmeans_temporal_time <- emm_temporal_time %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    TIME = factor(TIME, levels = time_levels)
-  )
-
-# Prepare raw data for plotting with position offsets
-dodge_width_temporal <- 0.5
-iPlot_temporal <- long_data_clean %>%
-  dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    TIME = factor(TIME, levels = time_levels),
-    x_pos = as.numeric(TEMPORAL_DO),
-    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal,
-    x_dodged = x_pos + time_offset
-  )
-
-emmeans_temporal_time <- emmeans_temporal_time %>%
-  mutate(
-    x_pos = as.numeric(TEMPORAL_DO),
-    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal,
-    x_dodged = x_pos + time_offset
-  )
-
-# Create TEMPORAL_DO × TIME interaction plot
-interaction_plot_temporal <- ggplot() +
-  geom_violin(
-    data = iPlot_temporal,
-    aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)),
-    alpha = 0.4,
-    color = NA,
-    trim = FALSE,
-    scale = "width",
-    width = dodge_width_temporal / 2
-  ) +
-  geom_errorbar(
-    data = emmeans_temporal_time,
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper),
-    width = 0.08,
-    linewidth = 0.8,
-    color = "black"
-  ) +
-  geom_point(
-    data = emmeans_temporal_time,
-    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
-    size = 4,
-    stroke = 1,
-    color = "black"
-  ) +
-  labs(
-    x = "Order",
-    y = "Mean absolute difference from present",
-    title = "TEMPORAL_DO × TIME Interaction"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2),
-    labels = c("Past First", "Future First"),
-    limits = c(0.5, 2.5)
-  ) +
-  scale_fill_manual(name = "Temporal Direction", values = time_colors) +
-  scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
-  )
-
-print(interaction_plot_temporal)
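# The manual-dodging idea above in miniature: the x-axis is continuous, each
# group is offset around its integer position by a fixed fraction, and the raw
# data and the emmeans overlay share the same x_dodged values so they line up.
# A stripped-down sketch on fake data (all names illustrative):
library(ggplot2)
library(dplyr)

set.seed(3)
toy <- expand.grid(g = c("A", "B"), t = c("Past", "Future")) %>%
  slice(rep(1:n(), each = 50)) %>%
  mutate(y = rnorm(n()),
         x_pos = as.numeric(factor(g)),
         x_dodged = x_pos + (as.numeric(factor(t, levels = c("Past", "Future"))) - 1.5) * 0.5)

ggplot(toy, aes(x = x_dodged, y = y, fill = t, group = interaction(x_pos, t))) +
  geom_violin(width = 0.25, alpha = 0.4, color = NA) +
  scale_x_continuous(breaks = c(1, 2), labels = c("A", "B"), limits = c(0.5, 2.5))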
-# ============================================================
-# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis)
-# ============================================================
-
-# Create estimated marginal means for TIME × DOMAIN
-emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN)
-
-# Prepare emmeans data frame
-emmeans_time_domain <- emm_time_domain %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")),
-    TIME = factor(TIME, levels = time_levels)
-  )
-
-# Prepare raw data for plotting with position offsets
-dodge_width_domain <- 0.5
-iPlot_domain <- long_data_clean %>%
-  dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>%
-  mutate(
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")),
-    TIME = factor(TIME, levels = time_levels),
-    x_pos = as.numeric(DOMAIN),
-    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain,
-    x_dodged = x_pos + time_offset
-  )
-
-emmeans_time_domain <- emmeans_time_domain %>%
-  mutate(
-    x_pos = as.numeric(DOMAIN),
-    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain,
-    x_dodged = x_pos + time_offset
-  )
-
-# Create TIME × DOMAIN interaction plot
-interaction_plot_domain <- ggplot() +
-  geom_violin(
-    data = iPlot_domain,
-    aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)),
-    alpha = 0.4,
-    color = NA,
-    trim = FALSE,
-    scale = "width",
-    width = dodge_width_domain / 2
-  ) +
-  geom_errorbar(
-    data = emmeans_time_domain,
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper),
-    width = 0.08,
-    linewidth = 0.8,
-    color = "black"
-  ) +
-  geom_point(
-    data = emmeans_time_domain,
-    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
-    size = 4,
-    stroke = 1,
-    color = "black"
-  ) +
-  labs(
-    x = "Domain",
-    y = "Mean absolute difference from present",
-    title = "TIME × DOMAIN Interaction"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2, 3, 4),
-    labels = c("Preferences", "Personality", "Values", "Life"),
-    limits = c(0.5, 4.5)
-  ) +
-  scale_fill_manual(name = "Temporal Direction", values = time_colors) +
-  scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.text.x = element_text(angle = 45, hjust = 1),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
-  )
-
-print(interaction_plot_domain)
-
-# ============================================================
-# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only)
-# ============================================================
-
-# Create fresh emmeans data for Plot 3
-emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)
-
-emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    TIME = factor(TIME, levels = time_levels),
-    x_pos = as.numeric(TEMPORAL_DO),
-    time_offset = (as.numeric(TIME) - 1.5) * 0.3,
-    x_dodged = x_pos + time_offset
-  )
-# Create simple emmeans-only plot
-interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) +
-  geom_errorbar(
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME),
-    width = 0.1,
-    linewidth = 1,
-    alpha = 0.8
-  ) +
-  geom_point(
-    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
-    size = 5,
-    stroke = 1.2,
-    color = "black"
-  ) +
-  labs(
-    x = "Order",
-    y = "Mean absolute difference from present",
-    title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2),
-    labels = c("Past First", "Future First"),
-    limits = c(0.5, 2.5)
-  ) +
-  scale_color_manual(name = "Temporal Direction", values = time_colors) +
-  scale_fill_manual(name = "Temporal Direction", values = time_colors) +
-  scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
-  )
-
-print(interaction_plot_emmeans_only)
diff --git a/.history/eohi1/mixed anova - domain means_20251006225002.r b/.history/eohi1/mixed anova - domain means_20251006225002.r
deleted file mode 100644
index 795012c..0000000
--- a/.history/eohi1/mixed anova - domain means_20251006225002.r
+++ /dev/null
@@ -1,864 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means
-# EOHI Experiment Data Analysis - Domain Level Analysis
-# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
-#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life
-
-library(tidyverse)
-library(ez)
-library(car)
-library(nortest)     # For normality tests
-library(emmeans)     # For post-hoc comparisons
-library(purrr)       # For map functions
-library(effsize)     # For Cohen's d calculations
-library(effectsize)  # For effect size calculations
-library(ggplot2)     # For plotting
-
-options(scipen = 999)
-
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")
-
-# Define domain mapping
-domain_mapping <- data.frame(
-  variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
-               "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
-  time = c(rep("Past", 4), rep("Future", 4)),
-  domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
-  stringsAsFactors = FALSE
-)
-
-long_data <- data %>%
-  select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>%
-  pivot_longer(
-    cols = all_of(required_vars),
-    names_to = "variable",
-    values_to = "MEAN_DIFFERENCE"
-  ) %>%
-  left_join(domain_mapping, by = "variable") %>%
-  # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping)
-  mutate(
-    TIME = factor(time, levels = c("Past", "Future")),
-    DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")),
-    pID = as.factor(pID),
-    TEMPORAL_DO = as.factor(TEMPORAL_DO)
-  ) %>%
-  # Select final columns and remove any rows with missing values
-  select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>%
-  filter(!is.na(MEAN_DIFFERENCE))
[Remainder of this deleted snapshot omitted here: it is a verbatim duplicate of the analysis code shown above, from the descriptive statistics through the Plot 3 data preparation; the deletion resumes below at the emmeans-only plot.]
Create simple emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================ -# PLOT 4: TIME × DOMAIN INTERACTION (TIME on y-axis, DOMAIN on x-axis) -# ============================================================ - -# Create estimated marginal means for TIME × DOMAIN (reusing existing emmeans) -emm_time_domain_plot4 <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame for Plot 4 -emmeans_time_domain_plot4 <- emm_time_domain_plot4 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_plot4 <- 0.5 -iPlot_plot4 <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain_plot4 <- emmeans_time_domain_plot4 %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot (Domain on x-axis, TIME as groups) -interaction_plot_time_domain_plot4 <- ggplot() + - geom_violin( - data = iPlot_plot4, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_plot4 / 2 - ) + - geom_errorbar( - data = emmeans_time_domain_plot4, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_time_domain_plot4, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction (Domain on 
x-axis)", - fill = "Temporal Direction", - shape = "Temporal Direction" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_domain_plot4) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - domain means_20251006225006.r b/.history/eohi1/mixed anova - domain means_20251006225006.r deleted file mode 100644 index 795012c..0000000 --- a/.history/eohi1/mixed anova - domain means_20251006225006.r +++ /dev/null @@ -1,864 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = 
round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For 
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
-
-  # Between-subjects effects (no sphericity corrections needed)
-  between_effects <- c("TEMPORAL_DO")
-  for(effect in between_effects) {
-    if(effect %in% anova_table$Effect) {
-      f_value <- anova_table$F[anova_table$Effect == effect]
-      dfn <- anova_table$DFn[anova_table$Effect == effect]
-      dfd <- anova_table$DFd[anova_table$Effect == effect]
-      p_value <- anova_table$p[anova_table$Effect == effect]
-
-      # %.0f rather than %d: ezANOVA stores df as doubles, and sprintf("%d", ...)
-      # errors on numeric (non-integer-typed) input
-      cat(sprintf("%s: F(%.0f, %.0f) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value))
-    }
-  }
-
-  # Within-subjects effects (sphericity corrections where applicable)
-
-  # TIME main effect (2 levels, sphericity automatically satisfied)
-  if("TIME" %in% anova_table$Effect) {
-    f_value <- anova_table$F[anova_table$Effect == "TIME"]
-    dfn <- anova_table$DFn[anova_table$Effect == "TIME"]
-    dfd <- anova_table$DFd[anova_table$Effect == "TIME"]
-    p_value <- anova_table$p[anova_table$Effect == "TIME"]
-    cat(sprintf("TIME: F(%.0f, %.0f) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value))
-  }
-
-  # DOMAIN main effect (4 levels, needs sphericity correction)
-  if("DOMAIN" %in% anova_table$Effect) {
-    f_value <- anova_table$F[anova_table$Effect == "DOMAIN"]
-    dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"]
-    dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"]
-    p_value <- anova_table$p[anova_table$Effect == "DOMAIN"]
-    cat(sprintf("DOMAIN: F(%.0f, %.0f) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value))
-  }
-
-  # Interactions with sphericity corrections
-  for(i in seq_len(nrow(corrected_df))) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original:     F(%.0f, %.0f) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  # cat() rather than print() so the leading newline is rendered
-  cat("\nNote: Sphericity corrections not needed (sphericity assumption met)\n")
-}
-
-# COHEN'S D FOR MAIN EFFECTS
-
-# Create aov model for emmeans
-aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                 data = long_data_clean)
-
-# Main effect of TIME
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print(time_emmeans)
-
-# "bonferroni" in lower case, consistent with the other pairs() calls
-time_main_contrast <- pairs(time_emmeans, adjust = "bonferroni")
-
-time_main_df <- as.data.frame(time_main_contrast)
-print(time_main_df)
-
-# Calculate Cohen's d for TIME main effect
-if(nrow(time_main_df) > 0) {
-  cat("\nCohen's d for TIME main effect:\n")
-  time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"]
-  time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"]
-
-  time_cohens_d <- cohen.d(time_past_data, time_future_data)
-  cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data)))
-  cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate))
-  cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude))
-  cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1]))
-}
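-
-# Note: effsize::cohen.d() treats the two vectors as independent samples.
-# TIME is a within-subjects factor here, so this is a pooled-SD d that
-# ignores the pairing; a paired d (cohen.d(..., paired = TRUE) on
-# participant-matched vectors) would generally differ.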
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
-
-# 2. TIME × DOMAIN INTERACTION
-
-# Get simple effects of DOMAIN within each TIME
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE")
-
-# INTERACTION PLOTS
-
-# Define color palettes
-time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F")
-domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F",
-                   "Values" = "#FFB000", "Life" = "#FE6100")
-
-# Define TIME levels (Past, Future order)
-time_levels <- c("Past", "Future")
-
-# ============================================================
-# PLOT 1: TEMPORAL_DO × TIME INTERACTION
-# ============================================================
-
-# Create estimated marginal means for TEMPORAL_DO × TIME
-emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)
-
-# Prepare emmeans data frame
-emmeans_temporal_time <- emm_temporal_time %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    TIME = factor(TIME, levels = time_levels)
-  )
-
-# Prepare raw data for plotting with position offsets
-dodge_width_temporal <- 0.5
-iPlot_temporal <- long_data_clean %>%
-  dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    TIME = factor(TIME, levels = time_levels),
-    x_pos = as.numeric(TEMPORAL_DO),
-    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal,
-    x_dodged = x_pos + time_offset
-  )
-
-emmeans_temporal_time <- emmeans_temporal_time %>%
-  mutate(
-    x_pos = as.numeric(TEMPORAL_DO),
-    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal,
-    x_dodged = x_pos + time_offset
-  )
-
-# Create TEMPORAL_DO × TIME interaction plot
-interaction_plot_temporal <- ggplot() +
-  geom_violin(
-    data = iPlot_temporal,
-    aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)),
-    alpha = 0.4,
-    color = NA,
-    trim = FALSE,
-    scale = "width",
-    width = dodge_width_temporal / 2
-  ) +
-  geom_errorbar(
-    data = emmeans_temporal_time,
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper),
-    width = 0.08,
-    linewidth = 0.8,
-    color = "black"
-  ) +
-  geom_point(
-    data = emmeans_temporal_time,
-    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
-    size = 4,
-    stroke = 1,
-    color = "black"
-  ) +
-  labs(
-    x = "Order",
-    y = "Mean absolute difference from present",
-    title = "TEMPORAL_DO × TIME Interaction"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2),
-    labels = c("Past First", "Future First"),
-    limits = c(0.5, 2.5)
-  ) +
-  scale_fill_manual(name = "Temporal Direction", values = time_colors) +
-  scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
-  )
-
-print(interaction_plot_temporal)
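-
-# Note: dodging is done manually (numeric x_pos plus a ± dodge/2 offset per
-# TIME level) rather than with position_dodge(), so that the violins and the
-# separately computed emmeans layers land on exactly the same x positions.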
-
-# ============================================================
-# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis)
-# ============================================================
-
-# Create estimated marginal means for TIME × DOMAIN
-emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN)
-
-# Prepare emmeans data frame
-emmeans_time_domain <- emm_time_domain %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")),
-    TIME = factor(TIME, levels = time_levels)
-  )
-
-# Prepare raw data for plotting with position offsets
-dodge_width_domain <- 0.5
-iPlot_domain <- long_data_clean %>%
-  dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>%
-  mutate(
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")),
-    TIME = factor(TIME, levels = time_levels),
-    x_pos = as.numeric(DOMAIN),
-    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain,
-    x_dodged = x_pos + time_offset
-  )
-
-emmeans_time_domain <- emmeans_time_domain %>%
-  mutate(
-    x_pos = as.numeric(DOMAIN),
-    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain,
-    x_dodged = x_pos + time_offset
-  )
-
-# Create TIME × DOMAIN interaction plot
-interaction_plot_domain <- ggplot() +
-  geom_violin(
-    data = iPlot_domain,
-    aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)),
-    alpha = 0.4,
-    color = NA,
-    trim = FALSE,
-    scale = "width",
-    width = dodge_width_domain / 2
-  ) +
-  geom_errorbar(
-    data = emmeans_time_domain,
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper),
-    width = 0.08,
-    linewidth = 0.8,
-    color = "black"
-  ) +
-  geom_point(
-    data = emmeans_time_domain,
-    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
-    size = 4,
-    stroke = 1,
-    color = "black"
-  ) +
-  labs(
-    x = "Domain",
-    y = "Mean absolute difference from present",
-    title = "TIME × DOMAIN Interaction"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2, 3, 4),
-    labels = c("Preferences", "Personality", "Values", "Life"),
-    limits = c(0.5, 4.5)
-  ) +
-  scale_fill_manual(name = "Temporal Direction", values = time_colors) +
-  scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.text.x = element_text(angle = 45, hjust = 1),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
-  )
-
-print(interaction_plot_domain)
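-
-# Note: the error bars here are model-based 95% CIs from the aovlist emmeans,
-# not intervals computed from the raw cell SDs, so they can differ from what
-# the violins alone would suggest.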
-
-# ============================================================
-# PLOT 3: TEMPORAL_DO × TIME INTERACTION (emmeans only)
-# ============================================================
-
-# Create fresh emmeans data for Plot 3
-emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)
-
-emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    TIME = factor(TIME, levels = time_levels),
-    x_pos = as.numeric(TEMPORAL_DO),
-    time_offset = (as.numeric(TIME) - 1.5) * 0.3,
-    x_dodged = x_pos + time_offset
-  )
-
-# Create simple emmeans-only plot
-interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) +
-  geom_errorbar(
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME),
-    width = 0.1,
-    linewidth = 1,
-    alpha = 0.8
-  ) +
-  geom_point(
-    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
-    size = 5,
-    stroke = 1.2,
-    color = "black"
-  ) +
-  labs(
-    x = "Order",
-    y = "Mean absolute difference from present",
-    title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2),
-    labels = c("Past First", "Future First"),
-    limits = c(0.5, 2.5)
-  ) +
-  scale_color_manual(name = "Temporal Direction", values = time_colors) +
-  scale_fill_manual(name = "Temporal Direction", values = time_colors) +
-  scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
-  )
-
-print(interaction_plot_emmeans_only)
-
-# ============================================================
-# PLOT 4: TIME × DOMAIN INTERACTION (DOMAIN on x-axis, TIME as groups)
-# ============================================================
-
-# Create estimated marginal means for TIME × DOMAIN (same grid as Plot 2)
-emm_time_domain_plot4 <- emmeans(aov_model, ~ TIME * DOMAIN)
-
-# Prepare emmeans data frame for Plot 4
-emmeans_time_domain_plot4 <- emm_time_domain_plot4 %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")),
-    TIME = factor(TIME, levels = time_levels)
-  )
-
-# Prepare raw data for plotting with position offsets
-dodge_width_plot4 <- 0.5
-iPlot_plot4 <- long_data_clean %>%
-  dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>%
-  mutate(
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")),
-    TIME = factor(TIME, levels = time_levels),
-    x_pos = as.numeric(DOMAIN),
-    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4,
-    x_dodged = x_pos + time_offset
-  )
-
-emmeans_time_domain_plot4 <- emmeans_time_domain_plot4 %>%
-  mutate(
-    x_pos = as.numeric(DOMAIN),
-    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4,
-    x_dodged = x_pos + time_offset
-  )
-
-# Create TIME × DOMAIN interaction plot (Domain on x-axis, TIME as groups)
-interaction_plot_time_domain_plot4 <- ggplot() +
-  geom_violin(
-    data = iPlot_plot4,
-    aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)),
-    alpha = 0.4,
-    color = NA,
-    trim = FALSE,
-    scale = "width",
-    width = dodge_width_plot4 / 2
-  ) +
-  geom_errorbar(
-    data = emmeans_time_domain_plot4,
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper),
-    width = 0.08,
-    linewidth = 0.8,
-    color = "black"
-  ) +
-  geom_point(
-    data = emmeans_time_domain_plot4,
-    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
-    size = 4,
-    stroke = 1,
-    color = "black"
-  ) +
-  labs(
-    x = "Domain",
-    y = "Mean absolute difference from present",
-    title = "TIME × DOMAIN Interaction (Domain on x-axis)",
-    fill = "Temporal Direction",
-    shape = "Temporal Direction"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2, 3, 4),
-    labels = c("Preferences", "Personality", "Values", "Life"),
-    limits = c(0.5, 4.5)
-  ) +
-  scale_fill_manual(name = "Temporal Direction", values = time_colors) +
-  scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.text.x = element_text(angle = 45, hjust = 1),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
-  )
-
-print(interaction_plot_time_domain_plot4)
\ No newline at end of file
diff --git a/.history/eohi1/mixed anova - domain means_20251006225025.r b/.history/eohi1/mixed anova - domain means_20251006225025.r
deleted file mode 100644
index 795012c..0000000
--- a/.history/eohi1/mixed anova - domain means_20251006225025.r
+++ /dev/null
@@ -1,864 +0,0 @@
[864 deleted lines omitted: a near-verbatim .history snapshot of the same script shown above]
x-axis)", - fill = "Temporal Direction", - shape = "Temporal Direction" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_domain_plot4) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - domain means_20251006225127.r b/.history/eohi1/mixed anova - domain means_20251006225127.r deleted file mode 100644 index 850f677..0000000 --- a/.history/eohi1/mixed anova - domain means_20251006225127.r +++ /dev/null @@ -1,853 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = 
round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For 
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
  print(mixed_anova_model$`Sphericity Corrections`)

  # Extract and display corrected degrees of freedom
  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
  anova_table <- mixed_anova_model$ANOVA

  corrected_df <- data.frame(
    Effect = sphericity_corr$Effect,
    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
    GG_epsilon = sphericity_corr$GGe,
    HF_epsilon = sphericity_corr$HFe
  )

  print(corrected_df)

  # Between-subjects effects (no sphericity corrections needed)
  between_effects <- c("TEMPORAL_DO")
  for(effect in between_effects) {
    if(effect %in% anova_table$Effect) {
      f_value <- anova_table$F[anova_table$Effect == effect]
      dfn <- anova_table$DFn[anova_table$Effect == effect]
      dfd <- anova_table$DFd[anova_table$Effect == effect]
      p_value <- anova_table$p[anova_table$Effect == effect]

      cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value))
    }
  }

  # Within-subjects effects (sphericity corrections where applicable)

  # TIME main effect (2 levels, sphericity automatically satisfied)
  if("TIME" %in% anova_table$Effect) {
    f_value <- anova_table$F[anova_table$Effect == "TIME"]
    dfn <- anova_table$DFn[anova_table$Effect == "TIME"]
    dfd <- anova_table$DFd[anova_table$Effect == "TIME"]
    p_value <- anova_table$p[anova_table$Effect == "TIME"]
    cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value))
  }

  # DOMAIN main effect (4 levels; uncorrected here, GG/HF versions printed below)
  if("DOMAIN" %in% anova_table$Effect) {
    f_value <- anova_table$F[anova_table$Effect == "DOMAIN"]
    dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"]
    dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"]
    p_value <- anova_table$p[anova_table$Effect == "DOMAIN"]
    cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value))
  }

  # Every effect in the Mauchly table (DOMAIN and its interactions), with corrections
  for(i in seq_len(nrow(corrected_df))) {
    effect <- corrected_df$Effect[i]
    f_value <- anova_table$F[match(effect, anova_table$Effect)]

    cat(sprintf("\n%s:\n", effect))
    cat(sprintf("  Original:     F(%d, %d) = %.3f\n",
                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
  }
} else {
  cat("\nNote: Sphericity corrections not needed (sphericity assumption met)\n")
}

# COHEN'S D FOR MAIN EFFECTS

# Fit an aov model with the within-subject error strata so emmeans can be used
aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
                 data = long_data_clean)

# Main Effect of TIME
time_emmeans <- emmeans(aov_model, ~ TIME)
print(time_emmeans)

time_main_contrast <- pairs(time_emmeans, adjust = "bonferroni")

time_main_df <- as.data.frame(time_main_contrast)
print(time_main_df)

# Calculate Cohen's d for TIME main effect
if(nrow(time_main_df) > 0) {
  cat("\nCohen's d for TIME main effect:\n")
  time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"]
  time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"]

  time_cohens_d <- cohen.d(time_past_data, time_future_data)
  cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data)))
  cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate))
  cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude))
  cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1]))
}
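# Complementary effect size, sketched as an illustration rather than part of
# the original pipeline: TIME is a within-subjects factor, so a repeated
# measures d (d_z, computed from each participant's Past-minus-Future
# difference averaged over domains) can be reported alongside the pooled
# cohen.d() above. `time_wide` and `time_dz` are placeholder names.
time_wide <- long_data_clean %>%
  group_by(pID, TIME) %>%
  summarise(m = mean(MEAN_DIFFERENCE), .groups = 'drop') %>%
  pivot_wider(names_from = TIME, values_from = m)
time_dz <- with(time_wide, mean(Past - Future, na.rm = TRUE) / sd(Past - Future, na.rm = TRUE))
cat(sprintf("Within-subject d_z (Past vs Future): %.5f\n", time_dz))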
# Main Effect of DOMAIN (significant: p < 0.001)
domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
print(domain_emmeans)
domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni")
domain_main_df <- as.data.frame(domain_main_contrast)
print(domain_main_df)

# Calculate Cohen's d for significant DOMAIN contrasts
significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ]
if(nrow(significant_domain) > 0) {
  cat("\nCohen's d for significant DOMAIN contrasts:\n")
  for(i in seq_len(nrow(significant_domain))) {
    contrast_name <- as.character(significant_domain$contrast[i])
    contrast_parts <- strsplit(contrast_name, " - ")[[1]]
    if(length(contrast_parts) == 2) {
      level1 <- trimws(contrast_parts[1])
      level2 <- trimws(contrast_parts[2])

      data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1]
      data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2]

      if(length(data1) > 0 && length(data2) > 0) {
        domain_cohens_d <- cohen.d(data1, data2)
        cat(sprintf("Comparison: %s\n", contrast_name))
        cat(sprintf("  n1 = %d, n2 = %d\n", length(data1), length(data2)))
        cat(sprintf("  Cohen's d: %.5f\n", domain_cohens_d$estimate))
        cat(sprintf("  Effect size interpretation: %s\n", domain_cohens_d$magnitude))
        cat(sprintf("  p-value: %.5f\n", significant_domain$p.value[i]))
        cat("\n")
      }
    }
  }
}

# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS

# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO
temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO)
temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni")
print(temporal_time_simple)

# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME
time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME)
time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni")
print(time_domain_simple)
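# Sensitivity sketch (an optional check, not part of the original analysis):
# Bonferroni can be conservative for families of related contrasts, so
# re-running the same simple effects with the less conservative Holm
# adjustment shows whether any conclusion hinges on the correction method.
print(pairs(temporal_time_emmeans, adjust = "holm"))
print(pairs(time_domain_emmeans, adjust = "holm"))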
# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS

# Function to calculate Cohen's d for significant pairwise comparisons
calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]

  if(nrow(significant_pairs) > 0) {
    cat("Significant pairwise comparisons (p < 0.05):\n")
    print(significant_pairs)

    cat("\nCohen's d calculated from raw data:\n")

    for(i in seq_len(nrow(significant_pairs))) {
      comparison <- significant_pairs[i, ]
      contrast_name <- as.character(comparison$contrast)

      # Parse the contrast label (e.g., "Past - Future") into its two levels
      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
      if(length(contrast_parts) == 2) {
        level1 <- trimws(contrast_parts[1])
        level2 <- trimws(contrast_parts[2])

        # Get raw data for both conditions, restricted to the conditioning
        # level when the comparison is nested within group2_var
        if(group2_var %in% colnames(comparison)) {
          group2_level <- as.character(comparison[[group2_var]])
          data1 <- data[[response_var]][
            data[[group1_var]] == level1 &
            data[[group2_var]] == group2_level]

          data2 <- data[[response_var]][
            data[[group1_var]] == level2 &
            data[[group2_var]] == group2_level]
        } else {
          data1 <- data[[response_var]][data[[group1_var]] == level1]
          data2 <- data[[response_var]][data[[group1_var]] == level2]
        }

        if(length(data1) > 0 && length(data2) > 0) {
          # Calculate Cohen's d using the effsize package
          cohens_d_result <- cohen.d(data1, data2)

          cat(sprintf("Comparison: %s", contrast_name))
          if(group2_var %in% colnames(comparison)) {
            cat(sprintf(" | %s", group2_level))
          }
          cat(sprintf("\n  n1 = %d, n2 = %d\n", length(data1), length(data2)))
          cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
          cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
          cat(sprintf("  p-value: %.5f\n", comparison$p.value))
          cat("\n")
        }
      }
    }
  } else {
    cat("No significant pairwise comparisons found.\n")
  }
}

# 1. TEMPORAL_DO × TIME INTERACTION

# Get simple effects of TIME within each TEMPORAL_DO
temporal_time_simple_df <- as.data.frame(temporal_time_simple)
calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE")

# 2. TIME × DOMAIN INTERACTION

# Get simple effects of DOMAIN within each TIME
time_domain_simple_df <- as.data.frame(time_domain_simple)
calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE")

# INTERACTION PLOTS

# Define color palettes
time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F")
domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F",
                   "Values" = "#FFB000", "Life" = "#FE6100")

# Define TIME levels (Past, Future order)
time_levels <- c("Past", "Future")

# ============================================================
# PLOT 1: TEMPORAL_DO × TIME INTERACTION
# ============================================================

# Create estimated marginal means for TEMPORAL_DO × TIME
emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)

# Prepare emmeans data frame
emmeans_temporal_time <- emm_temporal_time %>%
  as.data.frame() %>%
  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
  rename(
    ci_lower = lower.CL,
    ci_upper = upper.CL,
    plot_mean = emmean
  ) %>%
  mutate(
    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
    TIME = factor(TIME, levels = time_levels)
  )

# Prepare raw data for plotting with position offsets
dodge_width_temporal <- 0.5
iPlot_temporal <- long_data_clean %>%
  dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>%
  mutate(
    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
    TIME = factor(TIME, levels = time_levels),
    x_pos = as.numeric(TEMPORAL_DO),
    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal,
    x_dodged = x_pos + time_offset
  )

emmeans_temporal_time <- emmeans_temporal_time %>%
  mutate(
    x_pos = as.numeric(TEMPORAL_DO),
    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal,
    x_dodged = x_pos + time_offset
  )

# Create TEMPORAL_DO × TIME interaction plot
interaction_plot_temporal <- ggplot() +
  geom_violin(
    data = iPlot_temporal,
    aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)),
    alpha = 0.4,
    color = NA,
    trim = FALSE,
    scale = "width",
    width = dodge_width_temporal / 2
  ) +
  geom_errorbar(
    data = emmeans_temporal_time,
    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper),
    width = 0.08,
    linewidth = 0.8,
    color = "black"
  ) +
  geom_point(
    data = emmeans_temporal_time,
    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
    size = 4,
    stroke = 1,
    color = "black"
  ) +
  labs(
    x = "Order",
    y = "Mean absolute difference from present",
    title = "TEMPORAL_DO × TIME Interaction"
  ) +
  scale_x_continuous(
    breaks = c(1, 2),
    labels = c("Past First", "Future First"),
    limits = c(0.5, 2.5)
  ) +
  scale_fill_manual(name = "Temporal Direction", values = time_colors) +
  scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) +
  theme_minimal(base_size = 13) +
  theme(
    axis.text = element_text(size = 11),
    axis.title = element_text(size = 12),
    plot.title = element_text(size = 14, hjust = 0.5),
    legend.position = "right",
    legend.title = element_text(size = 11),
    legend.text = element_text(size = 10),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
  )

print(interaction_plot_temporal)
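# Export sketch (file name and dimensions are placeholders, not from the
# original script): ggsave() writes the rendered plot to disk.
# ggsave("interaction_temporal_time.png", interaction_plot_temporal,
#        width = 8, height = 5, dpi = 300)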
# ============================================================
# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis)
# ============================================================

# Create estimated marginal means for TIME × DOMAIN
emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN)

# Prepare emmeans data frame
emmeans_time_domain <- emm_time_domain %>%
  as.data.frame() %>%
  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
  rename(
    ci_lower = lower.CL,
    ci_upper = upper.CL,
    plot_mean = emmean
  ) %>%
  mutate(
    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")),
    TIME = factor(TIME, levels = time_levels)
  )

# Prepare raw data for plotting with position offsets
dodge_width_domain <- 0.5
iPlot_domain <- long_data_clean %>%
  dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>%
  mutate(
    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")),
    TIME = factor(TIME, levels = time_levels),
    x_pos = as.numeric(DOMAIN),
    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain,
    x_dodged = x_pos + time_offset
  )

emmeans_time_domain <- emmeans_time_domain %>%
  mutate(
    x_pos = as.numeric(DOMAIN),
    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain,
    x_dodged = x_pos + time_offset
  )

# Create TIME × DOMAIN interaction plot
interaction_plot_domain <- ggplot() +
  geom_violin(
    data = iPlot_domain,
    aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)),
    alpha = 0.4,
    color = NA,
    trim = FALSE,
    scale = "width",
    width = dodge_width_domain / 2
  ) +
  geom_errorbar(
    data = emmeans_time_domain,
    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper),
    width = 0.08,
    linewidth = 0.8,
    color = "black"
  ) +
  geom_point(
    data = emmeans_time_domain,
    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
    size = 4,
    stroke = 1,
    color = "black"
  ) +
  labs(
    x = "Domain",
    y = "Mean absolute difference from present",
    title = "TIME × DOMAIN Interaction"
  ) +
  scale_x_continuous(
    breaks = c(1, 2, 3, 4),
    labels = c("Preferences", "Personality", "Values", "Life"),
    limits = c(0.5, 4.5)
  ) +
  scale_fill_manual(name = "Temporal Direction", values = time_colors) +
  scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) +
  theme_minimal(base_size = 13) +
  theme(
    axis.text = element_text(size = 11),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 12),
    plot.title = element_text(size = 14, hjust = 0.5),
    legend.position = "right",
    legend.title = element_text(size = 11),
    legend.text = element_text(size = 10),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
  )

print(interaction_plot_domain)
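# Design note, sketched as an alternative rather than a drop-in replacement
# (visual equivalence not verified against the original figures): the manual
# x_pos / time_offset / x_dodged bookkeeping above can also be expressed with
# a discrete x-axis and position_dodge(), which keeps the factor labels
# without scale_x_continuous():
# ggplot(iPlot_domain, aes(x = DOMAIN, y = MEAN_DIFFERENCE, fill = TIME)) +
#   geom_violin(alpha = 0.4, color = NA, trim = FALSE,
#               position = position_dodge(width = dodge_width_domain))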
# ============================================================
# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only)
# ============================================================

# Create fresh emmeans data for Plot 3
emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)

emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>%
  as.data.frame() %>%
  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
  rename(
    ci_lower = lower.CL,
    ci_upper = upper.CL,
    plot_mean = emmean
  ) %>%
  mutate(
    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
    TIME = factor(TIME, levels = time_levels),
    x_pos = as.numeric(TEMPORAL_DO),
    time_offset = (as.numeric(TIME) - 1.5) * 0.3,
    x_dodged = x_pos + time_offset
  )

# Create simple emmeans-only plot
interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) +
  geom_errorbar(
    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME),
    width = 0.1,
    linewidth = 1,
    alpha = 0.8
  ) +
  geom_point(
    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
    size = 5,
    stroke = 1.2,
    color = "black"
  ) +
  labs(
    x = "Order",
    y = "Mean absolute difference from present",
    title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)"
  ) +
  scale_x_continuous(
    breaks = c(1, 2),
    labels = c("Past First", "Future First"),
    limits = c(0.5, 2.5)
  ) +
  scale_color_manual(name = "Temporal Direction", values = time_colors) +
  scale_fill_manual(name = "Temporal Direction", values = time_colors) +
  scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) +
  theme_minimal(base_size = 13) +
  theme(
    axis.text = element_text(size = 11),
    axis.title = element_text(size = 12),
    plot.title = element_text(size = 14, hjust = 0.5),
    legend.position = "right",
    legend.title = element_text(size = 11),
    legend.text = element_text(size = 10),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
  )

print(interaction_plot_emmeans_only)

# ============================================================
# PLOT 4: TIME × DOMAIN INTERACTION (DOMAIN on x-axis, emmeans only)
# ============================================================

# Create estimated marginal means for TIME × DOMAIN (same grid as Plot 2)
emm_time_domain_plot4 <- emmeans(aov_model, ~ TIME * DOMAIN)

# Prepare emmeans data frame for Plot 4
emmeans_time_domain_plot4 <- emm_time_domain_plot4 %>%
  as.data.frame() %>%
  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
  rename(
    ci_lower = lower.CL,
    ci_upper = upper.CL,
    plot_mean = emmean
  ) %>%
  mutate(
    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")),
    TIME = factor(TIME, levels = time_levels)
  )

# Prepare raw data with position offsets (not drawn in this emmeans-only panel)
dodge_width_plot4 <- 0.5
iPlot_plot4 <- long_data_clean %>%
  dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>%
  mutate(
    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")),
    TIME = factor(TIME, levels = time_levels),
    x_pos = as.numeric(DOMAIN),
    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4,
    x_dodged = x_pos + time_offset
  )

emmeans_time_domain_plot4 <- emmeans_time_domain_plot4 %>%
  mutate(
    x_pos = as.numeric(DOMAIN),
    time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4,
    x_dodged = x_pos + time_offset
  )

# Create TIME × DOMAIN interaction plot (Domain on x-axis, TIME as groups) - EMMeans only
interaction_plot_time_domain_plot4 <- ggplot(emmeans_time_domain_plot4) +
  geom_errorbar(
    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME),
    width = 0.1,
    linewidth = 1,
    alpha = 0.8
  ) +
  geom_point(
    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
    size = 5,
    stroke = 1.2,
    color = "black"
  ) +
  labs(
    x = "Domain",
    y = "Mean absolute difference from present",
    title = "TIME × DOMAIN Interaction (Domain on x-axis)",
    fill = "Temporal Direction",
    shape = "Temporal Direction"
  ) +
  scale_x_continuous(
    breaks = c(1, 2, 3, 4),
    labels = c("Preferences", "Personality", "Values", "Life"),
    limits = c(0.5, 4.5)
  ) +
  scale_color_manual(name = "Temporal Direction", values = time_colors) +
  scale_fill_manual(name = "Temporal Direction", values = time_colors) +
  scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) +
  theme_minimal(base_size = 13) +
  theme(
    axis.text = element_text(size = 11),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 12),
    plot.title = element_text(size = 14, hjust = 0.5),
    legend.position = "right",
    legend.title = element_text(size = 11),
    legend.text = element_text(size = 10),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
  )

print(interaction_plot_time_domain_plot4)
\ No newline at end of file
diff --git a/.history/eohi1/mixed anova - domain means_20251006225134.r b/.history/eohi1/mixed anova - domain means_20251006225134.r
deleted file mode 100644
index 85ad0c2..0000000
--- a/.history/eohi1/mixed anova - domain means_20251006225134.r
+++ /dev/null
@@ -1,852 +0,0 @@
na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- 
unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - 
print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) 
-print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. 
TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - -# ============================================================ -# PLOT 1: TEMPORAL_DO × TIME INTERACTION -# ============================================================ - -# Create estimated marginal means for TEMPORAL_DO × TIME -emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -# Prepare emmeans data frame -emmeans_temporal_time <- emm_temporal_time %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_temporal <- 0.5 -iPlot_temporal <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal, - x_dodged = x_pos + time_offset - ) - -emmeans_temporal_time <- emmeans_temporal_time %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal, - x_dodged = x_pos + time_offset - ) - -# Create TEMPORAL_DO × TIME interaction plot -interaction_plot_temporal <- ggplot() + - geom_violin( - data = iPlot_temporal, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_temporal / 2 - ) + - geom_errorbar( - data = emmeans_temporal_time, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_temporal_time, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal) - -# ============================================================ -# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis) -# 
============================================================ - -# Create estimated marginal means for TIME × DOMAIN -emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_time_domain <- emm_time_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_domain <- 0.5 -iPlot_domain <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain <- emmeans_time_domain %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot -interaction_plot_domain <- ggplot() + - geom_violin( - data = iPlot_domain, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_domain / 2 - ) + - geom_errorbar( - data = emmeans_time_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_time_domain, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_domain) - -# ============================================================ -# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only) -# ============================================================ - -# Create fresh emmeans data for Plot 3 -emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.3, - x_dodged = x_pos + time_offset - ) - -# 
Create simple emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================ -# PLOT 4: TIME × DOMAIN INTERACTION (TIME on y-axis, DOMAIN on x-axis) -# ============================================================ - -# Create estimated marginal means for TIME × DOMAIN (reusing existing emmeans) -emm_time_domain_plot4 <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame for Plot 4 -emmeans_time_domain_plot4 <- emm_time_domain_plot4 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_plot4 <- 0.5 -iPlot_plot4 <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain_plot4 <- emmeans_time_domain_plot4 %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot (Domain on x-axis, TIME as groups) - EMMeans only -interaction_plot_time_domain_plot4 <- ggplot(emmeans_time_domain_plot4) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction (Domain on x-axis) - Estimated Marginal Means" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) 
+ - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_domain_plot4) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - domain means_20251006225138.r b/.history/eohi1/mixed anova - domain means_20251006225138.r deleted file mode 100644 index 85ad0c2..0000000 --- a/.history/eohi1/mixed anova - domain means_20251006225138.r +++ /dev/null @@ -1,852 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = 
round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups 
<- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - 
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  # Between-subjects effects (no sphericity corrections needed)
-  # Note: ezANOVA returns DFn/DFd as doubles, so %.0f is used instead of %d
-  # (sprintf("%d", ...) errors on non-integer numeric input)
-  between_effects <- c("TEMPORAL_DO")
-  for(effect in between_effects) {
-    if(effect %in% anova_table$Effect) {
-      f_value <- anova_table$F[anova_table$Effect == effect]
-      dfn <- anova_table$DFn[anova_table$Effect == effect]
-      dfd <- anova_table$DFd[anova_table$Effect == effect]
-      p_value <- anova_table$p[anova_table$Effect == effect]
-
-      cat(sprintf("%s: F(%.0f, %.0f) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value))
-    }
-  }
-
-  # Within-subjects effects (sphericity corrections where applicable)
-
-  # TIME main effect (2 levels, sphericity automatically satisfied)
-  if("TIME" %in% anova_table$Effect) {
-    f_value <- anova_table$F[anova_table$Effect == "TIME"]
-    dfn <- anova_table$DFn[anova_table$Effect == "TIME"]
-    dfd <- anova_table$DFd[anova_table$Effect == "TIME"]
-    p_value <- anova_table$p[anova_table$Effect == "TIME"]
-    cat(sprintf("TIME: F(%.0f, %.0f) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value))
-  }
-
-  # DOMAIN main effect (4 levels, needs sphericity correction;
-  # uncorrected values printed here, corrected values in the loop below)
-  if("DOMAIN" %in% anova_table$Effect) {
-    f_value <- anova_table$F[anova_table$Effect == "DOMAIN"]
-    dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"]
-    dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"]
-    p_value <- anova_table$p[anova_table$Effect == "DOMAIN"]
-    cat(sprintf("DOMAIN: F(%.0f, %.0f) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value))
-  }
-
-  # Effects requiring sphericity corrections (DOMAIN and interactions involving it)
-  for(i in seq_len(nrow(corrected_df))) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original: F(%.0f, %.0f) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  cat("\nNote: Sphericity corrections not needed (sphericity assumption met)\n")  # cat() so the \n is rendered
-}
-
-# COHEN'S D FOR MAIN EFFECTS
-
-# Create aov model for emmeans
-aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                 data = long_data_clean)
-
-# Main Effect of TIME
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print(time_emmeans)
-
-time_main_contrast <- pairs(time_emmeans, adjust = "bonferroni")  # lower case, consistent with the other calls
-
-time_main_df <- as.data.frame(time_main_contrast)
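-# TIME is a within-subjects factor, so a paired Cohen's d on participant-matched
-# scores may be more appropriate than the independent-samples version computed
-# below. A minimal sketch (assumes every pID contributes both TIME levels once
-# DOMAIN is averaged over):
-# paired_means <- long_data_clean %>%
-#   group_by(pID, TIME) %>%
-#   summarise(m = mean(MEAN_DIFFERENCE), .groups = 'drop') %>%
-#   tidyr::pivot_wider(names_from = TIME, values_from = m)
-# cohen.d(paired_means$Past, paired_means$Future, paired = TRUE)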
-print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. 
TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - -# ============================================================ -# PLOT 1: TEMPORAL_DO × TIME INTERACTION -# ============================================================ - -# Create estimated marginal means for TEMPORAL_DO × TIME -emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -# Prepare emmeans data frame -emmeans_temporal_time <- emm_temporal_time %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_temporal <- 0.5 -iPlot_temporal <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal, - x_dodged = x_pos + time_offset - ) - -emmeans_temporal_time <- emmeans_temporal_time %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal, - x_dodged = x_pos + time_offset - ) - -# Create TEMPORAL_DO × TIME interaction plot -interaction_plot_temporal <- ggplot() + - geom_violin( - data = iPlot_temporal, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_temporal / 2 - ) + - geom_errorbar( - data = emmeans_temporal_time, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_temporal_time, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal) - -# ============================================================ -# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis) -# 
============================================================ - -# Create estimated marginal means for TIME × DOMAIN -emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_time_domain <- emm_time_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_domain <- 0.5 -iPlot_domain <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain <- emmeans_time_domain %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot -interaction_plot_domain <- ggplot() + - geom_violin( - data = iPlot_domain, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_domain / 2 - ) + - geom_errorbar( - data = emmeans_time_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_time_domain, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_domain) - -# ============================================================ -# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only) -# ============================================================ - -# Create fresh emmeans data for Plot 3 -emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.3, - x_dodged = x_pos + time_offset - ) - -# 
Create simple emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================ -# PLOT 4: TIME × DOMAIN INTERACTION (TIME on y-axis, DOMAIN on x-axis) -# ============================================================ - -# Create estimated marginal means for TIME × DOMAIN (reusing existing emmeans) -emm_time_domain_plot4 <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame for Plot 4 -emmeans_time_domain_plot4 <- emm_time_domain_plot4 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_plot4 <- 0.5 -iPlot_plot4 <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain_plot4 <- emmeans_time_domain_plot4 %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot (Domain on x-axis, TIME as groups) - EMMeans only -interaction_plot_time_domain_plot4 <- ggplot(emmeans_time_domain_plot4) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction (Domain on x-axis) - Estimated Marginal Means" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) 
+ - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_domain_plot4) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - domain means_20251006225148.r b/.history/eohi1/mixed anova - domain means_20251006225148.r deleted file mode 100644 index 85ad0c2..0000000 --- a/.history/eohi1/mixed anova - domain means_20251006225148.r +++ /dev/null @@ -1,852 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = 
round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups 
<- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!rlang::sym(group_var)) %>%
-    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once, as plain vectors (single-bracket indexing on a tibble
-  # would return a one-column tibble and break sample() below)
-  group_data_list <- map(groups, ~ {
-    group_data <- data[[response_var]][data[[group_var]] == .x]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios (is.finite() is already FALSE for NA/NaN)
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# MIXED ANOVA ANALYSIS
-
-# Check for complete cases
-complete_cases <- sum(complete.cases(long_data_clean))
-print(complete_cases)
-
-# Check if design is balanced (exactly one observation per participant per cell;
-# allowing zeros here would not support the "balanced" message below)
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance == 1)) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN combinations")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = MEAN_DIFFERENCE,
-                             wid = pID,
-                             between = TEMPORAL_DO,
-                             within = .(TIME, DOMAIN),
-                             type = 3,
-                             detailed = TRUE)
-
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL # Reset row numbers to be sequential
-print(anova_output)
-
-# Show Mauchly's test for sphericity
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  # Between-subjects effects (no sphericity corrections needed)
-  # Note: ezANOVA returns DFn/DFd as doubles, so %.0f is used instead of %d
-  # (sprintf("%d", ...) errors on non-integer numeric input)
-  between_effects <- c("TEMPORAL_DO")
-  for(effect in between_effects) {
-    if(effect %in% anova_table$Effect) {
-      f_value <- anova_table$F[anova_table$Effect == effect]
-      dfn <- anova_table$DFn[anova_table$Effect == effect]
-      dfd <- anova_table$DFd[anova_table$Effect == effect]
-      p_value <- anova_table$p[anova_table$Effect == effect]
-
-      cat(sprintf("%s: F(%.0f, %.0f) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value))
-    }
-  }
-
-  # Within-subjects effects (sphericity corrections where applicable)
-
-  # TIME main effect (2 levels, sphericity automatically satisfied)
-  if("TIME" %in% anova_table$Effect) {
-    f_value <- anova_table$F[anova_table$Effect == "TIME"]
-    dfn <- anova_table$DFn[anova_table$Effect == "TIME"]
-    dfd <- anova_table$DFd[anova_table$Effect == "TIME"]
-    p_value <- anova_table$p[anova_table$Effect == "TIME"]
-    cat(sprintf("TIME: F(%.0f, %.0f) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value))
-  }
-
-  # DOMAIN main effect (4 levels, needs sphericity correction;
-  # uncorrected values printed here, corrected values in the loop below)
-  if("DOMAIN" %in% anova_table$Effect) {
-    f_value <- anova_table$F[anova_table$Effect == "DOMAIN"]
-    dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"]
-    dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"]
-    p_value <- anova_table$p[anova_table$Effect == "DOMAIN"]
-    cat(sprintf("DOMAIN: F(%.0f, %.0f) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value))
-  }
-
-  # Effects requiring sphericity corrections (DOMAIN and interactions involving it)
-  for(i in seq_len(nrow(corrected_df))) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original: F(%.0f, %.0f) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  cat("\nNote: Sphericity corrections not needed (sphericity assumption met)\n")  # cat() so the \n is rendered
-}
-
-# COHEN'S D FOR MAIN EFFECTS
-
-# Create aov model for emmeans
-aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)),
-                 data = long_data_clean)
-
-# Main Effect of TIME
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print(time_emmeans)
-
-time_main_contrast <- pairs(time_emmeans, adjust = "bonferroni")  # lower case, consistent with the other calls
-
-time_main_df <- as.data.frame(time_main_contrast)
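-# effectsize is loaded above but not otherwise used; it can convert the
-# ezANOVA F ratios to partial eta squared. A minimal sketch, assuming the
-# anova_output table produced by the ezANOVA call above:
-# with(anova_output, effectsize::F_to_eta2(f = F, df = DFn, df_error = DFd))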
-print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. 
TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - -# ============================================================ -# PLOT 1: TEMPORAL_DO × TIME INTERACTION -# ============================================================ - -# Create estimated marginal means for TEMPORAL_DO × TIME -emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -# Prepare emmeans data frame -emmeans_temporal_time <- emm_temporal_time %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_temporal <- 0.5 -iPlot_temporal <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal, - x_dodged = x_pos + time_offset - ) - -emmeans_temporal_time <- emmeans_temporal_time %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal, - x_dodged = x_pos + time_offset - ) - -# Create TEMPORAL_DO × TIME interaction plot -interaction_plot_temporal <- ggplot() + - geom_violin( - data = iPlot_temporal, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_temporal / 2 - ) + - geom_errorbar( - data = emmeans_temporal_time, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_temporal_time, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal) - -# ============================================================ -# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis) -# 
============================================================ - -# Create estimated marginal means for TIME × DOMAIN -emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_time_domain <- emm_time_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_domain <- 0.5 -iPlot_domain <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain <- emmeans_time_domain %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot -interaction_plot_domain <- ggplot() + - geom_violin( - data = iPlot_domain, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_domain / 2 - ) + - geom_errorbar( - data = emmeans_time_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_time_domain, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_domain) - -# ============================================================ -# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only) -# ============================================================ - -# Create fresh emmeans data for Plot 3 -emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.3, - x_dodged = x_pos + time_offset - ) - -# 
Create simple emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================ -# PLOT 4: TIME × DOMAIN INTERACTION (TIME on y-axis, DOMAIN on x-axis) -# ============================================================ - -# Create estimated marginal means for TIME × DOMAIN (reusing existing emmeans) -emm_time_domain_plot4 <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame for Plot 4 -emmeans_time_domain_plot4 <- emm_time_domain_plot4 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_plot4 <- 0.5 -iPlot_plot4 <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain_plot4 <- emmeans_time_domain_plot4 %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot (Domain on x-axis, TIME as groups) - EMMeans only -interaction_plot_time_domain_plot4 <- ggplot(emmeans_time_domain_plot4) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction (Domain on x-axis) - Estimated Marginal Means" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) 
+ - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_domain_plot4) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - domain means_20251006225428.r b/.history/eohi1/mixed anova - domain means_20251006225428.r deleted file mode 100644 index ae360b0..0000000 --- a/.history/eohi1/mixed anova - domain means_20251006225428.r +++ /dev/null @@ -1,852 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = 
round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups 
<- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!rlang::sym(group_var)) %>%
-    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once, as plain vectors (single-bracket indexing on a tibble
-  # would return a one-column tibble and break sample() below)
-  group_data_list <- map(groups, ~ {
-    group_data <- data[[response_var]][data[[group_var]] == .x]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios (is.finite() is already FALSE for NA/NaN)
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# MIXED ANOVA ANALYSIS
-
-# Check for complete cases
-complete_cases <- sum(complete.cases(long_data_clean))
-print(complete_cases)
-
-# Check if design is balanced (exactly one observation per participant per cell;
-# allowing zeros here would not support the "balanced" message below)
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN)
-if(all(design_balance == 1)) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN combinations")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = MEAN_DIFFERENCE,
-                             wid = pID,
-                             between = TEMPORAL_DO,
-                             within = .(TIME, DOMAIN),
-                             type = 3,
-                             detailed = TRUE)
-
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL # Reset row numbers to be sequential
-print(anova_output)
-
-# Show Mauchly's test for sphericity
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f (uncorrected; GG/HF-corrected values below)\n", dfn, dfd, f_value, p_value)) - } - - # Sphericity-corrected effects (DOMAIN and the interactions involving it) - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - cat("\nNote: Sphericity corrections not needed (sphericity assumption met)\n") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "bonferroni") - -time_main_df <- as.data.frame(time_main_contrast)
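# Sketch (not part of the original script): cohen.d() in the block below pools
# all observations and treats Past and Future as independent samples, even
# though TIME is a within-subjects factor. A paired alternative, assuming every
# pID has complete data for both TIME levels once domains are averaged:
paired_means <- long_data_clean %>%
  group_by(pID, TIME) %>%
  summarise(m = mean(MEAN_DIFFERENCE), .groups = "drop") %>%
  pivot_wider(names_from = TIME, values_from = m)
# Rows align by pID after pivot_wider, so the vectors are properly paired
print(cohen.d(paired_means$Past, paired_means$Future, paired = TRUE))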
-print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. 
TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - -# ============================================================ -# PLOT 1: TEMPORAL_DO × TIME INTERACTION -# ============================================================ - -# Create estimated marginal means for TEMPORAL_DO × TIME -emm_temporal_time <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -# Prepare emmeans data frame -emmeans_temporal_time <- emm_temporal_time %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_temporal <- 0.5 -iPlot_temporal <- long_data_clean %>% - dplyr::select(pID, TEMPORAL_DO, TIME, MEAN_DIFFERENCE) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal, - x_dodged = x_pos + time_offset - ) - -emmeans_temporal_time <- emmeans_temporal_time %>% - mutate( - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_temporal, - x_dodged = x_pos + time_offset - ) - -# Create TEMPORAL_DO × TIME interaction plot -interaction_plot_temporal <- ggplot() + - geom_violin( - data = iPlot_temporal, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_temporal / 2 - ) + - geom_errorbar( - data = emmeans_temporal_time, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_temporal_time, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_temporal) - -# ============================================================ -# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis) -# 
============================================================ - -# Create estimated marginal means for TIME × DOMAIN -emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_time_domain <- emm_time_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_domain <- 0.5 -iPlot_domain <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain <- emmeans_time_domain %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot -interaction_plot_domain <- ggplot() + - geom_violin( - data = iPlot_domain, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_domain / 2 - ) + - geom_errorbar( - data = emmeans_time_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_time_domain, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_domain) - -# ============================================================ -# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only) -# ============================================================ - -# Create fresh emmeans data for Plot 3 -emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.3, - x_dodged = x_pos + time_offset - ) - -# 
Create simple emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================ -# PLOT 4: TIME × DOMAIN INTERACTION (TIME on y-axis, DOMAIN on x-axis) -# ============================================================ - -# Create estimated marginal means for TIME × DOMAIN (reusing existing emmeans) -emm_time_domain_plot4 <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame for Plot 4 -emmeans_time_domain_plot4 <- emm_time_domain_plot4 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_plot4 <- 0.2 -iPlot_plot4 <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain_plot4 <- emmeans_time_domain_plot4 %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot (Domain on x-axis, TIME as groups) - EMMeans only -interaction_plot_time_domain_plot4 <- ggplot(emmeans_time_domain_plot4) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction (Domain on x-axis) - Estimated Marginal Means" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) 
+ - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_domain_plot4) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - domain means_20251007155303.r b/.history/eohi1/mixed anova - domain means_20251007155303.r deleted file mode 100644 index ae360b0..0000000 --- a/.history/eohi1/mixed anova - domain means_20251007155303.r +++ /dev/null @@ -1,852 +0,0 @@
diff --git a/.history/eohi1/mixed anova - domain means_20251007162057.r b/.history/eohi1/mixed anova - domain means_20251007162057.r deleted file mode 100644 index 5e40915..0000000 --- a/.history/eohi1/mixed anova - domain means_20251007162057.r +++ /dev/null @@ -1,761 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = 
round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups 
<- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - 
print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) 
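-# Optional sketch (not in the original analysis): because TIME is a within-subjects
-# factor, a paired Cohen's d is arguably more appropriate than the pooled,
-# independent-groups d computed below. Assumes every pID has complete Past and
-# Future data; time_wide_sketch is a hypothetical helper name.
-time_wide_sketch <- long_data_clean %>%
-  group_by(pID, TIME) %>%
-  summarise(m = mean(MEAN_DIFFERENCE), .groups = "drop") %>%
-  pivot_wider(names_from = TIME, values_from = m)
-print(cohen.d(time_wide_sketch$Past, time_wide_sketch$Future, paired = TRUE))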
-print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. 
TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - - -# ============================================================ -# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis) -# ============================================================ - -# Create estimated marginal means for TIME × DOMAIN -emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_time_domain <- emm_time_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_domain <- 0.5 -iPlot_domain <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain <- emmeans_time_domain %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot -interaction_plot_domain <- ggplot() + - geom_violin( - data = iPlot_domain, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_domain / 2 - ) + - geom_errorbar( - data = emmeans_time_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_time_domain, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_domain) - -# ============================================================ -# PLOT 3: TEMPORAL_DO × TIME INTERACTION 
(Emmeans only) -# ============================================================ - -# Create fresh emmeans data for Plot 3 -emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.3, - x_dodged = x_pos + time_offset - ) - -# Create simple emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================ -# PLOT 4: TIME × DOMAIN INTERACTION (TIME on y-axis, DOMAIN on x-axis) -# ============================================================ - -# Create estimated marginal means for TIME × DOMAIN (reusing existing emmeans) -emm_time_domain_plot4 <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame for Plot 4 -emmeans_time_domain_plot4 <- emm_time_domain_plot4 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_plot4 <- 0.2 -iPlot_plot4 <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain_plot4 <- emmeans_time_domain_plot4 %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot (Domain on x-axis, TIME as groups) - EMMeans only -interaction_plot_time_domain_plot4 <- 
ggplot(emmeans_time_domain_plot4) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction (Domain on x-axis) - Estimated Marginal Means" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_domain_plot4) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - domain means_20251007162101.r b/.history/eohi1/mixed anova - domain means_20251007162101.r deleted file mode 100644 index 5e40915..0000000 --- a/.history/eohi1/mixed anova - domain means_20251007162101.r +++ /dev/null @@ -1,761 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with 
missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - # Extract the response as a plain vector: data arrives as a tibble (via pick()), so single-bracket indexing would return a one-column tibble and break sample() - group_data <- data[[response_var]][data[[group_var]] == .x] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in seq_len(n_iter)) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - 
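# Note: data1 and data2 pool every observation within each domain,
-          # collapsing over TIME and TEMPORAL_DO, so this d ignores the
-          # repeated-measures structure and is best read descriptively.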
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - - -# ============================================================ -# PLOT 2: TIME × DOMAIN INTERACTION (DOMAIN on x-axis) -# ============================================================ - -# Create estimated marginal means for TIME × DOMAIN -emm_time_domain <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_time_domain <- emm_time_domain %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_domain <- 0.5 -iPlot_domain <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain <- emmeans_time_domain %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_domain, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot -interaction_plot_domain <- ggplot() + - geom_violin( - data = iPlot_domain, - aes(x = x_dodged, y = MEAN_DIFFERENCE, fill = TIME, group = interaction(x_pos, TIME)), - alpha = 0.4, - color = NA, - trim = FALSE, - scale = "width", - width = dodge_width_domain / 2 - ) + - geom_errorbar( - data = emmeans_time_domain, - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper), - width = 0.08, - linewidth = 0.8, - color = "black" - ) + - geom_point( - data = emmeans_time_domain, - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 4, - stroke = 1, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_domain) - -# ============================================================ -# PLOT 3: TEMPORAL_DO × TIME INTERACTION 
(Emmeans only) -# ============================================================ - -# Create fresh emmeans data for Plot 3 -emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.3, - x_dodged = x_pos + time_offset - ) - -# Create simple emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================ -# PLOT 4: TIME × DOMAIN INTERACTION (TIME on y-axis, DOMAIN on x-axis) -# ============================================================ - -# Create estimated marginal means for TIME × DOMAIN (reusing existing emmeans) -emm_time_domain_plot4 <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame for Plot 4 -emmeans_time_domain_plot4 <- emm_time_domain_plot4 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_plot4 <- 0.2 -iPlot_plot4 <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain_plot4 <- emmeans_time_domain_plot4 %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot (Domain on x-axis, TIME as groups) - EMMeans only -interaction_plot_time_domain_plot4 <- 
ggplot(emmeans_time_domain_plot4) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction (Domain on x-axis) - Estimated Marginal Means" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_domain_plot4) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - domain means_20251007162111.r b/.history/eohi1/mixed anova - domain means_20251007162111.r deleted file mode 100644 index 6e5cdc0..0000000 --- a/.history/eohi1/mixed anova - domain means_20251007162111.r +++ /dev/null @@ -1,669 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with 
missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - # Extract the response as a plain vector: data arrives as a tibble (via pick()), so single-bracket indexing would return a one-column tibble and break sample() - group_data <- data[[response_var]][data[[group_var]] == .x] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in seq_len(n_iter)) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - 
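# Note: data1 and data2 pool every observation within each domain,
-          # collapsing over TIME and TEMPORAL_DO, so this d ignores the
-          # repeated-measures structure and is best read descriptively.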
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - - - -# ============================================================ -# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only) -# ============================================================ - -# Create fresh emmeans data for Plot 3 -emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.3, - x_dodged = x_pos + time_offset - ) - -# Create simple emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================ -# PLOT 4: TIME × DOMAIN INTERACTION (TIME on y-axis, DOMAIN on x-axis) -# ============================================================ - -# Create estimated marginal means for TIME × DOMAIN (reusing existing emmeans) -emm_time_domain_plot4 <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame for Plot 4 -emmeans_time_domain_plot4 <- emm_time_domain_plot4 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_plot4 <- 0.2 -iPlot_plot4 <- 
long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain_plot4 <- emmeans_time_domain_plot4 %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot (Domain on x-axis, TIME as groups) - EMMeans only -interaction_plot_time_domain_plot4 <- ggplot(emmeans_time_domain_plot4) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction (Domain on x-axis) - Estimated Marginal Means" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_domain_plot4) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - domain means_20251007162115.r b/.history/eohi1/mixed anova - domain means_20251007162115.r deleted file mode 100644 index 6e5cdc0..0000000 --- a/.history/eohi1/mixed anova - domain means_20251007162115.r +++ /dev/null @@ -1,669 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", 
"Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For 
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - 
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - - - -# ============================================================ -# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only) -# ============================================================ - -# Create fresh emmeans data for Plot 3 -emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.3, - x_dodged = x_pos + time_offset - ) - -# Create simple emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================ -# PLOT 4: TIME × DOMAIN INTERACTION (TIME on y-axis, DOMAIN on x-axis) -# ============================================================ - -# Create estimated marginal means for TIME × DOMAIN (reusing existing emmeans) -emm_time_domain_plot4 <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame for Plot 4 -emmeans_time_domain_plot4 <- emm_time_domain_plot4 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_plot4 <- 0.2 -iPlot_plot4 <- 
long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain_plot4 <- emmeans_time_domain_plot4 %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot (Domain on x-axis, TIME as groups) - EMMeans only -interaction_plot_time_domain_plot4 <- ggplot(emmeans_time_domain_plot4) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction (Domain on x-axis) - Estimated Marginal Means" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_domain_plot4) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - domain means_20251007162122.r b/.history/eohi1/mixed anova - domain means_20251007162122.r deleted file mode 100644 index d3d6957..0000000 --- a/.history/eohi1/mixed anova - domain means_20251007162122.r +++ /dev/null @@ -1,723 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", 
"Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For 
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - 
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - - - -# ============================================================ -# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only) -# ============================================================ - -# Create fresh emmeans data for Plot 3 -emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.3, - x_dodged = x_pos + time_offset - ) - -# Create simple emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================ -# PLOT 4: TIME × DOMAIN INTERACTION (TIME on y-axis, DOMAIN on x-axis) -# ============================================================ - -# Create estimated marginal means for TIME × DOMAIN (reusing existing emmeans) -emm_time_domain_plot4 <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame for Plot 4 -emmeans_time_domain_plot4 <- emm_time_domain_plot4 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_plot4 <- 0.2 -iPlot_plot4 <- 
long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain_plot4 <- emmeans_time_domain_plot4 %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot (Domain on x-axis, TIME as groups) - EMMeans only -interaction_plot_time_domain_plot4 <- ggplot(emmeans_time_domain_plot4) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction (Domain on x-axis) - Estimated Marginal Means" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_domain_plot4) - -# ============================================================ -# PLOT 5: TIME MAIN EFFECT (Emmeans + Error Bars Only) -# ============================================================ - -# Prepare emmeans data frame for TIME main effect -time_main_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with existing emmeans-only plots) -time_main_plot <- ggplot(time_main_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - 
legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - domain means_20251007162130.r b/.history/eohi1/mixed anova - domain means_20251007162130.r deleted file mode 100644 index d3d6957..0000000 --- a/.history/eohi1/mixed anova - domain means_20251007162130.r +++ /dev/null @@ -1,723 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - 
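# ------------------------------------------------------------
# A minimal, self-contained sketch of the bootstrap Hartley F-max
# check used in the assumption-testing block below. Everything here
# (object names, toy data) is illustrative only and not part of the
# original script; it mirrors the script's approach of resampling
# within each group and comparing the observed variance ratio to
# the bootstrap 95th percentile.
set.seed(42)
toy <- data.frame(
  group = rep(c("A", "B"), each = 50),
  y     = c(rnorm(50, sd = 1), rnorm(50, sd = 1.5))
)
group_vars    <- tapply(toy$y, toy$group, var)
observed_fmax <- max(group_vars) / min(group_vars)
boot_fmax <- replicate(1000, {
  v <- tapply(toy$y, toy$group, function(g) var(sample(g, replace = TRUE)))
  max(v) / min(v)
})
observed_fmax > quantile(boot_fmax, 0.95)  # TRUE would flag variance heterogeneity
# ------------------------------------------------------------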
-print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate 
observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = 
anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - cat("\nNote: Sphericity corrections not needed (sphericity assumption met)\n") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), 
length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. 
TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - - - -# ============================================================ -# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only) -# ============================================================ - -# Create fresh emmeans data for Plot 3 -emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.3, - x_dodged = x_pos + time_offset - ) - -# Create simple emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================ -# PLOT 4: TIME × DOMAIN INTERACTION (DOMAIN on x-axis, TIME as grouping variable) -# ============================================================ - -# Create fresh estimated marginal means for TIME × DOMAIN for this plot -emm_time_domain_plot4 <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame for Plot 4 -emmeans_time_domain_plot4 <- emm_time_domain_plot4 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_plot4 <- 0.2 -iPlot_plot4 <- 
long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain_plot4 <- emmeans_time_domain_plot4 %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot (Domain on x-axis, TIME as groups) - EMMeans only -interaction_plot_time_domain_plot4 <- ggplot(emmeans_time_domain_plot4) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction (Domain on x-axis) - Estimated Marginal Means" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_domain_plot4) - -# ============================================================ -# PLOT 5: TIME MAIN EFFECT (Emmeans + Error Bars Only) -# ============================================================ - -# Prepare emmeans data frame for TIME main effect -time_main_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with existing emmeans-only plots) -time_main_plot <- ggplot(time_main_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - 
legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - domain means_20251007162138.r b/.history/eohi1/mixed anova - domain means_20251007162138.r deleted file mode 100644 index d3d6957..0000000 --- a/.history/eohi1/mixed anova - domain means_20251007162138.r +++ /dev/null @@ -1,723 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - 
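# ------------------------------------------------------------
# A worked micro-example of the sphericity correction applied further
# down in this script: Greenhouse-Geisser (and Huynh-Feldt) corrected
# degrees of freedom are simply epsilon times the uncorrected df; the
# F statistic itself is unchanged and only the p-value moves. All
# numbers below are illustrative, not results from this data set.
dfn <- 3; dfd <- 297   # e.g., a 4-level within-subjects factor, n = 100
gg_epsilon <- 0.85     # epsilon as reported in ezANOVA's sphericity table
c(GG_DFn = dfn * gg_epsilon, GG_DFd = dfd * gg_epsilon)
# p-value under the corrected df, for a hypothetical F of 4.2:
pf(4.2, dfn * gg_epsilon, dfd * gg_epsilon, lower.tail = FALSE)
# ------------------------------------------------------------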
-print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate 
observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = 
anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - cat("\nNote: Sphericity corrections not needed (sphericity assumption met)\n") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), 
length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. 
TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - - - -# ============================================================ -# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only) -# ============================================================ - -# Create fresh emmeans data for Plot 3 -emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.3, - x_dodged = x_pos + time_offset - ) - -# Create simple emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================ -# PLOT 4: TIME × DOMAIN INTERACTION (DOMAIN on x-axis, TIME as grouping variable) -# ============================================================ - -# Create fresh estimated marginal means for TIME × DOMAIN for this plot -emm_time_domain_plot4 <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame for Plot 4 -emmeans_time_domain_plot4 <- emm_time_domain_plot4 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_plot4 <- 0.2 -iPlot_plot4 <- 
long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain_plot4 <- emmeans_time_domain_plot4 %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot (Domain on x-axis, TIME as groups) - EMMeans only -interaction_plot_time_domain_plot4 <- ggplot(emmeans_time_domain_plot4) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction (Domain on x-axis) - Estimated Marginal Means" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_domain_plot4) - -# ============================================================ -# PLOT 5: TIME MAIN EFFECT (Emmeans + Error Bars Only) -# ============================================================ - -# Prepare emmeans data frame for TIME main effect -time_main_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with existing emmeans-only plots) -time_main_plot <- ggplot(time_main_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - 
legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - domain means_20251007162843.r b/.history/eohi1/mixed anova - domain means_20251007162843.r deleted file mode 100644 index d3d6957..0000000 --- a/.history/eohi1/mixed anova - domain means_20251007162843.r +++ /dev/null @@ -1,723 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - 
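# ------------------------------------------------------------
# A minimal sketch of the effect-size step used later in this script,
# which calls effsize::cohen.d() on the raw vectors for each pair of
# conditions. The data below are synthetic and the names illustrative;
# for within-subject contrasts such as TIME, the paired variant may be
# the more appropriate choice.
library(effsize)
set.seed(1)
x1 <- rnorm(100, mean = 0.5)    # e.g., Past condition
x2 <- rnorm(100, mean = 0.3)    # e.g., Future condition
cohen.d(x1, x2)                 # independent-samples d, as in this script
cohen.d(x1, x2, paired = TRUE)  # paired alternative for repeated measures
# ------------------------------------------------------------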
-print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate 
observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = 
anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), 
length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. 
TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - - - -# ============================================================ -# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only) -# ============================================================ - -# Create fresh emmeans data for Plot 3 -emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.3, - x_dodged = x_pos + time_offset - ) - -# Create simple emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================ -# PLOT 4: TIME × DOMAIN INTERACTION (TIME on y-axis, DOMAIN on x-axis) -# ============================================================ - -# Create estimated marginal means for TIME × DOMAIN (reusing existing emmeans) -emm_time_domain_plot4 <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame for Plot 4 -emmeans_time_domain_plot4 <- emm_time_domain_plot4 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_plot4 <- 0.2 -iPlot_plot4 <- 
long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain_plot4 <- emmeans_time_domain_plot4 %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot (Domain on x-axis, TIME as groups) - EMMeans only -interaction_plot_time_domain_plot4 <- ggplot(emmeans_time_domain_plot4) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction (Domain on x-axis) - Estimated Marginal Means" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_domain_plot4) - -# ============================================================ -# PLOT 5: TIME MAIN EFFECT (Emmeans + Error Bars Only) -# ============================================================ - -# Prepare emmeans data frame for TIME main effect -time_main_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with existing emmeans-only plots) -time_main_plot <- ggplot(time_main_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - 
legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - domain means_20251007182023.r b/.history/eohi1/mixed anova - domain means_20251007182023.r deleted file mode 100644 index 80640de..0000000 --- a/.history/eohi1/mixed anova - domain means_20251007182023.r +++ /dev/null @@ -1,768 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - 
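-# Optional (a minimal sketch; order_means_wide is an illustrative name used
-# only here): view the two order conditions side by side, assuming the
-# TEMPORAL_DO levels "01PAST" and "02FUT" used throughout this script.
-order_means_wide <- desc_stats_by_temporal %>%
-  select(TEMPORAL_DO, TIME, DOMAIN, mean) %>%
-  pivot_wider(names_from = TEMPORAL_DO, values_from = mean)
-print(order_means_wide)
-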
-print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate 
observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = 
anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), 
length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. 
TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION
-
-# Get simple effects of TIME within each DOMAIN (Past vs Future contrasts for each domain)
-time_domain_emmeans2 <- emmeans(aov_model, ~ TIME | DOMAIN)
-time_domain_simple2 <- pairs(time_domain_emmeans2, adjust = "bonferroni")
-time_domain_simple2_df <- as.data.frame(time_domain_simple2)
-print("=== TIME × DOMAIN INTERACTION: Simple Effects of TIME within each DOMAIN ===")
-print("Past vs Future contrasts for each domain:")
-print(time_domain_simple2_df)
-
-# Calculate Cohen's d for Past vs Future contrasts within each domain
-cat("\n=== COHEN'S D FOR TIME CONTRASTS WITHIN EACH DOMAIN ===\n")
-significant_time_domain <- time_domain_simple2_df[time_domain_simple2_df$p.value < 0.05, ]
-
-if(nrow(significant_time_domain) > 0) {
-  print("Significant Past vs Future contrasts within domains (p < 0.05):")
-  print(significant_time_domain)
-
-  cat("\nCohen's d calculations for Past vs Future within each domain:\n")
-
-  for(i in seq_len(nrow(time_domain_simple2_df))) {
-    comparison <- time_domain_simple2_df[i, ]
-    domain_level <- as.character(comparison$DOMAIN)
-
-    # Get data for Past and Future within this domain
-    past_data <- long_data_clean$MEAN_DIFFERENCE[
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$TIME == "Past"
-    ]
-
-    future_data <- long_data_clean$MEAN_DIFFERENCE[
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$TIME == "Future"
-    ]
-
-    if(length(past_data) > 0 && length(future_data) > 0) {
-      # Calculate Cohen's d using effsize package
-      cohens_d_result <- cohen.d(past_data, future_data)
-
-      cat(sprintf("Domain: %s\n", domain_level))
-      cat("  Past vs Future comparison\n")
-      cat(sprintf("  n_Past = %d, n_Future = %d\n", length(past_data), length(future_data)))
-      cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-      cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-      cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-      cat(sprintf("  Estimated difference: %.5f\n", comparison$estimate))
-      cat("\n")
-    }
-  }
-} else {
-  cat("No significant Past vs Future contrasts found within any domain.\n")
-}
-
-# INTERACTION PLOTS
-
-# Define color palettes
-time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F")
-domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F",
-                   "Values" = "#FFB000", "Life" = "#FE6100")
-
-# Define TIME levels (Past, Future order)
-time_levels <- c("Past", "Future")
-
-
-
-# ============================================================
-# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only)
-# ============================================================
-
-# Create fresh emmeans data for Plot 3
-emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)
-
-emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    TIME = factor(TIME, levels = time_levels),
-    x_pos = as.numeric(TEMPORAL_DO),
-    time_offset = (as.numeric(TIME) - 1.5) * 0.3,
-    x_dodged = x_pos + time_offset
-  )
-
-# Create simple emmeans-only plot
-interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) +
-  geom_errorbar(
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME),
-    width = 0.1,
-    linewidth = 1,
-    alpha = 0.8
-  ) +
-  geom_point(
-    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
-    size = 5,
-    stroke = 1.2,
-    color = "black"
-  ) +
-  labs(
-    x = "Order",
-    y = "Mean absolute difference from present",
-    title = "TEMPORAL_DO × 
TIME Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================ -# PLOT 4: TIME × DOMAIN INTERACTION (TIME on y-axis, DOMAIN on x-axis) -# ============================================================ - -# Create estimated marginal means for TIME × DOMAIN (reusing existing emmeans) -emm_time_domain_plot4 <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame for Plot 4 -emmeans_time_domain_plot4 <- emm_time_domain_plot4 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_plot4 <- 0.2 -iPlot_plot4 <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain_plot4 <- emmeans_time_domain_plot4 %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot (Domain on x-axis, TIME as groups) - EMMeans only -interaction_plot_time_domain_plot4 <- ggplot(emmeans_time_domain_plot4) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction (Domain on x-axis) - Estimated Marginal Means" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = 
element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_domain_plot4) - -# ============================================================ -# PLOT 5: TIME MAIN EFFECT (Emmeans + Error Bars Only) -# ============================================================ - -# Prepare emmeans data frame for TIME main effect -time_main_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with existing emmeans-only plots) -time_main_plot <- ggplot(time_main_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - domain means_20251007182033.r b/.history/eohi1/mixed anova - domain means_20251007182033.r deleted file mode 100644 index 80640de..0000000 --- a/.history/eohi1/mixed anova - domain means_20251007182033.r +++ /dev/null @@ -1,768 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - 
select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For 
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - 
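# Hedged aside, sketch only: cohen.d(data1, data2) above treats the two domains as independent samples, but DOMAIN is a within-subjects factor; effsize::cohen.d() also accepts a paired estimate when both vectors are aligned by the same participant order: -# d_paired <- cohen.d(data1, data2, paired = TRUE)  # 'd_paired' is an illustrative name, not part of the original script - 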
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of TIME within each DOMAIN (Past vs Future contrasts for each domain) -time_domain_emmeans2 <- emmeans(aov_model, ~ TIME | DOMAIN) -time_domain_simple2 <- pairs(time_domain_emmeans2, adjust = "bonferroni") -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -print("=== TIME × DOMAIN INTERACTION: Simple Effects of TIME within each DOMAIN ===") -print("Past vs Future contrasts for each domain:") -print(time_domain_simple2_df) - -# Calculate Cohen's d for Past vs Future contrasts within each domain -print("\n=== COHEN'S D FOR TIME CONTRASTS WITHIN EACH DOMAIN ===") -significant_time_domain <- time_domain_simple2_df[time_domain_simple2_df$p.value < 0.05, ] - -if(nrow(significant_time_domain) > 0) { - print("Significant Past vs Future contrasts within domains (p < 0.05):") - print(significant_time_domain) - - print("\nCohen's d calculations for Past vs Future within each domain:") - - for(i in seq_len(nrow(time_domain_simple2_df))) { - comparison <- time_domain_simple2_df[i, ] - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this domain - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("Domain: %s\n", domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant Past vs Future contrasts found within any domain.\n") -} - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - - - -# ============================================================ -# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only) -# ============================================================ - -# Create fresh emmeans data for Plot 3 -emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.3, - x_dodged = x_pos + time_offset - ) - -# Create simple emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × 
TIME Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================ -# PLOT 4: TIME × DOMAIN INTERACTION (TIME on y-axis, DOMAIN on x-axis) -# ============================================================ - -# Create estimated marginal means for TIME × DOMAIN (reusing existing emmeans) -emm_time_domain_plot4 <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame for Plot 4 -emmeans_time_domain_plot4 <- emm_time_domain_plot4 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_plot4 <- 0.2 -iPlot_plot4 <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain_plot4 <- emmeans_time_domain_plot4 %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot (Domain on x-axis, TIME as groups) - EMMeans only -interaction_plot_time_domain_plot4 <- ggplot(emmeans_time_domain_plot4) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction (Domain on x-axis) - Estimated Marginal Means" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = 
element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_domain_plot4) - -# ============================================================ -# PLOT 5: TIME MAIN EFFECT (Emmeans + Error Bars Only) -# ============================================================ - -# Prepare emmeans data frame for TIME main effect -time_main_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with existing emmeans-only plots) -time_main_plot <- ggplot(time_main_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - domain means_20251007182343.r b/.history/eohi1/mixed anova - domain means_20251007182343.r deleted file mode 100644 index 80640de..0000000 --- a/.history/eohi1/mixed anova - domain means_20251007182343.r +++ /dev/null @@ -1,768 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - 
select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For 
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - 
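# Hedged aside, sketch only: with small cells the plain d estimate is biased upward; effsize::cohen.d() provides Hedges' small-sample correction through its hedges.correction argument: -# d_hedges <- cohen.d(data1, data2, hedges.correction = TRUE)  # 'd_hedges' is an illustrative name, not part of the original script - 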
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of TIME within each DOMAIN (Past vs Future contrasts for each domain) -time_domain_emmeans2 <- emmeans(aov_model, ~ TIME | DOMAIN) -time_domain_simple2 <- pairs(time_domain_emmeans2, adjust = "bonferroni") -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -print("=== TIME × DOMAIN INTERACTION: Simple Effects of TIME within each DOMAIN ===") -print("Past vs Future contrasts for each domain:") -print(time_domain_simple2_df) - -# Calculate Cohen's d for Past vs Future contrasts within each domain -print("\n=== COHEN'S D FOR TIME CONTRASTS WITHIN EACH DOMAIN ===") -significant_time_domain <- time_domain_simple2_df[time_domain_simple2_df$p.value < 0.05, ] - -if(nrow(significant_time_domain) > 0) { - print("Significant Past vs Future contrasts within domains (p < 0.05):") - print(significant_time_domain) - - print("\nCohen's d calculations for Past vs Future within each domain:") - - for(i in seq_len(nrow(time_domain_simple2_df))) { - comparison <- time_domain_simple2_df[i, ] - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this domain - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("Domain: %s\n", domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant Past vs Future contrasts found within any domain.\n") -} - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - - - -# ============================================================ -# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only) -# ============================================================ - -# Create fresh emmeans data for Plot 3 -emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.3, - x_dodged = x_pos + time_offset - ) - -# Create simple emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × 
TIME Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================ -# PLOT 4: TIME × DOMAIN INTERACTION (TIME on y-axis, DOMAIN on x-axis) -# ============================================================ - -# Create estimated marginal means for TIME × DOMAIN (reusing existing emmeans) -emm_time_domain_plot4 <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame for Plot 4 -emmeans_time_domain_plot4 <- emm_time_domain_plot4 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_plot4 <- 0.2 -iPlot_plot4 <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain_plot4 <- emmeans_time_domain_plot4 %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot (Domain on x-axis, TIME as groups) - EMMeans only -interaction_plot_time_domain_plot4 <- ggplot(emmeans_time_domain_plot4) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction (Domain on x-axis) - Estimated Marginal Means" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = 
element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_domain_plot4) - -# ============================================================ -# PLOT 5: TIME MAIN EFFECT (Emmeans + Error Bars Only) -# ============================================================ - -# Prepare emmeans data frame for TIME main effect -time_main_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with existing emmeans-only plots) -time_main_plot <- ggplot(time_main_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - domain means_20251007182949.r b/.history/eohi1/mixed anova - domain means_20251007182949.r deleted file mode 100644 index 8d5ac17..0000000 --- a/.history/eohi1/mixed anova - domain means_20251007182949.r +++ /dev/null @@ -1,768 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - 
select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For 
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - 
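# Hedged aside, sketch only: the effectsize package loaded at the top of this script offers a comparable estimate with a confidence interval: -# d_ci <- effectsize::cohens_d(data1, data2)  # 'd_ci' is an illustrative name; returns d plus a 95% CI - 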
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
# 2. TIME × DOMAIN INTERACTION

# Get simple effects of TIME within each DOMAIN (Past vs Future contrasts for each domain)
time_domain_simple_df <- as.data.frame(time_domain_simple)
print("=== TIME × DOMAIN INTERACTION: Simple Effects of TIME within each DOMAIN ===")
print("Past vs Future contrasts for each domain:")
print(time_domain_simple_df)

# Calculate Cohen's d for Past vs Future contrasts within each domain
cat("\n=== COHEN'S D FOR TIME CONTRASTS WITHIN EACH DOMAIN ===\n")
significant_time_domain <- time_domain_simple_df[time_domain_simple_df$p.value < 0.05, ]

if(nrow(significant_time_domain) > 0) {
  print("Significant Past vs Future contrasts within domains (p < 0.05):")
  print(significant_time_domain)

  cat("\nCohen's d calculations for Past vs Future within each domain:\n")

  for(i in seq_len(nrow(time_domain_simple_df))) {
    comparison <- time_domain_simple_df[i, ]
    domain_level <- as.character(comparison$DOMAIN)

    # Get data for Past and Future within this domain
    past_data <- long_data_clean$MEAN_DIFFERENCE[
      long_data_clean$DOMAIN == domain_level &
      long_data_clean$TIME == "Past"
    ]

    future_data <- long_data_clean$MEAN_DIFFERENCE[
      long_data_clean$DOMAIN == domain_level &
      long_data_clean$TIME == "Future"
    ]

    if(length(past_data) > 0 && length(future_data) > 0) {
      # Calculate Cohen's d using the effsize package
      cohens_d_result <- cohen.d(past_data, future_data)

      cat(sprintf("Domain: %s\n", domain_level))
      cat(sprintf("  Past vs Future comparison\n"))
      cat(sprintf("  n_Past = %d, n_Future = %d\n", length(past_data), length(future_data)))
      cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
      cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
      cat(sprintf("  p-value: %.5f\n", comparison$p.value))
      cat(sprintf("  Estimated difference: %.5f\n", comparison$estimate))
      cat("\n")
    }
  }
} else {
  cat("No significant Past vs Future contrasts found within any domain.\n")
}

# INTERACTION PLOTS

# Define color palettes
time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F")
domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F",
                   "Values" = "#FFB000", "Life" = "#FE6100")

# Define TIME levels (Past, Future order)
time_levels <- c("Past", "Future")

# ============================================================
# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only)
# ============================================================

# Create fresh emmeans data for Plot 3
emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME)

emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>%
  as.data.frame() %>%
  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
  rename(
    ci_lower = lower.CL,
    ci_upper = upper.CL,
    plot_mean = emmean
  ) %>%
  mutate(
    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
    TIME = factor(TIME, levels = time_levels),
    x_pos = as.numeric(TEMPORAL_DO),
    time_offset = (as.numeric(TIME) - 1.5) * 0.3,
    x_dodged = x_pos + time_offset
  )

# Create simple emmeans-only plot
interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) +
  geom_errorbar(
    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME),
    width = 0.1,
    linewidth = 1,
    alpha = 0.8
  ) +
  geom_point(
    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
    size = 5,
    stroke = 1.2,
    color = "black"
  ) +
  labs(
    x = "Order",
    y = "Mean absolute difference from present",
    title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)"
  ) +
  scale_x_continuous(
    breaks = c(1, 2),
    labels = c("Past First", "Future First"),
    limits = c(0.5, 2.5)
  ) +
  scale_color_manual(name = "Temporal Direction", values = time_colors) +
  scale_fill_manual(name = "Temporal Direction", values = time_colors) +
  scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) +
  theme_minimal(base_size = 13) +
  theme(
    axis.text = element_text(size = 11),
    axis.title = element_text(size = 12),
    plot.title = element_text(size = 14, hjust = 0.5),
    legend.position = "right",
    legend.title = element_text(size = 11),
    legend.text = element_text(size = 10),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
  )

print(interaction_plot_emmeans_only)
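# The manual x_pos/x_dodged arithmetic above keeps full control of the
# offsets; an equivalent sketch that leans on ggplot2's own dodging
# (factor x-axis, same emmeans data) would be:
# ggplot(emmeans_temporal_time_simple,
#        aes(x = TEMPORAL_DO, y = plot_mean, ymin = ci_lower, ymax = ci_upper,
#            color = TIME, fill = TIME, shape = TIME)) +
#   geom_errorbar(position = position_dodge(width = 0.3), width = 0.1, linewidth = 1) +
#   geom_point(position = position_dodge(width = 0.3), size = 5, color = "black") +
#   scale_shape_manual(values = c(21, 22))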
Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================ -# PLOT 4: TIME × DOMAIN INTERACTION (TIME on y-axis, DOMAIN on x-axis) -# ============================================================ - -# Create estimated marginal means for TIME × DOMAIN (reusing existing emmeans) -emm_time_domain_plot4 <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame for Plot 4 -emmeans_time_domain_plot4 <- emm_time_domain_plot4 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_plot4 <- 0.2 -iPlot_plot4 <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain_plot4 <- emmeans_time_domain_plot4 %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot (Domain on x-axis, TIME as groups) - EMMeans only -interaction_plot_time_domain_plot4 <- ggplot(emmeans_time_domain_plot4) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction (Domain on x-axis) - Estimated Marginal Means" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = 
element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_domain_plot4) - -# ============================================================ -# PLOT 5: TIME MAIN EFFECT (Emmeans + Error Bars Only) -# ============================================================ - -# Prepare emmeans data frame for TIME main effect -time_main_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with existing emmeans-only plots) -time_main_plot <- ggplot(time_main_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - domain means_20251007182951.r b/.history/eohi1/mixed anova - domain means_20251007182951.r deleted file mode 100644 index 8d5ac17..0000000 --- a/.history/eohi1/mixed anova - domain means_20251007182951.r +++ /dev/null @@ -1,768 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - 
diff --git a/.history/eohi1/mixed anova - domain means_20251007182953.r b/.history/eohi1/mixed anova - domain means_20251007182953.r
deleted file mode 100644
index 8d5ac17..0000000
--- a/.history/eohi1/mixed anova - domain means_20251007182953.r
+++ /dev/null
@@ -1,768 +0,0 @@
# Mixed ANOVA Analysis for Domain Means
# EOHI Experiment Data Analysis - Domain Level Analysis
# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life
#            NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life

library(tidyverse)
library(ez)
library(car)
library(nortest)     # For normality tests
library(emmeans)     # For post-hoc comparisons
library(purrr)       # For map functions
library(effsize)     # For Cohen's d calculations
library(effectsize)  # For effect size calculations
library(ggplot2)     # For plotting

options(scipen = 999)

options(contrasts = c("contr.sum", "contr.poly"))

setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")

# Read the data
data <- read.csv("exp1.csv")

required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
                   "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life")

# Define domain mapping
domain_mapping <- data.frame(
  variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life",
               "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"),
  time = c(rep("Past", 4), rep("Future", 4)),
  domain = rep(c("Preferences", "Personality", "Values", "Life"), 2),
  stringsAsFactors = FALSE
)

long_data <- data %>%
  select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>%
  pivot_longer(
    cols = all_of(required_vars),
    names_to = "variable",
    values_to = "MEAN_DIFFERENCE"
  ) %>%
  left_join(domain_mapping, by = "variable") %>%
  # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping)
  mutate(
    TIME = factor(time, levels = c("Past", "Future")),
    DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")),
    pID = as.factor(pID),
    TEMPORAL_DO = as.factor(TEMPORAL_DO)
  ) %>%
  # Select final columns and remove any rows with missing values
  select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>%
  filter(!is.na(MEAN_DIFFERENCE))
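# The mapping table plus left_join above is explicit and easy to audit.
# The same reshape can be done in one step, since time and domain are
# encoded in the variable names; a sketch, assuming the NPast_/NFut_
# naming holds for all eight columns ('Fut' and the domain suffixes
# would still need recoding to the factor labels used below):
long_data_alt <- data %>%
  select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>%
  pivot_longer(
    cols = all_of(required_vars),
    names_pattern = "N(Past|Fut)_mean_(.*)",
    names_to = c("time", "domain"),
    values_to = "MEAN_DIFFERENCE"
  )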
# Overall descriptive statistics by TIME and DOMAIN
desc_stats <- long_data %>%
  group_by(TIME, DOMAIN) %>%
  summarise(
    n = n(),
    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
    median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5),
    q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5),
    q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5),
    min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5),
    max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5),
    .groups = 'drop'
  )

print(desc_stats)

# Descriptive statistics by the between-subjects factor
desc_stats_by_temporal <- long_data %>%
  group_by(TEMPORAL_DO, TIME, DOMAIN) %>%
  summarise(
    n = n(),
    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
    .groups = 'drop'
  )

print(desc_stats_by_temporal)

# ASSUMPTION TESTING

# 1. Missing values check
missing_summary <- long_data %>%
  group_by(TIME, DOMAIN) %>%
  summarise(
    n_total = n(),
    n_missing = sum(is.na(MEAN_DIFFERENCE)),
    pct_missing = round(100 * n_missing / n_total, 2),
    .groups = 'drop'
  )

print(missing_summary)

# Create clean dataset (long_data is already filtered for NA values)
long_data_clean <- long_data

# 2. Outlier detection
outlier_summary <- long_data_clean %>%
  group_by(TIME, DOMAIN) %>%
  summarise(
    n = n(),
    mean = mean(MEAN_DIFFERENCE),
    sd = sd(MEAN_DIFFERENCE),
    q1 = quantile(MEAN_DIFFERENCE, 0.25),
    median = median(MEAN_DIFFERENCE),
    q3 = quantile(MEAN_DIFFERENCE, 0.75),
    iqr = q3 - q1,
    lower_bound = q1 - 1.5 * iqr,
    upper_bound = q3 + 1.5 * iqr,
    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
    .groups = 'drop'
  )

print(outlier_summary)

# 3. Anderson-Darling normality test (streamlined)
normality_results <- long_data_clean %>%
  group_by(TIME, DOMAIN) %>%
  summarise(
    n = n(),
    ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic,
    ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value,
    .groups = 'drop'
  )

print(normality_results)

# 4. Homogeneity of variance (Levene's test)
# Test homogeneity across TIME within each DOMAIN
# (pass the current group explicitly via pick() so leveneTest() does not
# depend on formula-environment lookup inside summarise())
homogeneity_time <- long_data_clean %>%
  group_by(DOMAIN) %>%
  summarise(
    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME, data = pick(MEAN_DIFFERENCE, TIME))$`F value`[1],
    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME, data = pick(MEAN_DIFFERENCE, TIME))$`Pr(>F)`[1],
    .groups = 'drop'
  )

print(homogeneity_time)

# Test homogeneity across DOMAIN within each TIME
homogeneity_domain <- long_data_clean %>%
  group_by(TIME) %>%
  summarise(
    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN, data = pick(MEAN_DIFFERENCE, DOMAIN))$`F value`[1],
    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN, data = pick(MEAN_DIFFERENCE, DOMAIN))$`Pr(>F)`[1],
    .groups = 'drop'
  )

print(homogeneity_domain)

# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES

# Function to calculate Hartley's F-max ratio
calculate_hartley_ratio <- function(variances) {
  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
}

# For mixed ANOVA: Test homogeneity across the BETWEEN-SUBJECTS factor (TEMPORAL_DO)
# within each combination of within-subjects factors (TIME × DOMAIN)

print(unique(long_data_clean$TEMPORAL_DO))
print(table(long_data_clean$TEMPORAL_DO))

observed_temporal_ratios <- long_data_clean %>%
  group_by(TIME, DOMAIN) %>%
  summarise(
    # Variances for each TEMPORAL_DO level within this TIME × DOMAIN combination
    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
    # F-max ratio
    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
    .groups = 'drop'
  ) %>%
  select(TIME, DOMAIN, past_var, fut_var, f_max_ratio)

print(observed_temporal_ratios)

# Bootstrap function for Hartley's F-max test (pre-allocates storage)
bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
  # Get the unique groups
  groups <- unique(data[[group_var]])

  # Calculate observed variances for each group
  observed_vars <- data %>%
    dplyr::group_by(!!rlang::sym(group_var)) %>%
    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
    dplyr::pull(var)

  # Guard against zero or missing variances
  if(any(observed_vars <= 0 | is.na(observed_vars))) {
    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
  }

  # Observed F-max ratio
  observed_ratio <- max(observed_vars) / min(observed_vars)

  # Pre-allocate storage for bootstrap ratios
  bootstrap_ratios <- numeric(n_iter)

  # Extract each group's response values once (as plain vectors, so this
  # also works when 'data' is a tibble from pick())
  group_data_list <- map(groups, ~ {
    group_data <- data[[response_var]][data[[group_var]] == .x]
    group_data[!is.na(group_data)]
  })

  # Bootstrap with pre-allocated storage
  for(i in 1:n_iter) {
    # Resample each group independently
    sample_vars <- map_dbl(group_data_list, ~ {
      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
      var(bootstrap_sample, na.rm = TRUE)
    })
    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
  }

  # Remove invalid ratios
  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]

  if(length(valid_ratios) == 0) {
    stop("No valid bootstrap ratios generated")
  }

  # Critical value (95th percentile)
  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)

  # Return only essential information
  return(list(
    observed_ratio = observed_ratio,
    critical_95 = critical_95,
    n_valid_iterations = length(valid_ratios)
  ))
}
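# Quick self-contained sanity check of the bootstrap cutoff on simulated
# data (names here are illustrative, not from the study): two groups with
# equal variances should usually give an observed ratio below the
# bootstrapped 95th percentile.
set.seed(42)
demo_groups <- data.frame(
  grp = rep(c("A", "B"), each = 50),
  y   = rnorm(100)
)
demo_hartley <- bootstrap_hartley_critical(demo_groups, "grp", "y", n_iter = 500)
cat(sprintf("Demo observed ratio = %.3f vs bootstrap 95%% cutoff = %.3f\n",
            demo_hartley$observed_ratio, demo_hartley$critical_95))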
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - 
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of TIME within each DOMAIN (Past vs Future contrasts for each domain) -time_domain_simple_by_domain <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -time_domain_simple_df <- as.data.frame(time_domain_simple_by_domain) -print("=== TIME × DOMAIN INTERACTION: Simple Effects of TIME within each DOMAIN ===") -print("Past vs Future contrasts for each domain:") -print(time_domain_simple_df) - -# Calculate Cohen's d for Past vs Future contrasts within each domain -cat("\n=== COHEN'S D FOR TIME CONTRASTS WITHIN EACH DOMAIN ===\n") -significant_time_domain <- time_domain_simple_df[time_domain_simple_df$p.value < 0.05, ] - -if(nrow(significant_time_domain) > 0) { - print("Significant Past vs Future contrasts within domains (p < 0.05):") - print(significant_time_domain) - - cat("\nCohen's d calculations for Past vs Future within each domain:\n") - - for(i in seq_len(nrow(time_domain_simple_df))) { - comparison <- time_domain_simple_df[i, ] - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this domain - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("Domain: %s\n", domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant Past vs Future contrasts found within any domain.\n") -} - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - - - -# ============================================================ -# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only) -# ============================================================ - -# Create fresh emmeans data for Plot 3 -emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.3, - x_dodged = x_pos + time_offset - ) - -# Create simple emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME
Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================ -# PLOT 4: TIME × DOMAIN INTERACTION (TIME on y-axis, DOMAIN on x-axis) -# ============================================================ - -# Create estimated marginal means for TIME × DOMAIN (reusing existing emmeans) -emm_time_domain_plot4 <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame for Plot 4 -emmeans_time_domain_plot4 <- emm_time_domain_plot4 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_plot4 <- 0.2 -iPlot_plot4 <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain_plot4 <- emmeans_time_domain_plot4 %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot (Domain on x-axis, TIME as groups) - EMMeans only -interaction_plot_time_domain_plot4 <- ggplot(emmeans_time_domain_plot4) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction (Domain on x-axis) - Estimated Marginal Means" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = 
element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_domain_plot4) - -# ============================================================ -# PLOT 5: TIME MAIN EFFECT (Emmeans + Error Bars Only) -# ============================================================ - -# Prepare emmeans data frame for TIME main effect -time_main_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with existing emmeans-only plots) -time_main_plot <- ggplot(time_main_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - domain means_20251007183630.r b/.history/eohi1/mixed anova - domain means_20251007183630.r deleted file mode 100644 index db70ef8..0000000 --- a/.history/eohi1/mixed anova - domain means_20251007183630.r +++ /dev/null @@ -1,769 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - 
select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For 
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - 
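# ----------------------------------------------------------------
# Supplementary sketch (hedged, not part of the original script):
# ezANOVA with detailed = TRUE already reports generalized
# eta-squared (ges); the effectsize package loaded above can add
# partial eta-squared for the aov()/Error() model defined here.
# Treat this as a sketch only: for multi-stratum (aovlist) models,
# eta_squared() returns effects grouped by error stratum, and the
# exact output layout may vary across effectsize versions. The name
# aov_effect_sizes is introduced here for illustration.
aov_effect_sizes <- effectsize::eta_squared(aov_model, partial = TRUE)
print(aov_effect_sizes)
# ----------------------------------------------------------------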
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of TIME within each DOMAIN (Past vs Future contrasts for each domain) -time_domain_simple_by_domain <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -time_domain_simple_df <- as.data.frame(time_domain_simple_by_domain) -print("=== TIME × DOMAIN INTERACTION: Simple Effects of TIME within each DOMAIN ===") -print("Past vs Future contrasts for each domain:") -print(time_domain_simple_df) - -# Calculate Cohen's d for Past vs Future contrasts within each domain -print("\n=== COHEN'S D FOR TIME CONTRASTS WITHIN EACH DOMAIN ===") -significant_time_domain <- time_domain_simple_df[time_domain_simple_df$p.value < 0.05, ] - -if(nrow(significant_time_domain) > 0) { - print("Significant Past vs Future contrasts within domains (p < 0.05):") - print(significant_time_domain) - - print("\nCohen's d calculations for Past vs Future within each domain:") - - for(i in seq_len(nrow(time_domain_simple_df))) { - comparison <- time_domain_simple_df[i, ] - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this domain - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("Domain: %s\n", domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant Past vs Future contrasts found within any domain.\n") -} - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - - - -# ============================================================ -# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only) -# ============================================================ - -# Create fresh emmeans data for Plot 3 -emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.3, - x_dodged = x_pos + time_offset - ) - -# Create simple emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" 
- ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================ -# PLOT 4: TIME × DOMAIN INTERACTION (TIME on y-axis, DOMAIN on x-axis) -# ============================================================ - -# Create estimated marginal means for TIME × DOMAIN (reusing existing emmeans) -emm_time_domain_plot4 <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame for Plot 4 -emmeans_time_domain_plot4 <- emm_time_domain_plot4 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_plot4 <- 0.2 -iPlot_plot4 <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain_plot4 <- emmeans_time_domain_plot4 %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot (Domain on x-axis, TIME as groups) - EMMeans only -interaction_plot_time_domain_plot4 <- ggplot(emmeans_time_domain_plot4) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction (Domain on x-axis) - Estimated Marginal Means" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, 
hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_domain_plot4) - -# ============================================================ -# PLOT 5: TIME MAIN EFFECT (Emmeans + Error Bars Only) -# ============================================================ - -# Prepare emmeans data frame for TIME main effect -time_main_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with existing emmeans-only plots) -time_main_plot <- ggplot(time_main_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - domain means_20251007183634.r b/.history/eohi1/mixed anova - domain means_20251007183634.r deleted file mode 100644 index db70ef8..0000000 --- a/.history/eohi1/mixed anova - domain means_20251007183634.r +++ /dev/null @@ -1,769 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = 
rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For 
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - 
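# ----------------------------------------------------------------
# Supplementary sketch (hedged, not part of the original script):
# a within-subject analogue of the per-domain Past vs Future
# comparisons made from pooled raw data further below -- Cohen's
# d_z per domain, computed from each participant's Past - Future
# difference. Assumes one row per pID x TIME x DOMAIN in
# long_data_clean (the balanced design checked earlier); the name
# domain_dz is introduced here for illustration.
domain_dz <- long_data_clean %>%
  dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>%
  pivot_wider(names_from = TIME, values_from = MEAN_DIFFERENCE) %>%
  mutate(diff = Past - Future) %>%
  group_by(DOMAIN) %>%
  summarise(
    n_pairs = sum(!is.na(diff)),
    d_z = mean(diff, na.rm = TRUE) / sd(diff, na.rm = TRUE),
    .groups = "drop"
  )
print(domain_dz)
# ----------------------------------------------------------------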
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of TIME within each DOMAIN (Past vs Future contrasts for each domain) -time_domain_simple_by_domain <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -time_domain_simple_df <- as.data.frame(time_domain_simple_by_domain) -print("=== TIME × DOMAIN INTERACTION: Simple Effects of TIME within each DOMAIN ===") -print("Past vs Future contrasts for each domain:") -print(time_domain_simple_df) - -# Calculate Cohen's d for Past vs Future contrasts within each domain -print("\n=== COHEN'S D FOR TIME CONTRASTS WITHIN EACH DOMAIN ===") -significant_time_domain <- time_domain_simple_df[time_domain_simple_df$p.value < 0.05, ] - -if(nrow(significant_time_domain) > 0) { - print("Significant Past vs Future contrasts within domains (p < 0.05):") - print(significant_time_domain) - - print("\nCohen's d calculations for Past vs Future within each domain:") - - for(i in seq_len(nrow(time_domain_simple_df))) { - comparison <- time_domain_simple_df[i, ] - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this domain - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("Domain: %s\n", domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant Past vs Future contrasts found within any domain.\n") -} - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - - - -# ============================================================ -# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only) -# ============================================================ - -# Create fresh emmeans data for Plot 3 -emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.3, - x_dodged = x_pos + time_offset - ) - -# Create simple emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" 
- ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================ -# PLOT 4: TIME × DOMAIN INTERACTION (TIME on y-axis, DOMAIN on x-axis) -# ============================================================ - -# Create estimated marginal means for TIME × DOMAIN (reusing existing emmeans) -emm_time_domain_plot4 <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame for Plot 4 -emmeans_time_domain_plot4 <- emm_time_domain_plot4 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_plot4 <- 0.2 -iPlot_plot4 <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain_plot4 <- emmeans_time_domain_plot4 %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot (Domain on x-axis, TIME as groups) - EMMeans only -interaction_plot_time_domain_plot4 <- ggplot(emmeans_time_domain_plot4) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction (Domain on x-axis) - Estimated Marginal Means" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, 
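The figures in this script are printed but never written to disk. If files are wanted, ggplot2's ggsave can persist the object just displayed; a minimal sketch with an illustrative file name and arbitrary dimensions:

# Sketch: save the emmeans-only interaction figure printed above.
ggsave("interaction_temporal_time_emmeans.png",
       plot = interaction_plot_emmeans_only,
       width = 7, height = 5, dpi = 300)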
hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_domain_plot4) - -# ============================================================ -# PLOT 5: TIME MAIN EFFECT (Emmeans + Error Bars Only) -# ============================================================ - -# Prepare emmeans data frame for TIME main effect -time_main_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with existing emmeans-only plots) -time_main_plot <- ggplot(time_main_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - domain means_20251007183638.r b/.history/eohi1/mixed anova - domain means_20251007183638.r deleted file mode 100644 index db70ef8..0000000 --- a/.history/eohi1/mixed anova - domain means_20251007183638.r +++ /dev/null @@ -1,769 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = 
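Since the script reads exp1.csv and immediately pivots a fixed set of columns, a guard that fails fast on a renamed or missing column may save debugging time later. A small sketch using the required_vars vector defined above:

# Sketch: stop early if any expected domain-mean column is absent from exp1.csv.
missing_cols <- setdiff(required_vars, names(data))
if (length(missing_cols) > 0) {
  stop("exp1.csv is missing expected columns: ",
       paste(missing_cols, collapse = ", "))
}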
rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
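The normality summary above calls ad.test twice per cell, once for the statistic and once for the p-value. One optional restructuring runs the test a single time per TIME × DOMAIN cell and unpacks both fields from a list-column (normality_once is an illustrative name; dplyr, purrr, and nortest are already loaded above):

# Sketch: one ad.test call per cell, stored in a list-column and unpacked.
normality_once <- long_data_clean %>%
  group_by(TIME, DOMAIN) %>%
  summarise(n = n(), ad = list(ad.test(MEAN_DIFFERENCE)), .groups = "drop") %>%
  mutate(
    ad_statistic = map_dbl(ad, ~ unname(.x$statistic)),
    ad_p_value   = map_dbl(ad, ~ .x$p.value)
  ) %>%
  select(TIME, DOMAIN, n, ad_statistic, ad_p_value)
print(normality_once)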
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For 
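A quick usage sketch for bootstrap_hartley_critical on a single cell, assuming the function and long_data_clean defined above. Heterogeneity is flagged when the observed F-max ratio exceeds the bootstrapped 95th percentile; the cell chosen here is arbitrary.

# Sketch: single-cell run of the bootstrap Hartley F-max helper.
# as.data.frame so single-column [ , ] indexing inside the helper
# drops to a plain vector rather than staying a tibble.
one_cell <- as.data.frame(subset(long_data_clean,
                                 TIME == "Past" & DOMAIN == "Values"))
res <- bootstrap_hartley_critical(one_cell, "TEMPORAL_DO", "MEAN_DIFFERENCE",
                                  n_iter = 1000)
res$observed_ratio > res$critical_95  # TRUE would flag variance heterogeneity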
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
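The effectsize package is loaded above but not otherwise used in this script. Because ezANOVA was run with detailed = TRUE, the table already carries sums of squares, so partial eta-squared can be derived directly; a sketch, assuming anova_output retains the SSn and SSd columns that detailed output provides:

# Sketch: partial eta-squared per effect, pes = SSn / (SSn + SSd).
anova_output$pes <- with(anova_output, SSn / (SSn + SSd))
print(anova_output[, c("Effect", "SSn", "SSd", "pes")])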
anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - 
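If cell sizes are modest, a bias-corrected variant of the contrast just computed is available from the same effsize call; a one-line sketch reusing the data1/data2 vectors already in scope at this point in the loop:

# Sketch: Hedges-corrected (g) version of the same DOMAIN contrast.
hedges_g <- effsize::cohen.d(data1, data2, hedges.correction = TRUE)
cat(sprintf("  Hedges' g: %.5f\n", hedges_g$estimate))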
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
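The simple-effects breakdown above tests TIME within each TEMPORAL_DO level; the difference-of-differences itself can also be tested directly with an emmeans interaction contrast on the same model. A short sketch:

# Sketch: test the TIME x TEMPORAL_DO difference-of-differences directly.
emm_tt <- emmeans(aov_model, ~ TIME * TEMPORAL_DO)
print(contrast(emm_tt, interaction = "pairwise"))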
TIME × DOMAIN INTERACTION - -# Get simple effects of TIME within each DOMAIN (Past vs Future contrasts for each domain) -time_domain_simple_by_domain <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -time_domain_simple_df <- as.data.frame(time_domain_simple_by_domain) -print("=== TIME × DOMAIN INTERACTION: Simple Effects of TIME within each DOMAIN ===") -print("Past vs Future contrasts for each domain:") -print(time_domain_simple_df) - -# Calculate Cohen's d for Past vs Future contrasts within each domain -print("\n=== COHEN'S D FOR TIME CONTRASTS WITHIN EACH DOMAIN ===") -significant_time_domain <- time_domain_simple_df[time_domain_simple_df$p.value < 0.05, ] - -if(nrow(significant_time_domain) > 0) { - print("Significant Past vs Future contrasts within domains (p < 0.05):") - print(significant_time_domain) - - print("\nCohen's d calculations for Past vs Future within each domain:") - - for(i in seq_len(nrow(time_domain_simple_df))) { - comparison <- time_domain_simple_df[i, ] - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this domain - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("Domain: %s\n", domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant Past vs Future contrasts found within any domain.\n") -} - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - - - -# ============================================================ -# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only) -# ============================================================ - -# Create fresh emmeans data for Plot 3 -emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.3, - x_dodged = x_pos + time_offset - ) - -# Create simple emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" 
- ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================ -# PLOT 4: TIME × DOMAIN INTERACTION (TIME on y-axis, DOMAIN on x-axis) -# ============================================================ - -# Create estimated marginal means for TIME × DOMAIN (reusing existing emmeans) -emm_time_domain_plot4 <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame for Plot 4 -emmeans_time_domain_plot4 <- emm_time_domain_plot4 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_plot4 <- 0.2 -iPlot_plot4 <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain_plot4 <- emmeans_time_domain_plot4 %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot (Domain on x-axis, TIME as groups) - EMMeans only -interaction_plot_time_domain_plot4 <- ggplot(emmeans_time_domain_plot4) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction (Domain on x-axis) - Estimated Marginal Means" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, 
hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_domain_plot4) - -# ============================================================ -# PLOT 5: TIME MAIN EFFECT (Emmeans + Error Bars Only) -# ============================================================ - -# Prepare emmeans data frame for TIME main effect -time_main_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with existing emmeans-only plots) -time_main_plot <- ggplot(time_main_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - domain means_20251007183824.r b/.history/eohi1/mixed anova - domain means_20251007183824.r deleted file mode 100644 index db70ef8..0000000 --- a/.history/eohi1/mixed anova - domain means_20251007183824.r +++ /dev/null @@ -1,769 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = 
hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_domain_plot4) - -# ============================================================ -# PLOT 5: TIME MAIN EFFECT (Emmeans + Error Bars Only) -# ============================================================ - -# Prepare emmeans data frame for TIME main effect -time_main_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with existing emmeans-only plots) -time_main_plot <- ggplot(time_main_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - domain means_20251010145938.r b/.history/eohi1/mixed anova - domain means_20251010145938.r deleted file mode 100644 index db70ef8..0000000 --- a/.history/eohi1/mixed anova - domain means_20251010145938.r +++ /dev/null @@ -1,769 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations -library(ggplot2) # For plotting - -options(scipen = 999) - -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = 
rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'domain' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values", "Life")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print(desc_stats_by_temporal) - -# ASSUMPTION TESTING - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print(missing_summary) - -# Create clean dataset (long_data is already filtered for NA values) -long_data_clean <- long_data - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - median = median(MEAN_DIFFERENCE), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print(normality_results) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print(homogeneity_domain) - -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -print(unique(long_data_clean$TEMPORAL_DO)) -print(table(long_data_clean$TEMPORAL_DO)) - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -set.seed(123) # For 
reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# MIXED ANOVA ANALYSIS - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(complete_cases) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# MIXED ANOVA WITH SPHERICITY CORRECTIONS - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Between-subjects effects (no sphericity corrections needed) - between_effects <- c("TEMPORAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- 
anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (4 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# COHEN'S D FOR MAIN EFFECTS - -# Create aov model for emmeans -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -# Main Effect of TIME -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) - -time_main_contrast <- pairs(time_emmeans, adjust = "Bonferroni") - -time_main_df <- as.data.frame(time_main_contrast) -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (significant: p < 0.001) -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - 
cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# SIMPLE EFFECTS AND PAIRWISE COMPARISONS FOR INTERACTIONS - -# 1. TEMPORAL_DO × TIME INTERACTION - Simple effects of TIME within each TEMPORAL_DO -temporal_time_emmeans <- emmeans(aov_model, ~ TIME | TEMPORAL_DO) -temporal_time_simple <- pairs(temporal_time_emmeans, adjust = "bonferroni") -print(temporal_time_simple) - -# 2. TIME × DOMAIN INTERACTION - Simple effects of DOMAIN within each TIME -time_domain_emmeans <- emmeans(aov_model, ~ DOMAIN | TIME) -time_domain_simple <- pairs(time_domain_emmeans, adjust = "bonferroni") -print(time_domain_simple) - -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# 1. TEMPORAL_DO × TIME INTERACTION - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# 2. 
TIME × DOMAIN INTERACTION - -# Get simple effects of TIME within each DOMAIN (Past vs Future contrasts for each domain) -time_domain_simple_by_domain <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -time_domain_simple_df <- as.data.frame(time_domain_simple_by_domain) -print("=== TIME × DOMAIN INTERACTION: Simple Effects of TIME within each DOMAIN ===") -print("Past vs Future contrasts for each domain:") -print(time_domain_simple_df) - -# Calculate Cohen's d for Past vs Future contrasts within each domain -print("\n=== COHEN'S D FOR TIME CONTRASTS WITHIN EACH DOMAIN ===") -significant_time_domain <- time_domain_simple_df[time_domain_simple_df$p.value < 0.05, ] - -if(nrow(significant_time_domain) > 0) { - print("Significant Past vs Future contrasts within domains (p < 0.05):") - print(significant_time_domain) - - print("\nCohen's d calculations for Past vs Future within each domain:") - - for(i in seq_len(nrow(time_domain_simple_df))) { - comparison <- time_domain_simple_df[i, ] - domain_level <- as.character(comparison$DOMAIN) - - # Get data for Past and Future within this domain - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("Domain: %s\n", domain_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant Past vs Future contrasts found within any domain.\n") -} - -# INTERACTION PLOTS - -# Define color palettes -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") -domain_colors <- c("Preferences" = "#648FFF", "Personality" = "#DC267F", - "Values" = "#FFB000", "Life" = "#FE6100") - -# Define TIME levels (Past, Future order) -time_levels <- c("Past", "Future") - - - -# ============================================================ -# PLOT 3: TEMPORAL_DO × TIME INTERACTION (Emmeans only) -# ============================================================ - -# Create fresh emmeans data for Plot 3 -emm_temporal_time_plot3 <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - -emmeans_temporal_time_simple <- emm_temporal_time_plot3 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.3, - x_dodged = x_pos + time_offset - ) - -# Create simple emmeans-only plot -interaction_plot_emmeans_only <- ggplot(emmeans_temporal_time_simple) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" 
- ) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_emmeans_only) - -# ============================================================ -# PLOT 4: TIME × DOMAIN INTERACTION (TIME on y-axis, DOMAIN on x-axis) -# ============================================================ - -# Create estimated marginal means for TIME × DOMAIN (reusing existing emmeans) -emm_time_domain_plot4 <- emmeans(aov_model, ~ TIME * DOMAIN) - -# Prepare emmeans data frame for Plot 4 -emmeans_time_domain_plot4 <- emm_time_domain_plot4 %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels) - ) - -# Prepare raw data for plotting with position offsets -dodge_width_plot4 <- 0.2 -iPlot_plot4 <- long_data_clean %>% - dplyr::select(pID, DOMAIN, TIME, MEAN_DIFFERENCE) %>% - mutate( - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")), - TIME = factor(TIME, levels = time_levels), - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -emmeans_time_domain_plot4 <- emmeans_time_domain_plot4 %>% - mutate( - x_pos = as.numeric(DOMAIN), - time_offset = (as.numeric(TIME) - 1.5) * dodge_width_plot4, - x_dodged = x_pos + time_offset - ) - -# Create TIME × DOMAIN interaction plot (Domain on x-axis, TIME as groups) - EMMeans only -interaction_plot_time_domain_plot4 <- ggplot(emmeans_time_domain_plot4) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Domain", - y = "Mean absolute difference from present", - title = "TIME × DOMAIN Interaction (Domain on x-axis) - Estimated Marginal Means" - ) + - scale_x_continuous( - breaks = c(1, 2, 3, 4), - labels = c("Preferences", "Personality", "Values", "Life"), - limits = c(0.5, 4.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.text.x = element_text(angle = 45, hjust = 1), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, 
hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_domain_plot4) - -# ============================================================ -# PLOT 5: TIME MAIN EFFECT (Emmeans + Error Bars Only) -# ============================================================ - -# Prepare emmeans data frame for TIME main effect -time_main_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with existing emmeans-only plots) -time_main_plot <- ggplot(time_main_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi1/mixed anova - personality_20250916123628.r b/.history/eohi1/mixed anova - personality_20250916123628.r deleted file mode 100644 index 70d9919..0000000 --- a/.history/eohi1/mixed anova - personality_20250916123628.r +++ /dev/null @@ -1,765 +0,0 @@ -# Mixed ANOVA Analysis for Personality Items -# EOHI Experiment Data Analysis - Item Level Analysis -# Variables: NPastDiff_pers_extravert, NPastDiff_pers_critical, NPastDiff_pers_dependable, NPastDiff_pers_anxious, NPastDiff_pers_complex -# NFutDiff_pers_extravert, NFutDiff_pers_critical, NFutDiff_pers_dependable, NFutDiff_pers_anxious, NFutDiff_pers_complex - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", 
length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPastDiff_pers_extravert", "NPastDiff_pers_critical", "NPastDiff_pers_dependable", "NPastDiff_pers_anxious", "NPastDiff_pers_complex", - "NFutDiff_pers_extravert", "NFutDiff_pers_critical", "NFutDiff_pers_dependable", "NFutDiff_pers_anxious", "NFutDiff_pers_complex") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required personality item variables found!") -} - -# Define item mapping -item_mapping <- data.frame( - variable = c("NPastDiff_pers_extravert", "NPastDiff_pers_critical", "NPastDiff_pers_dependable", "NPastDiff_pers_anxious", "NPastDiff_pers_complex", - "NFutDiff_pers_extravert", "NFutDiff_pers_critical", "NFutDiff_pers_dependable", "NFutDiff_pers_anxious", "NFutDiff_pers_complex"), - time = c(rep("Past", 5), rep("Future", 5)), - item = rep(c("extravert", "critical", "dependable", "anxious", "complex"), 2), - stringsAsFactors = FALSE -) - -# Item mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(item_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'item' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - ITEM = factor(item, levels = c("extravert", "critical", "dependable", "anxious", "complex")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and ITEM -desc_stats <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and ITEM:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and ITEM:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# 
============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and ITEM:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each ITEM -homogeneity_time <- long_data_clean %>% - group_by(ITEM) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each ITEM:") -print(homogeneity_time) - -# Test homogeneity across ITEM within each TIME -homogeneity_item <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across ITEM within each TIME:") -print(homogeneity_item) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × ITEM) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × ITEM combination ===") - -observed_temporal_ratios <- long_data_clean %>% 
- group_by(TIME, ITEM) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this TIME × ITEM combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # Calculate F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, ITEM, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# More efficient bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!sym(group_var)) %>%
-    dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once
-  group_data_list <- map(groups, ~ {
-    group_data <- data[data[[group_var]] == .x, response_var]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × ITEM combination
-print("\n=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, ITEM) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, ITEM, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check for missing data patterns
-table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM, useNA = "ifany")
-
-# Check data balance
-xtabs(~ pID + TIME + ITEM, data = long_data_clean)
-
-# Check data dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$pID))))
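-# Optional sanity check for bootstrap_hartley_critical() defined above: a
-# minimal sketch, assuming simulated data in which both TEMPORAL_DO groups
-# share a single true variance. In that case the observed F-max ratio should
-# fall below the bootstrapped 95th-percentile critical value. toy_data and
-# its columns are simulated stand-ins, not variables from exp1.csv.
-set.seed(456)
-toy_data <- data.frame(
-  TEMPORAL_DO = rep(c("01PAST", "02FUT"), each = 100),
-  MEAN_DIFFERENCE = rnorm(200, mean = 0, sd = 1) # equal variances by construction
-)
-toy_check <- bootstrap_hartley_critical(toy_data, "TEMPORAL_DO", "MEAN_DIFFERENCE")
-# With equal true variances, this comparison is expected to return TRUE
-print(toy_check$observed_ratio < toy_check$critical_95)
-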
-print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of ITEM levels:", length(levels(long_data_clean$ITEM)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 10, # 2 TIME × 5 ITEM = 10 - missing_combinations = 10 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × ITEM (5 levels: read, music, tv, nap, travel) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * ITEM + Error(pID/(TIME * ITEM)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = "MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "ITEM")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for ITEM (5 levels - within-subjects) -print("Mauchly's Test of Sphericity for ITEM:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_item <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - within = ITEM, - type = 3, - detailed = TRUE) - - print("ITEM Sphericity Test:") - print(ez_item$Mauchly) - -}, error = function(e) { - print(paste("Error in ITEM sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × ITEM interaction -print("\nMauchly's Test of Sphericity for TIME × ITEM Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - - print("TIME × ITEM Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × ITEM sphericity test:", e$message)) -}) - -# ============================================================================= -# 
CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - # Get the sphericity corrections - sphericity_corr <- ez_corrected$`Sphericity Corrections` - - # Extract original degrees of freedom from ANOVA table - anova_table <- ez_corrected$ANOVA - - # Calculate corrected degrees of freedom - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Also show the corrected F-values and p-values with degrees of freedom - cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n") - for(i in 1:nrow(corrected_df)) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, ITEM), - 
values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to ITEM effects:") - - # ITEM main effect (DFn = 4, DFd = 4244) - item_df_corrected_gg <- 4 * epsilon_gg - item_df_corrected_hf <- 4 * epsilon_hf - - print(paste("ITEM: Original df = 4, 4244")) - print(paste("ITEM: GG corrected df =", round(item_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2))) - print(paste("ITEM: HF corrected df =", round(item_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2))) - - # TIME × ITEM interaction (DFn = 4, DFd = 4244) - interaction_df_corrected_gg <- 4 * epsilon_gg - interaction_df_corrected_hf <- 4 * epsilon_hf - - print(paste("TIME × ITEM: Original df = 4, 4244")) - print(paste("TIME × ITEM: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2))) - print(paste("TIME × ITEM: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect 
sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. -# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of ITEM -print("\nMain Effect of ITEM:") -item_emmeans <- emmeans(mixed_anova_model, ~ ITEM) -print("Estimated Marginal Means:") -print(item_emmeans) -print("\nPairwise Contrasts:") -item_contrasts <- pairs(item_emmeans, adjust = "bonferroni") -print(item_contrasts) - - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - - -# TIME × ITEM Interaction -print("\n=== TIME × ITEM INTERACTION ===") -time_item_emmeans <- emmeans(mixed_anova_model, ~ TIME * ITEM) -print("Estimated Marginal Means:") -print(time_item_emmeans) - -print("\nSimple Effects of ITEM within each TIME:") -time_item_simple <- pairs(time_item_emmeans, by = "TIME", adjust = "bonferroni") -print(time_item_simple) - -print("\nSimple Effects of TIME within each ITEM:") -time_item_simple2 <- pairs(time_item_emmeans, by = "ITEM", adjust = "bonferroni") -print(time_item_simple2) - - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - 
if(nrow(significant_pairs) > 0) {
-    cat("Significant pairwise comparisons (p < 0.05):\n")
-    print(significant_pairs)
-
-    cat("\nCohen's d calculated from raw data:\n")
-
-    for(i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- as.character(comparison$contrast)
-
-      # Parse the contrast
-      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-      if(length(contrast_parts) == 2) {
-        level1 <- trimws(contrast_parts[1])
-        level2 <- trimws(contrast_parts[2])
-
-        # Get raw data for both conditions
-        if(group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][
-            data[[group1_var]] == level1 &
-            data[[group2_var]] == group2_level]
-
-          data2 <- data[[response_var]][
-            data[[group1_var]] == level2 &
-            data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if(length(data1) > 0 && length(data2) > 0) {
-          # Calculate Cohen's d using effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          cat(sprintf("Comparison: %s", contrast_name))
-          if(group2_var %in% colnames(comparison)) {
-            cat(sprintf(" | %s", group2_level))
-          }
-          cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2)))
-          cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate))
-          cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude))
-          cat(sprintf(" p-value: %.5f\n", comparison$p.value))
-          cat("\n")
-        }
-      }
-    }
-  } else {
-    cat("No significant pairwise comparisons found.\n")
-  }
-}
-
-
-# =============================================================================
-# 2. TIME × ITEM INTERACTION
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TIME × ITEM INTERACTION ===")
-
-# Get simple effects of TIME within each ITEM
-time_item_simple2_df <- as.data.frame(time_item_simple2)
-calculate_cohens_d_for_pairs(time_item_simple2_df, long_data_clean, "TIME", "ITEM", "MEAN_DIFFERENCE")
-
-# Get simple effects of ITEM within each TIME
-time_item_simple_df <- as.data.frame(time_item_simple)
-calculate_cohens_d_for_pairs(time_item_simple_df, long_data_clean, "ITEM", "TIME", "MEAN_DIFFERENCE")
-
-
diff --git a/.history/eohi1/mixed anova - personality_20250916123639.r b/.history/eohi1/mixed anova - personality_20250916123639.r
deleted file mode 100644
index 70d9919..0000000
--- a/.history/eohi1/mixed anova - personality_20250916123639.r
+++ /dev/null
@@ -1,765 +0,0 @@
-# Mixed ANOVA Analysis for Personality Items
-# EOHI Experiment Data Analysis - Item Level Analysis
-# Variables: NPastDiff_pers_extravert, NPastDiff_pers_critical, NPastDiff_pers_dependable, NPastDiff_pers_anxious, NPastDiff_pers_complex
-# NFutDiff_pers_extravert, NFutDiff_pers_critical, NFutDiff_pers_dependable, NFutDiff_pers_anxious, NFutDiff_pers_complex
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex)       # For aov_ez (cleaner ANOVA output)
-library(nortest)    # For normality tests
-library(emmeans)    # For post-hoc comparisons
-library(purrr)      # For map functions
-library(effsize)    # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
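-# Why the contr.sum setting above matters: Type III tests are only
-# interpretable with sum-to-zero coding. Under R's default treatment coding,
-# a Type III "main effect" in an unbalanced design is tested at the reference
-# level of the other factors rather than averaged over them. A minimal
-# illustration, assuming a simulated, deliberately unbalanced toy design
-# (hypothetical data, not exp1.csv):
-set.seed(1)
-toy <- expand.grid(A = c("a1", "a2"), B = c("b1", "b2"))
-toy <- toy[rep(1:4, times = c(10, 14, 12, 8)), ] # unequal cell sizes
-toy$y <- rnorm(nrow(toy))
-fit_sum <- lm(y ~ A * B, data = toy,
-              contrasts = list(A = "contr.sum", B = "contr.sum"))
-print(car::Anova(fit_sum, type = 3)) # each effect averaged over the others
-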
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPastDiff_pers_extravert", "NPastDiff_pers_critical", "NPastDiff_pers_dependable", "NPastDiff_pers_anxious", "NPastDiff_pers_complex", - "NFutDiff_pers_extravert", "NFutDiff_pers_critical", "NFutDiff_pers_dependable", "NFutDiff_pers_anxious", "NFutDiff_pers_complex") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required personality item variables found!") -} - -# Define item mapping -item_mapping <- data.frame( - variable = c("NPastDiff_pers_extravert", "NPastDiff_pers_critical", "NPastDiff_pers_dependable", "NPastDiff_pers_anxious", "NPastDiff_pers_complex", - "NFutDiff_pers_extravert", "NFutDiff_pers_critical", "NFutDiff_pers_dependable", "NFutDiff_pers_anxious", "NFutDiff_pers_complex"), - time = c(rep("Past", 5), rep("Future", 5)), - item = rep(c("extravert", "critical", "dependable", "anxious", "complex"), 2), - stringsAsFactors = FALSE -) - -# Item mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(item_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'item' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - ITEM = factor(item, levels = c("extravert", "critical", "dependable", "anxious", "complex")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and ITEM -desc_stats <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and ITEM:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - 
.groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and ITEM:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and ITEM:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each ITEM
-homogeneity_time <- long_data_clean %>%
-  group_by(ITEM) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each ITEM:")
-print(homogeneity_time)
-
-# Test homogeneity across ITEM within each TIME
-homogeneity_item <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across ITEM within each TIME:")
-print(homogeneity_item)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × ITEM)
-
-# First, let's check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × ITEM combination ===")
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, ITEM) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this TIME × ITEM combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # Calculate F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, ITEM, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
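-# Illustrative aside (not in the original script): the calculate_hartley_ratio()
-# helper defined above is otherwise unused; it reproduces each row's
-# f_max_ratio directly from the pair of group variances, e.g. for row 1:
-calculate_hartley_ratio(c(observed_temporal_ratios$past_var[1],
-                          observed_temporal_ratios$fut_var[1]))
-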
-# More efficient bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!sym(group_var)) %>%
-    dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once
-  group_data_list <- map(groups, ~ {
-    group_data <- data[data[[group_var]] == .x, response_var]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × ITEM combination
-print("\n=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, ITEM) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, ITEM, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check for missing data patterns
-table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM, useNA = "ifany")
-
-# Check data balance
-xtabs(~ pID + TIME + ITEM, data = long_data_clean)
-
-# Check data dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$pID))))
-print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME))))
-print(paste("Number of ITEM levels:", length(levels(long_data_clean$ITEM))))
-print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO))))
-
-# Check for complete cases
-complete_cases <- long_data_clean[complete.cases(long_data_clean), ]
-print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean)))
-
-# Check if design is balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM)
-
-print(summary(as.vector(design_balance)))
-
-# Check for any participants with missing combinations
-missing_combos <- long_data_clean %>%
-  group_by(pID) %>%
-  summarise(
-    n_combinations = n(),
-    expected_combinations = 10, # 2 TIME × 5 ITEM = 10
-    missing_combinations = 10 - n_combinations,
-    .groups = 'drop'
-  )
-
-print("Missing combinations per participant:")
-print(missing_combos[missing_combos$missing_combinations > 0, ])
-
-# Mixed ANOVA using aov() - Traditional approach
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × ITEM (5 levels: extravert, critical, dependable, anxious, complex)
-
-mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * ITEM + Error(pID/(TIME * ITEM)),
-                         data = long_data_clean)
-
-print("Mixed ANOVA Results (aov):")
-print(summary(mixed_anova_model))
-
-# Alternative: Using afex::aov_ez for cleaner output (optional)
-print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===")
-mixed_anova_afex <- aov_ez(id = "pID",
-                           dv = "MEAN_DIFFERENCE",
-                           data = long_data_clean,
-                           between = "TEMPORAL_DO",
- within = c("TIME", "ITEM")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for ITEM (5 levels - within-subjects) -print("Mauchly's Test of Sphericity for ITEM:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_item <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - within = ITEM, - type = 3, - detailed = TRUE) - - print("ITEM Sphericity Test:") - print(ez_item$Mauchly) - -}, error = function(e) { - print(paste("Error in ITEM sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × ITEM interaction -print("\nMauchly's Test of Sphericity for TIME × ITEM Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - - print("TIME × ITEM Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × ITEM sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - # Get the sphericity corrections - sphericity_corr <- ez_corrected$`Sphericity Corrections` - - # Extract original degrees of freedom from ANOVA table - anova_table <- ez_corrected$ANOVA - - # Calculate corrected degrees of freedom - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * 
sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  # Also show the corrected F-values and p-values with degrees of freedom
-  cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n")
-  for(i in 1:nrow(corrected_df)) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original:     F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  print("\nNote: Sphericity corrections may not be displayed if sphericity is met")
-  print("Check the Mauchly's test p-values above to determine if corrections are needed")
-}
-
-# =============================================================================
-# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE
-# =============================================================================
-
-print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===")
-
-# Create a wide-format data for car package (library already loaded)
-
-tryCatch({
-  # Convert to wide format for car package
-  wide_data <- long_data_clean %>%
-    select(pID, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>%
-    pivot_wider(names_from = c(TIME, ITEM),
-                values_from = MEAN_DIFFERENCE,
-                names_sep = "_")
-
-  # The TIME × ITEM columns produced by pivot_wider above (this script's
-  # item-level variables, not the domain-level names used elsewhere)
-  within_vars <- c("Past_extravert", "Past_critical", "Past_dependable",
-                   "Past_anxious", "Past_complex",
-                   "Future_extravert", "Future_critical", "Future_dependable",
-                   "Future_anxious", "Future_complex")
-
-  # Check if all columns exist
-  missing_cols <- within_vars[!within_vars %in% colnames(wide_data)]
-  if(length(missing_cols) > 0) {
-    print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", ")))
-  } else {
-    # Create the repeated measures design
-    rm_design <- as.matrix(wide_data[, within_vars])
-
-    # Calculate epsilon values
-    print("Epsilon Values from car package:")
-    epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser")
-    epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt")
-
-    print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4)))
-    print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4)))
-
-    # Interpretation
-    if(epsilon_gg < 0.75) {
-      print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)")
-    } else if(epsilon_hf > 0.75) {
-      print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)")
-    } else {
-      print("Recommendation: Use Greenhouse-Geisser correction (conservative)")
-    }
-
-    # =========================================================================
-    # MANUAL SPHERICITY CORRECTIONS
-    # =========================================================================
-
-    print("\n=== MANUAL SPHERICITY CORRECTIONS ===")
-
-    # Apply corrections to the original ANOVA results
-    print("Applying Greenhouse-Geisser corrections to ITEM effects:")
-
-    # ITEM main effect (DFn = 4, DFd = 4244)
-    item_df_corrected_gg <- 4 * epsilon_gg
-    item_df_corrected_hf <- 4 * epsilon_hf
-
-    print("ITEM: Original df = 4, 4244")
-    print(paste("ITEM: GG corrected df =", round(item_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2)))
-    print(paste("ITEM: HF corrected df =", round(item_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2)))
-
-    # TIME × ITEM interaction (DFn = 4, DFd = 4244)
-    interaction_df_corrected_gg <- 4 * epsilon_gg
-    interaction_df_corrected_hf <- 4 * epsilon_hf
-
-    print("TIME × ITEM: Original df = 4, 4244")
-    print(paste("TIME × ITEM: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2)))
-    print(paste("TIME × ITEM: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2)))
-
-    # Note: You would need to recalculate p-values with these corrected dfs
-    print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom")
-    print("The ezANOVA function should handle this automatically, but may not display the corrections")
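-
-    # Illustrative sketch (not in the original script): a corrected p-value can
-    # be recomputed from the uncorrected F statistic with pf(); the F value
-    # below is hypothetical, only the df scaling comes from the data.
-    # pf(3.20, df1 = 4 * epsilon_gg, df2 = 4244 * epsilon_gg, lower.tail = FALSE)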
-  }
-
-}, error = function(e) {
-  print(paste("Error in manual epsilon calculation:", e$message))
-})
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-# Effect size calculations (library already loaded)
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Calculate generalized eta squared for the aov model
-print("Effect Sizes from aov() model:")
-tryCatch({
-  # Extract effect sizes from aov model
-  aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE)
-  print(round(aov_effects, 5))
-}, error = function(e) {
-  print(paste("Error calculating effect sizes from aov:", e$message))
-})
-
-# Calculate effect sizes for ezANOVA model
-print("\nEffect Sizes from ezANOVA model:")
-tryCatch({
-  # ezANOVA already reports generalized eta squared in its 'ges' column
-  ez_effects <- ez_corrected$ANOVA
-  print("Generalized Eta Squared from ezANOVA:")
-  print(round(ez_effects[, c("Effect", "ges")], 5))
-}, error = function(e) {
-  print(paste("Error extracting effect sizes from ezANOVA:", e$message))
-})
-
-# Extract effect sizes (generalized eta squared)
-# For aov() objects, we need to extract from the summary
-anova_summary <- summary(mixed_anova_model)
-
-# =============================================================================
-# NOTE: MIXED MODELS (LMER) NOT NEEDED
-# =============================================================================
-
-# For this balanced repeated measures design, Type III ANOVA with proper
-# sphericity corrections (implemented above) is the most appropriate approach.
-# Mixed models (lmer) are typically used for:
-# - Unbalanced designs
-# - Missing data patterns
-# - Nested random effects
-# - Large, complex datasets
-#
-# Your design is balanced and complete, making Type III ANOVA optimal.
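-
-# For reference only, a sketch of the lmer alternative described above. It
-# assumes the lme4/lmerTest packages, which this script does not load, so the
-# call is left commented out:
-# library(lmerTest)  # loads lme4; adds Satterthwaite df to anova()
-# lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * ITEM + (1 | pID),
-#                    data = long_data_clean)
-# anova(lmer_model)  # F-tests with Satterthwaite degrees of freedom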
- -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of ITEM -print("\nMain Effect of ITEM:") -item_emmeans <- emmeans(mixed_anova_model, ~ ITEM) -print("Estimated Marginal Means:") -print(item_emmeans) -print("\nPairwise Contrasts:") -item_contrasts <- pairs(item_emmeans, adjust = "bonferroni") -print(item_contrasts) - - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - - -# TIME × ITEM Interaction -print("\n=== TIME × ITEM INTERACTION ===") -time_item_emmeans <- emmeans(mixed_anova_model, ~ TIME * ITEM) -print("Estimated Marginal Means:") -print(time_item_emmeans) - -print("\nSimple Effects of ITEM within each TIME:") -time_item_simple <- pairs(time_item_emmeans, by = "TIME", adjust = "bonferroni") -print(time_item_simple) - -print("\nSimple Effects of TIME within each ITEM:") -time_item_simple2 <- pairs(time_item_emmeans, by = "ITEM", adjust = "bonferroni") -print(time_item_simple2) - - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) 
{
-            cat(sprintf(" | %s", group2_level))
-          }
-          cat(sprintf("\n  n1 = %d, n2 = %d\n", length(data1), length(data2)))
-          cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-          cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-          cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-          cat("\n")
-        }
-      }
-    }
-  } else {
-    cat("No significant pairwise comparisons found.\n")
-  }
-}
-
-
-# =============================================================================
-# TIME × ITEM INTERACTION (SIGNIFICANT: p = 0.012)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TIME × ITEM INTERACTION ===")
-
-# Get simple effects of TIME within each ITEM
-time_item_simple2_df <- as.data.frame(time_item_simple2)
-calculate_cohens_d_for_pairs(time_item_simple2_df, long_data_clean, "TIME", "ITEM", "MEAN_DIFFERENCE")
-
-# Get simple effects of ITEM within each TIME
-time_item_simple_df <- as.data.frame(time_item_simple)
-calculate_cohens_d_for_pairs(time_item_simple_df, long_data_clean, "ITEM", "TIME", "MEAN_DIFFERENCE")
-
-
diff --git a/.history/eohi1/mixed anova - personality_20250916123640.r b/.history/eohi1/mixed anova - personality_20250916123640.r
deleted file mode 100644
index 70d9919..0000000
--- a/.history/eohi1/mixed anova - personality_20250916123640.r
+++ /dev/null
rep("Future", 5)), - item = rep(c("extravert", "critical", "dependable", "anxious", "complex"), 2), - stringsAsFactors = FALSE -) - -# Item mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(item_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'item' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - ITEM = factor(item, levels = c("extravert", "critical", "dependable", "anxious", "complex")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and ITEM -desc_stats <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and ITEM:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and ITEM:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and ITEM:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each ITEM -homogeneity_time <- long_data_clean %>% - group_by(ITEM) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each ITEM:") -print(homogeneity_time) - -# Test homogeneity across ITEM within each TIME -homogeneity_item <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across ITEM within each TIME:") -print(homogeneity_item) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × ITEM) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × ITEM combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, ITEM, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, 
response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × ITEM combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, ITEM, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + ITEM, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of ITEM levels:", length(levels(long_data_clean$ITEM)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM) - -print(summary(as.vector(design_balance))) - -# Check for 
any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 10, # 2 TIME × 5 ITEM = 10 - missing_combinations = 10 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × ITEM (5 levels: read, music, tv, nap, travel) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * ITEM + Error(pID/(TIME * ITEM)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = "MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "ITEM")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for ITEM (5 levels - within-subjects) -print("Mauchly's Test of Sphericity for ITEM:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_item <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - within = ITEM, - type = 3, - detailed = TRUE) - - print("ITEM Sphericity Test:") - print(ez_item$Mauchly) - -}, error = function(e) { - print(paste("Error in ITEM sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × ITEM interaction -print("\nMauchly's Test of Sphericity for TIME × ITEM Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - - print("TIME × ITEM Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × ITEM sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# 
Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - # Get the sphericity corrections - sphericity_corr <- ez_corrected$`Sphericity Corrections` - - # Extract original degrees of freedom from ANOVA table - anova_table <- ez_corrected$ANOVA - - # Calculate corrected degrees of freedom - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Also show the corrected F-values and p-values with degrees of freedom - cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n") - for(i in 1:nrow(corrected_df)) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, ITEM), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - 
print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to ITEM effects:") - - # ITEM main effect (DFn = 4, DFd = 4244) - item_df_corrected_gg <- 4 * epsilon_gg - item_df_corrected_hf <- 4 * epsilon_hf - - print(paste("ITEM: Original df = 4, 4244")) - print(paste("ITEM: GG corrected df =", round(item_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2))) - print(paste("ITEM: HF corrected df =", round(item_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2))) - - # TIME × ITEM interaction (DFn = 4, DFd = 4244) - interaction_df_corrected_gg <- 4 * epsilon_gg - interaction_df_corrected_hf <- 4 * epsilon_hf - - print(paste("TIME × ITEM: Original df = 4, 4244")) - print(paste("TIME × ITEM: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2))) - print(paste("TIME × ITEM: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- 
summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. -# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of ITEM -print("\nMain Effect of ITEM:") -item_emmeans <- emmeans(mixed_anova_model, ~ ITEM) -print("Estimated Marginal Means:") -print(item_emmeans) -print("\nPairwise Contrasts:") -item_contrasts <- pairs(item_emmeans, adjust = "bonferroni") -print(item_contrasts) - - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - - -# TIME × ITEM Interaction -print("\n=== TIME × ITEM INTERACTION ===") -time_item_emmeans <- emmeans(mixed_anova_model, ~ TIME * ITEM) -print("Estimated Marginal Means:") -print(time_item_emmeans) - -print("\nSimple Effects of ITEM within each TIME:") -time_item_simple <- pairs(time_item_emmeans, by = "TIME", adjust = "bonferroni") -print(time_item_simple) - -print("\nSimple Effects of TIME within each ITEM:") -time_item_simple2 <- pairs(time_item_emmeans, by = "ITEM", adjust = "bonferroni") -print(time_item_simple2) - - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- 
diff --git a/.history/eohi1/mixed anova - personality_20250916125522.r b/.history/eohi1/mixed anova - personality_20250916125522.r
deleted file mode 100644
index 70d9919..0000000
--- a/.history/eohi1/mixed anova - personality_20250916125522.r
+++ /dev/null
@@ -1,765 +0,0 @@
-# Mixed ANOVA Analysis for Personality Items
-# EOHI Experiment Data Analysis - Item Level Analysis
-# Variables: NPastDiff_pers_extravert, NPastDiff_pers_critical, NPastDiff_pers_dependable, NPastDiff_pers_anxious, NPastDiff_pers_complex
-#            NFutDiff_pers_extravert, NFutDiff_pers_critical, NFutDiff_pers_dependable, NFutDiff_pers_anxious, NFutDiff_pers_complex
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex)       # For aov_ez (cleaner ANOVA output)
-library(nortest)    # For normality tests
-library(emmeans)    # For post-hoc comparisons
-library(purrr)      # For map functions
-library(effsize)    # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x")))
-print(paste("Number of participants:", length(unique(data$pID))))
-
-# Verify the specific variables we need
-required_vars <- c("NPastDiff_pers_extravert", "NPastDiff_pers_critical", "NPastDiff_pers_dependable", "NPastDiff_pers_anxious", "NPastDiff_pers_complex",
-                   "NFutDiff_pers_extravert", "NFutDiff_pers_critical", "NFutDiff_pers_dependable", "NFutDiff_pers_anxious", "NFutDiff_pers_complex")
-
diff --git a/.history/eohi1/mixed anova - personality_20250916130413.r b/.history/eohi1/mixed anova - personality_20250916130413.r deleted file mode 100644 index 70d9919..0000000 --- a/.history/eohi1/mixed anova - personality_20250916130413.r +++ /dev/null
diff --git a/.history/eohi1/mixed anova - personality_20250917121011.r b/.history/eohi1/mixed anova - personality_20250917121011.r deleted file mode 100644 index 70d9919..0000000 --- a/.history/eohi1/mixed anova - personality_20250917121011.r +++ /dev/null @@ -1,765 +0,0 @@ -# Mixed ANOVA Analysis for Personality Items -# EOHI Experiment Data Analysis - Item Level Analysis -# Variables: NPastDiff_pers_extravert, NPastDiff_pers_critical, NPastDiff_pers_dependable, NPastDiff_pers_anxious, NPastDiff_pers_complex -# NFutDiff_pers_extravert, NFutDiff_pers_critical, NFutDiff_pers_dependable, NFutDiff_pers_anxious, NFutDiff_pers_complex - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- 
read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPastDiff_pers_extravert", "NPastDiff_pers_critical", "NPastDiff_pers_dependable", "NPastDiff_pers_anxious", "NPastDiff_pers_complex", - "NFutDiff_pers_extravert", "NFutDiff_pers_critical", "NFutDiff_pers_dependable", "NFutDiff_pers_anxious", "NFutDiff_pers_complex") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required personality item variables found!") -} - -# Define item mapping -item_mapping <- data.frame( - variable = c("NPastDiff_pers_extravert", "NPastDiff_pers_critical", "NPastDiff_pers_dependable", "NPastDiff_pers_anxious", "NPastDiff_pers_complex", - "NFutDiff_pers_extravert", "NFutDiff_pers_critical", "NFutDiff_pers_dependable", "NFutDiff_pers_anxious", "NFutDiff_pers_complex"), - time = c(rep("Past", 5), rep("Future", 5)), - item = rep(c("extravert", "critical", "dependable", "anxious", "complex"), 2), - stringsAsFactors = FALSE -) - -# Item mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(item_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'item' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - ITEM = factor(item, levels = c("extravert", "critical", "dependable", "anxious", "complex")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and ITEM -desc_stats <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and ITEM:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and 
ITEM:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and ITEM:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each ITEM -homogeneity_time <- long_data_clean %>% - group_by(ITEM) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each ITEM:") -print(homogeneity_time) - -# Test homogeneity across ITEM within each TIME -homogeneity_item <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across ITEM within each TIME:") -print(homogeneity_item) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × ITEM) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") 
-print(table(long_data_clean$TEMPORAL_DO))
-
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × ITEM combination ===")
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, ITEM) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this TIME × ITEM combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # Calculate F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, ITEM, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# More efficient bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!sym(group_var)) %>%
-    dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once ([[ ]] extraction keeps a plain vector, which also
-  # works when the input is a tibble, e.g. from pick() below)
-  group_data_list <- map(groups, ~ {
-    group_data <- data[[response_var]][data[[group_var]] == .x]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × ITEM combination
-print("\n=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, ITEM) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, ITEM, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
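-# --- Optional sanity check (illustrative, not part of the original analysis) --
-# A minimal sketch of how bootstrap_hartley_critical() behaves under the null:
-# on two simulated groups with equal population variances, the observed F-max
-# ratio should usually fall below the bootstrap 95% critical value. The data
-# frame, column names, and seed here are hypothetical.
-set.seed(42)
-sim_data <- data.frame(
-  grp = rep(c("A", "B"), each = 200),
-  y   = rnorm(400, mean = 0, sd = 1)
-)
-sim_check <- bootstrap_hartley_critical(sim_data, "grp", "y", n_iter = 500)
-print(sim_check$observed_ratio)  # close to 1 when variances are truly equal
-print(sim_check$critical_95)     # observed ratio should sit below this cutoff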
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check for missing data patterns
-table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM, useNA = "ifany")
-
-# Check data balance
-xtabs(~ pID + TIME + ITEM, data = long_data_clean)
-
-# Check data dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$pID))))
-print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME))))
-print(paste("Number of ITEM levels:", length(levels(long_data_clean$ITEM))))
-print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO))))
-
-# Check for complete cases
-complete_cases <- long_data_clean[complete.cases(long_data_clean), ]
-print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean)))
-
-# Check if design is balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM)
-
-print(summary(as.vector(design_balance)))
-
-# Check for any participants with missing combinations
-missing_combos <- long_data_clean %>%
-  group_by(pID) %>%
-  summarise(
-    n_combinations = n(),
-    expected_combinations = 10, # 2 TIME × 5 ITEM = 10
-    missing_combinations = 10 - n_combinations,
-    .groups = 'drop'
-  )
-
-print("Missing combinations per participant:")
-print(missing_combos[missing_combos$missing_combinations > 0, ])
-
-# Mixed ANOVA using aov() - Traditional approach
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × ITEM (5 levels: extravert, critical, dependable, anxious, complex)
-
-mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * ITEM + Error(pID/(TIME * ITEM)),
-                         data = long_data_clean)
-
-print("Mixed ANOVA Results (aov):")
-print(summary(mixed_anova_model))
-
-# Alternative: Using afex::aov_ez for cleaner output (optional)
-print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===")
-mixed_anova_afex <- aov_ez(id = "pID",
-                           dv = "MEAN_DIFFERENCE",
-                           data = long_data_clean,
-                           between = "TEMPORAL_DO",
-                           within = c("TIME", "ITEM"))
-
-print("Mixed ANOVA Results (afex):")
-print(mixed_anova_afex)
-
-# =============================================================================
-# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS
-# =============================================================================
-
-# Sphericity tests using ezANOVA (library already loaded)
-
-print("\n=== SPHERICITY TESTS ===")
-
-# Test sphericity for ITEM (5 levels - within-subjects)
-print("Mauchly's Test of Sphericity for ITEM:")
-tryCatch({
-  # Create a temporary data frame for ezANOVA
-  temp_data <- long_data_clean
-  temp_data$id <- as.numeric(as.factor(temp_data$pID))
-
-  # Run ezANOVA to get sphericity tests
-  ez_item <- ezANOVA(data = temp_data,
-                     dv = MEAN_DIFFERENCE,
-                     wid = id,
-                     within = ITEM,
-                     type = 3,
-                     detailed = TRUE)
-
-  print("ITEM Sphericity Test:")
-  print(ez_item$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in ITEM sphericity test:", e$message))
-})
-
-# Test sphericity for TIME (2 levels - within-subjects)
-# (With only 2 levels, sphericity holds trivially, so no correction is needed for TIME)
-print("\nMauchly's Test of Sphericity for TIME:")
-tryCatch({
-  ez_time <- ezANOVA(data = temp_data,
-                     dv = MEAN_DIFFERENCE,
-                     wid = id,
-                     within = TIME,
-                     type = 3,
-                     detailed = TRUE)
-
-  print("TIME Sphericity Test:")
-  print(ez_time$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in TIME sphericity test:", e$message))
-})
-
-# Test sphericity for TIME × ITEM interaction
-print("\nMauchly's Test of Sphericity for TIME × ITEM Interaction:")
-tryCatch({
-  ez_interaction <- ezANOVA(data = temp_data,
-                            dv = MEAN_DIFFERENCE,
-                            wid = id,
-                            within = .(TIME, ITEM),
-                            type = 3,
-                            detailed = TRUE)
-
-  print("TIME × ITEM Sphericity Test:")
-  print(ez_interaction$Mauchly)
-
-}, error =
function(e) { - print(paste("Error in TIME × ITEM sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - # Get the sphericity corrections - sphericity_corr <- ez_corrected$`Sphericity Corrections` - - # Extract original degrees of freedom from ANOVA table - anova_table <- ez_corrected$ANOVA - - # Calculate corrected degrees of freedom - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Also show the corrected F-values and p-values with degrees of freedom - cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n") - for(i in 1:nrow(corrected_df)) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to 
wide format for car package
-  wide_data <- long_data_clean %>%
-    select(pID, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>%
-    pivot_wider(names_from = c(TIME, ITEM),
-                values_from = MEAN_DIFFERENCE,
-                names_sep = "_")
-
-  # The repeated measures columns produced by pivot_wider are TIME_ITEM combinations
-  within_vars <- c("Past_extravert", "Past_critical", "Past_dependable", "Past_anxious", "Past_complex",
-                   "Future_extravert", "Future_critical", "Future_dependable", "Future_anxious", "Future_complex")
-
-  # Check if all columns exist
-  missing_cols <- within_vars[!within_vars %in% colnames(wide_data)]
-  if(length(missing_cols) > 0) {
-    print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", ")))
-  } else {
-    # Create the repeated measures design
-    rm_design <- as.matrix(wide_data[, within_vars])
-
-    # Calculate epsilon values
-    # (NOTE: a standalone epsilon() is not exported by base car; if it is not
-    # available in your setup, this tryCatch reports the error and the block is
-    # skipped. A manual fallback sketch follows this section.)
-    print("Epsilon Values from car package:")
-    epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser")
-    epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt")
-
-    print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4)))
-    print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4)))
-
-    # Interpretation
-    if(epsilon_gg < 0.75) {
-      print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)")
-    } else if(epsilon_hf > 0.75) {
-      print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)")
-    } else {
-      print("Recommendation: Use Greenhouse-Geisser correction (conservative)")
-    }
-
-    # =============================================================================
-    # MANUAL SPHERICITY CORRECTIONS
-    # =============================================================================
-
-    print("\n=== MANUAL SPHERICITY CORRECTIONS ===")
-
-    # Apply corrections to the original ANOVA results
-    print("Applying Greenhouse-Geisser corrections to ITEM effects:")
-
-    # ITEM main effect (DFn = 4, DFd = 4244)
-    item_df_corrected_gg <- 4 * epsilon_gg
-    item_df_corrected_hf <- 4 * epsilon_hf
-
-    print(paste("ITEM: Original df = 4, 4244"))
-    print(paste("ITEM: GG corrected df =", round(item_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2)))
-    print(paste("ITEM: HF corrected df =", round(item_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2)))
-
-    # TIME × ITEM interaction (DFn = 4, DFd = 4244)
-    interaction_df_corrected_gg <- 4 * epsilon_gg
-    interaction_df_corrected_hf <- 4 * epsilon_hf
-
-    print(paste("TIME × ITEM: Original df = 4, 4244"))
-    print(paste("TIME × ITEM: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2)))
-    print(paste("TIME × ITEM: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2)))
-
-    # Note: You would need to recalculate p-values with these corrected dfs
-    print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom")
-    print("The ezANOVA function should handle this automatically, but may not display the corrections")
-  }
-
-}, error = function(e) {
-  print(paste("Error in manual epsilon calculation:", e$message))
-})
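-# --- Illustrative fallback: manual Greenhouse-Geisser (Box) epsilon ----------
-# A minimal sketch in case epsilon() is unavailable. gg_epsilon() is a
-# hypothetical helper, not part of the original script: it computes Box's
-# epsilon-hat from the sample covariance matrix of the k repeated measures
-# (rows = participants, columns = conditions), using the standard formula
-# eps = [k(mean(diag(S)) - mean(S))]^2 /
-#       [(k-1)(sum(S^2) - 2k*sum(rowMeans(S)^2) + k^2*mean(S)^2)].
-gg_epsilon <- function(X) {
-  S <- cov(X, use = "pairwise.complete.obs")  # k x k covariance matrix
-  k <- ncol(S)
-  s_bar  <- mean(S)        # grand mean of all entries
-  s_diag <- mean(diag(S))  # mean of the variances
-  s_row  <- rowMeans(S)    # row means
-  num <- (k * (s_diag - s_bar))^2
-  den <- (k - 1) * (sum(S^2) - 2 * k * sum(s_row^2) + k^2 * s_bar^2)
-  num / den                # ranges from 1/(k-1) up to 1
-}
-# Example usage (assumes wide_data and within_vars from the block above exist):
-# print(gg_epsilon(as.matrix(wide_data[, within_vars])))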
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-# Effect size calculations (library already loaded)
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Calculate generalized eta squared for the aov model
-print("Effect Sizes from aov() model:")
-tryCatch({
-  # Extract generalized eta squared from the aov model
-  aov_effects <- eta_squared(mixed_anova_model, generalized = TRUE)
-  # Round only the numeric columns (round() on the whole table would fail on the character columns)
-  print(as.data.frame(aov_effects) %>% mutate(across(where(is.numeric), ~ round(.x, 5))))
-}, error = function(e) {
-  print(paste("Error calculating effect sizes from aov:", e$message))
-})
-
-# Extract effect sizes for the ezANOVA model
-print("\nEffect Sizes from ezANOVA model:")
-tryCatch({
-  # ezANOVA already reports generalized eta squared in the 'ges' column
-  ez_effects <- ez_corrected$ANOVA
-  print("Generalized Eta Squared from ezANOVA:")
-  print(ez_effects[, c("Effect", "ges")] %>% mutate(across(where(is.numeric), ~ round(.x, 5))))
-}, error = function(e) {
-  print(paste("Error extracting effect sizes from ezANOVA:", e$message))
-})
-
-# Extract effect sizes (generalized eta squared)
-# For aov() objects, we need to extract from the summary
-anova_summary <- summary(mixed_anova_model)
-
-# =============================================================================
-# NOTE: MIXED MODELS (LMER) NOT NEEDED
-# =============================================================================
-
-# For this balanced repeated measures design, Type III ANOVA with proper
-# sphericity corrections (implemented above) is the most appropriate approach.
-# Mixed models (lmer) are typically used for:
-# - Unbalanced designs
-# - Missing data patterns
-# - Nested random effects
-# - Large, complex datasets
-#
-# Your design is balanced and complete, making Type III ANOVA optimal.
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(mixed_anova_model, ~ TIME)
-print("Estimated Marginal Means:")
-print(time_emmeans)
-print("\nPairwise Contrasts:")
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of ITEM
-print("\nMain Effect of ITEM:")
-item_emmeans <- emmeans(mixed_anova_model, ~ ITEM)
-print("Estimated Marginal Means:")
-print(item_emmeans)
-print("\nPairwise Contrasts:")
-item_contrasts <- pairs(item_emmeans, adjust = "bonferroni")
-print(item_contrasts)
-
-
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-
-# TIME × ITEM Interaction
-print("\n=== TIME × ITEM INTERACTION ===")
-time_item_emmeans <- emmeans(mixed_anova_model, ~ TIME * ITEM)
-print("Estimated Marginal Means:")
-print(time_item_emmeans)
-
-print("\nSimple Effects of ITEM within each TIME:")
-time_item_simple <- pairs(time_item_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_item_simple)
-
-print("\nSimple Effects of TIME within each ITEM:")
-time_item_simple2 <- pairs(time_item_emmeans, by = "ITEM", adjust = "bonferroni")
-print(time_item_simple2)
-
-
-# =============================================================================
-# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS
-# =============================================================================
-
-
-# =============================================================================
-# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS
-# =============================================================================
-
-# Cohen's d calculations (library already loaded)
-
-print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===")
-
-# Function to calculate Cohen's d for pairwise comparisons
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if(nrow(significant_pairs) > 0) {
-    cat("Significant pairwise comparisons (p < 0.05):\n")
-    print(significant_pairs)
-
-    cat("\nCohen's d calculated from raw data:\n")
-
-    for(i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- as.character(comparison$contrast)
-
-      # Parse the contrast
-      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-      if(length(contrast_parts) == 2) {
-        level1 <- trimws(contrast_parts[1])
-        level2 <- trimws(contrast_parts[2])
-
-        # Get raw data for both conditions
-        if(group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][
-            data[[group1_var]] == level1 &
-            data[[group2_var]] == group2_level]
-
-          data2 <- data[[response_var]][
-            data[[group1_var]] == level2 &
-            data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if(length(data1) > 0 && length(data2) > 0) {
-          # Calculate Cohen's d using effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          cat(sprintf("Comparison: %s", contrast_name))
-          if(group2_var %in% colnames(comparison)) {
-            cat(sprintf(" | %s", group2_level))
-          }
-          cat(sprintf("\n  n1 = %d, n2 = %d\n", length(data1), length(data2)))
-          cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-          cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-          cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-          cat("\n")
-        }
-      }
-    }
-  } else {
-    cat("No significant pairwise comparisons found.\n")
-  }
-}
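-# --- Illustrative check of cohen.d() against the pooled-SD formula ----------
-# A minimal sketch, not part of the analysis: with its defaults (pooled SD,
-# no Hedges correction), effsize::cohen.d() should match the textbook formula
-# d = (mean1 - mean2) / s_pooled. The vectors x1/x2 are simulated and
-# hypothetical.
-set.seed(99)
-x1 <- rnorm(50, mean = 0.3, sd = 1)
-x2 <- rnorm(50, mean = 0.0, sd = 1)
-s_pooled <- sqrt(((length(x1) - 1) * var(x1) + (length(x2) - 1) * var(x2)) /
-                   (length(x1) + length(x2) - 2))
-d_manual <- (mean(x1) - mean(x2)) / s_pooled
-print(d_manual)
-print(cohen.d(x1, x2)$estimate)  # should agree with d_manual under the defaults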
-
-
-# =============================================================================
-# TIME × ITEM INTERACTION
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TIME × ITEM INTERACTION ===")
-
-# Get simple effects of TIME within each ITEM
-time_item_simple2_df <- as.data.frame(time_item_simple2)
-calculate_cohens_d_for_pairs(time_item_simple2_df, long_data_clean, "TIME", "ITEM", "MEAN_DIFFERENCE")
-
-# Get simple effects of ITEM within each TIME
-time_item_simple_df <- as.data.frame(time_item_simple)
-calculate_cohens_d_for_pairs(time_item_simple_df, long_data_clean, "ITEM", "TIME", "MEAN_DIFFERENCE")
-
diff --git a/.history/eohi1/mixed anova - preferences_20250916113624.r b/.history/eohi1/mixed anova - preferences_20250916113624.r
deleted file mode 100644
index 8832371..0000000
--- a/.history/eohi1/mixed anova - preferences_20250916113624.r
+++ /dev/null
@@ -1,836 +0,0 @@
-# Mixed ANOVA Analysis for Preference Items
-# EOHI Experiment Data Analysis - Item Level Analysis
-# Variables: NPastDiff_pref_read, NPastDiff_pref_music, NPastDiff_pref_tv, NPastDiff_pref_nap, NPastDiff_pref_travel
-#            NFutDiff_pref_read, NFutDiff_pref_music, NFutDiff_pref_tv, NFutDiff_pref_nap, NFutDiff_pref_travel
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex)       # For aov_ez (cleaner ANOVA output)
-library(nortest)    # For normality tests
-library(emmeans)    # For post-hoc comparisons
-library(purrr)      # For map functions
-library(effsize)    # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x")))
-print(paste("Number of participants:", length(unique(data$pID))))
-
-# Verify the specific variables we need
-required_vars <- c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel",
-                   "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required preference item variables found!")
-}
-
-# Define item mapping
-item_mapping <- data.frame(
-  variable = c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel",
-               "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel"),
-  time = c(rep("Past", 5), rep("Future", 5)),
-  item = rep(c("read", "music", "tv", "nap", "travel"), 2),
-  stringsAsFactors = FALSE
-)
-
-# Item mapping created
-
-# Efficient data pivoting using pivot_longer
-long_data <- data %>%
-  select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>%
-  pivot_longer(
-    cols = all_of(required_vars),
-    names_to = "variable",
-    values_to = "MEAN_DIFFERENCE"
-  ) %>%
-  left_join(item_mapping, by = "variable") %>%
-  # Convert to factors with proper levels (note: columns are 'time' and 'item' from mapping)
-  mutate(
-    TIME = factor(time, levels = c("Past", "Future")),
-    ITEM = factor(item, levels = c("read",
"music", "tv", "nap", "travel")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and DOMAIN:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME:") -print(homogeneity_domain) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × DOMAIN) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × DOMAIN combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ 
{ - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + DOMAIN, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of DOMAIN levels:", length(levels(long_data_clean$DOMAIN)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 8, # 2 TIME × 4 DOMAIN = 8 - missing_combinations = 8 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (4 levels: Preferences, Personality, Values, Life) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + Error(pID/(TIME * DOMAIN)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = 
"MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "DOMAIN")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for DOMAIN (4 levels - within-subjects) -print("Mauchly's Test of Sphericity for DOMAIN:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_domain <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = DOMAIN, - type = 3, - detailed = TRUE) - - print("DOMAIN Sphericity Test:") - print(ez_domain$Mauchly) - -}, error = function(e) { - print(paste("Error in DOMAIN sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × DOMAIN interaction -print("\nMauchly's Test of Sphericity for TIME × DOMAIN Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - - print("TIME × DOMAIN Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × DOMAIN sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, DOMAIN), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - # Get the sphericity corrections - sphericity_corr <- ez_corrected$`Sphericity Corrections` - - # Extract original degrees of freedom from ANOVA table - anova_table <- ez_corrected$ANOVA - - # Calculate corrected degrees of freedom - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = 
anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Also show the corrected F-values and p-values with degrees of freedom - cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n") - for(i in 1:nrow(corrected_df)) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, DOMAIN, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, DOMAIN), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # 
============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to DOMAIN effects:") - - # DOMAIN main effect (DFn = 3, DFd = 3183) - domain_df_corrected_gg <- 3 * epsilon_gg - domain_df_corrected_hf <- 3 * epsilon_hf - - print(paste("DOMAIN: Original df = 3, 3183")) - print(paste("DOMAIN: GG corrected df =", round(domain_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("DOMAIN: HF corrected df =", round(domain_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # TIME × DOMAIN interaction (DFn = 3, DFd = 3183) - interaction_df_corrected_gg <- 3 * epsilon_gg - interaction_df_corrected_hf <- 3 * epsilon_hf - - print(paste("TIME × DOMAIN: Original df = 3, 3183")) - print(paste("TIME × DOMAIN: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(3183 * epsilon_gg, 2))) - print(paste("TIME × DOMAIN: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(3183 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. -# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. 
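-# For reference, a minimal sketch of the lmer alternative described above,
-# assuming lme4 is installed (it is not loaded elsewhere in this script).
-# The model formula mirrors the ANOVA design with a random intercept per
-# participant; this is illustrative only and deliberately left unexecuted:
-# library(lme4)
-# lmm <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * DOMAIN + (1 | pID),
-#             data = long_data_clean)
-# summary(lmm)
-# anova(lmm)  # with lmerTest loaded, this adds Satterthwaite df and p-values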
- -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(mixed_anova_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × DOMAIN Interaction (Significant: p = 0.012) -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(mixed_anova_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) 
-print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. 
TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - diff --git a/.history/eohi1/mixed anova - preferences_20250916113646.r b/.history/eohi1/mixed anova - preferences_20250916113646.r deleted file mode 100644 index f44be6b..0000000 --- a/.history/eohi1/mixed anova - preferences_20250916113646.r +++ /dev/null @@ -1,836 +0,0 @@ -# Mixed ANOVA Analysis for Preference Items -# EOHI Experiment Data Analysis - Item Level Analysis -# Variables: NPastDiff_pref_read, NPastDiff_pref_music, NPastDiff_pref_tv, NPastDiff_pref_nap, NPastDiff_pref_travel -# NFutDiff_pref_read, NFutDiff_pref_music, NFutDiff_pref_tv, NFutDiff_pref_nap, NFutDiff_pref_travel - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required preference item variables found!") -} - -# Define item mapping -item_mapping <- data.frame( - variable = c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", 
"NPastDiff_pref_nap", "NPastDiff_pref_travel", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel"), - time = c(rep("Past", 5), rep("Future", 5)), - item = rep(c("read", "music", "tv", "nap", "travel"), 2), - stringsAsFactors = FALSE -) - -# Item mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(item_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'item' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - ITEM = factor(item, levels = c("read", "music", "tv", "nap", "travel")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and ITEM -desc_stats <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and ITEM:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and ITEM:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and ITEM:") -print(missing_summary) - -# 2. 
Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, ITEM) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test (streamlined)
-normality_results <- long_data_clean %>%
-  group_by(TIME, ITEM) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic,
-    ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# Round only the numeric columns
-normality_results_rounded <- normality_results %>%
-  mutate(across(where(is.numeric), ~ round(.x, 5)))
-print(normality_results_rounded)
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each ITEM
-homogeneity_time <- long_data_clean %>%
-  group_by(ITEM) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each ITEM:")
-print(homogeneity_time)
-
-# Test homogeneity across ITEM within each TIME
-homogeneity_item <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across ITEM within each TIME:")
-print(homogeneity_item)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × ITEM)
-
-# First, let's check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × ITEM combination ===")
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, ITEM) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this TIME × ITEM combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # Calculate F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, ITEM, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# More efficient bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!sym(group_var)) %>%
-    dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once
-  group_data_list <- map(groups, ~ {
-    group_data <- data[data[[group_var]] == .x, response_var]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × ITEM combination
-print("\n=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, ITEM) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, ITEM, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
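-
-# A minimal usage sketch (illustrative addition, not part of the original
-# analysis): spot-check the bootstrap helper on a single within-subjects cell
-# before trusting the mapped results above. The cell chosen here (Past
-# ratings of the "read" item) is arbitrary.
-one_cell <- subset(long_data_clean, TIME == "Past" & ITEM == "read")
-set.seed(123)  # reproducible bootstrap draws
-spot_check <- bootstrap_hartley_critical(one_cell, "TEMPORAL_DO", "MEAN_DIFFERENCE", n_iter = 1000)
-print(spot_check$observed_ratio)  # observed max/min variance ratio
-print(spot_check$critical_95)     # bootstrap 95th-percentile cutoff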
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check for missing data patterns
-table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM, useNA = "ifany")
-
-# Check data balance
-xtabs(~ pID + TIME + ITEM, data = long_data_clean)
-
-# Check data dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$pID))))
-print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME))))
-print(paste("Number of ITEM levels:", length(levels(long_data_clean$ITEM))))
-print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO))))
-
-# Check for complete cases
-complete_cases <- long_data_clean[complete.cases(long_data_clean), ]
-print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean)))
-
-# Check if design is balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM)
-
-print(summary(as.vector(design_balance)))
-
-# Check for any participants with missing combinations
-missing_combos <- long_data_clean %>%
-  group_by(pID) %>%
-  summarise(
-    n_combinations = n(),
-    expected_combinations = 10, # 2 TIME × 5 ITEM = 10
-    missing_combinations = 10 - n_combinations,
-    .groups = 'drop'
-  )
-
-print("Missing combinations per participant:")
-print(missing_combos[missing_combos$missing_combinations > 0, ])
-
-# Mixed ANOVA using aov() - Traditional approach
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × ITEM (5 levels: read, music, tv, nap, travel)
-
-mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * ITEM + Error(pID/(TIME * ITEM)),
-                         data = long_data_clean)
-
-print("Mixed ANOVA Results (aov):")
-print(summary(mixed_anova_model))
-
-# Alternative: Using afex::aov_ez for cleaner output (optional)
-print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===")
-mixed_anova_afex <- aov_ez(id = "pID",
-                           dv = "MEAN_DIFFERENCE",
-                           data = long_data_clean,
-                           between = "TEMPORAL_DO",
-                           within = c("TIME", "ITEM"))
-
-print("Mixed ANOVA Results (afex):")
-print(mixed_anova_afex)
-
-# =============================================================================
-# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS
-# =============================================================================
-
-# Sphericity tests using ezANOVA (library already loaded)
-
-print("\n=== SPHERICITY TESTS ===")
-
-# Test sphericity for ITEM (5 levels - within-subjects)
-print("Mauchly's Test of Sphericity for ITEM:")
-tryCatch({
-  # Create a temporary data frame for ezANOVA
-  temp_data <- long_data_clean
-  temp_data$id <- as.numeric(as.factor(temp_data$pID))
-
-  # Run ezANOVA to get sphericity tests
-  ez_item <- ezANOVA(data = temp_data,
-                     dv = MEAN_DIFFERENCE,
-                     wid = id,
-                     between = TEMPORAL_DO,
-                     within = ITEM,
-                     type = 3,
-                     detailed = TRUE)
-
-  print("ITEM Sphericity Test:")
-  print(ez_item$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in ITEM sphericity test:", e$message))
-})
-
-# Test sphericity for TIME (2 levels - within-subjects)
-print("\nMauchly's Test of Sphericity for TIME:")
-tryCatch({
-  ez_time <- ezANOVA(data = temp_data,
-                     dv = MEAN_DIFFERENCE,
-                     wid = id,
-                     between = TEMPORAL_DO,
-                     within = TIME,
-                     type = 3,
-                     detailed = TRUE)
-
-  print("TIME Sphericity Test:")
-  print(ez_time$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in TIME sphericity test:", e$message))
-})
-
-# Test sphericity for TIME × ITEM interaction
-print("\nMauchly's Test of Sphericity for TIME × ITEM Interaction:")
-tryCatch({
-  ez_interaction <- ezANOVA(data = temp_data,
-                            dv = MEAN_DIFFERENCE,
-                            wid = id,
-                            between = TEMPORAL_DO,
-                            within = .(TIME, ITEM),
-                            type = 3,
-                            detailed = TRUE)
-
-  print("TIME × ITEM Sphericity Test:")
-  print(ez_interaction$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in TIME × ITEM sphericity test:", e$message))
-})
-
-# =============================================================================
-# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Get corrected results from ezANOVA
-ez_corrected <- ezANOVA(data = long_data_clean,
-                        dv = MEAN_DIFFERENCE,
-                        wid = pID,
-                        between = TEMPORAL_DO,
-                        within = .(TIME, ITEM),
-                        type = 3,
-                        detailed = TRUE)
-
-print("Corrected ANOVA Results with Sphericity Corrections:")
-print(ez_corrected$ANOVA)
-
-# Show epsilon values for sphericity corrections
-print("\nEpsilon Values for Sphericity Corrections:")
-print(ez_corrected$Mauchly)
-
-# Show sphericity-corrected results
-print("\nSphericity-Corrected Results:")
-print("Available elements in ez_corrected object:")
-print(names(ez_corrected))
-
-# Check if sphericity corrections are available
-if(!is.null(ez_corrected$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(ez_corrected$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  # Get the sphericity corrections
-  sphericity_corr <- ez_corrected$`Sphericity Corrections`
-
-  # Extract original degrees of freedom from ANOVA table
-  anova_table <- ez_corrected$ANOVA
-
-  # Calculate corrected degrees of freedom
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  # Also show the corrected F-values and p-values with degrees of freedom
-  cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n")
-  for(i in 1:nrow(corrected_df)) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original: F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  print("\nNote: Sphericity corrections may not be displayed if sphericity is met")
-  print("Check the Mauchly's test p-values above to determine if corrections are needed")
-}
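-
-# A small helper sketch (illustrative addition): the GG/HF corrections only
-# rescale the degrees of freedom, so a corrected p-value is just the upper
-# tail of the F distribution at the scaled dfs. The numbers in the example
-# call are made up.
-gg_corrected_p <- function(f_value, df_num, df_den, eps) {
-  pf(f_value, df_num * eps, df_den * eps, lower.tail = FALSE)
-}
-# Example: gg_corrected_p(3.2, 4, 400, 0.85)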
-
-# =============================================================================
-# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE
-# =============================================================================
-
-print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===")
-
-# Create a wide-format data for car package (library already loaded)
-
-tryCatch({
-  # Convert to wide format for car package
-  wide_data <- long_data_clean %>%
-    select(pID, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>%
-    pivot_wider(names_from = c(TIME, ITEM),
-                values_from = MEAN_DIFFERENCE,
-                names_sep = "_")
-
-  # Create the repeated measures design
-  within_vars <- c("Past_read", "Past_music", "Past_tv", "Past_nap", "Past_travel",
-                   "Future_read", "Future_music", "Future_tv", "Future_nap", "Future_travel")
-
-  # Check if all columns exist
-  missing_cols <- within_vars[!within_vars %in% colnames(wide_data)]
-  if(length(missing_cols) > 0) {
-    print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", ")))
-  } else {
-    # Create the repeated measures design
-    rm_design <- as.matrix(wide_data[, within_vars])
-
-    # Calculate epsilon values
-    # NOTE: epsilon() assumes a helper that returns sphericity epsilons for the
-    # repeated-measures matrix; it may not be available in the installed car
-    # version, in which case summary(car::Anova(lm(rm_design ~ 1), idata = ...,
-    # idesign = ...), multivariate = FALSE) reports the GG and HF epsilons
-    print("Epsilon Values from car package:")
-    epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser")
-    epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt")
-
-    print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4)))
-    print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4)))
-
-    # Interpretation
-    if(epsilon_gg < 0.75) {
-      print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)")
-    } else if(epsilon_hf > 0.75) {
-      print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)")
-    } else {
-      print("Recommendation: Use Greenhouse-Geisser correction (conservative)")
-    }
-
-    # =============================================================================
-    # MANUAL SPHERICITY CORRECTIONS
-    # =============================================================================
-
-    print("\n=== MANUAL SPHERICITY CORRECTIONS ===")
-
-    # Apply corrections to the original ANOVA results
-    print("Applying Greenhouse-Geisser corrections to ITEM effects:")
-
-    # Pull the uncorrected degrees of freedom from the ezANOVA table rather
-    # than hardcoding them
-    anova_tab <- ez_corrected$ANOVA
-    for(eff in c("ITEM", "TIME:ITEM")) {
-      eff_row <- anova_tab[anova_tab$Effect == eff, ]
-      if(nrow(eff_row) == 1) {
-        print(paste(eff, ": Original df =", eff_row$DFn, ",", eff_row$DFd))
-        print(paste(eff, ": GG corrected df =", round(eff_row$DFn * epsilon_gg, 2), ",", round(eff_row$DFd * epsilon_gg, 2)))
-        print(paste(eff, ": HF corrected df =", round(eff_row$DFn * epsilon_hf, 2), ",", round(eff_row$DFd * epsilon_hf, 2)))
-      }
-    }
-
-    # Note: You would need to recalculate p-values with these corrected dfs
-    print("\nNote: To get corrected p-values, recalculate the F tail probability at the corrected degrees of freedom (see the helper sketch above)")
-    print("The ezANOVA function should handle this automatically, but may not display the corrections")
-  }
-
-}, error = function(e) {
-  print(paste("Error in manual epsilon calculation:", e$message))
-})
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-# Effect size calculations (library already loaded)
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Calculate generalized eta squared for the aov model
-print("Effect Sizes from aov() model:")
-tryCatch({
-  # Extract effect sizes from aov model; round only the numeric columns
-  aov_effects <- as.data.frame(eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE))
-  print(mutate(aov_effects, across(where(is.numeric), ~ round(.x, 5))))
-}, error = function(e) {
-  print(paste("Error calculating effect sizes from aov:", e$message))
-})
-
-# Calculate effect sizes for ezANOVA model
-print("\nEffect Sizes from ezANOVA model:")
-tryCatch({
-  # ezANOVA already reports generalized eta squared in its 'ges' column
-  ez_effects <- ez_corrected$ANOVA
-  print("Generalized Eta Squared from ezANOVA:")
-  print(data.frame(Effect = ez_effects$Effect, ges = round(ez_effects$ges, 5)))
-}, error = function(e) {
-  print(paste("Error extracting effect sizes from ezANOVA:", e$message))
-})
-
-# Extract effect sizes (generalized eta squared)
-# For aov() objects, we need to extract from the summary
-anova_summary <- summary(mixed_anova_model)
-
-# =============================================================================
-# NOTE: MIXED MODELS (LMER) NOT NEEDED
-# =============================================================================
-
-# For this balanced repeated measures design, Type III ANOVA with proper
-# sphericity corrections (implemented above) is the most appropriate approach.
-# Mixed models (lmer) are typically used for:
-# - Unbalanced designs
-# - Missing data patterns
-# - Nested random effects
-# - Large, complex datasets
-#
-# This design is balanced and complete, making Type III ANOVA a reasonable default.
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(mixed_anova_model, ~ TIME)
-print("Estimated Marginal Means:")
-print(time_emmeans)
-print("\nPairwise Contrasts:")
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of ITEM
-print("\nMain Effect of ITEM:")
-item_emmeans <- emmeans(mixed_anova_model, ~ ITEM)
-print("Estimated Marginal Means:")
-print(item_emmeans)
-print("\nPairwise Contrasts:")
-item_contrasts <- pairs(item_emmeans, adjust = "bonferroni")
-print(item_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TEMPORAL_DO × TIME Interaction (check significance in the ANOVA table above)
-print("\n=== TEMPORAL_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO:")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each TIME:")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# TIME × ITEM Interaction (check significance in the ANOVA table above)
-print("\n=== TIME × ITEM INTERACTION ===")
-time_item_emmeans <- emmeans(mixed_anova_model, ~ TIME * ITEM)
-print("Estimated Marginal Means:")
-print(time_item_emmeans)
-
-print("\nSimple Effects of ITEM within each TIME:")
-time_item_simple <- pairs(time_item_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_item_simple)
-
-print("\nSimple Effects of TIME within each ITEM:")
-time_item_simple2 <- pairs(time_item_emmeans, by = "ITEM", adjust = "bonferroni")
-print(time_item_simple2)
-
-# TEMPORAL_DO × ITEM Interaction (check significance in the ANOVA table above)
-print("\n=== TEMPORAL_DO × ITEM INTERACTION ===")
-temporal_item_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * ITEM)
-print("Estimated Marginal Means:")
-print(temporal_item_emmeans)
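-
-# A quick visual check (illustrative addition): emmeans::emmip() plots the
-# estimated marginal means, which makes the TEMPORAL_DO × ITEM pattern easier
-# to scan than the printed grid.
-print(emmip(temporal_item_emmeans, TEMPORAL_DO ~ ITEM))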
-
-print("\nSimple Effects of ITEM within each TEMPORAL_DO:")
-temporal_item_simple <- pairs(temporal_item_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_item_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each ITEM:")
-temporal_item_simple2 <- pairs(temporal_item_emmeans, by = "ITEM", adjust = "bonferroni")
-print(temporal_item_simple2)
-
-# =============================================================================
-# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS
-# =============================================================================
-
-# All pairwise comparisons for the three-way interaction
-print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===")
-three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * ITEM)
-print("Estimated Marginal Means for all combinations:")
-print(three_way_emmeans)
-
-# Pairwise comparisons within each TEMPORAL_DO × TIME combination
-print("\nPairwise comparisons of ITEM within each TEMPORAL_DO × TIME combination:")
-three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni")
-print(three_way_contrasts)
-
-# =============================================================================
-# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS
-# =============================================================================
-
-# Cohen's d calculations (library already loaded)
-
-print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===")
-
-# Function to calculate Cohen's d for pairwise comparisons
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if(nrow(significant_pairs) > 0) {
-    cat("Significant pairwise comparisons (p < 0.05):\n")
-    print(significant_pairs)
-
-    cat("\nCohen's d calculated from raw data:\n")
-
-    for(i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- as.character(comparison$contrast)
-
-      # Parse the contrast
-      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-      if(length(contrast_parts) == 2) {
-        level1 <- trimws(contrast_parts[1])
-        level2 <- trimws(contrast_parts[2])
-
-        # Get raw data for both conditions
-        if(group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][
-            data[[group1_var]] == level1 &
-            data[[group2_var]] == group2_level]
-
-          data2 <- data[[response_var]][
-            data[[group1_var]] == level2 &
-            data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if(length(data1) > 0 && length(data2) > 0) {
-          # Calculate Cohen's d using effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          cat(sprintf("Comparison: %s", contrast_name))
-          if(group2_var %in% colnames(comparison)) {
-            cat(sprintf(" | %s", group2_level))
-          }
-          cat(sprintf("\n  n1 = %d, n2 = %d\n", length(data1), length(data2)))
-          cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-          cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-          cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-          cat("\n")
-        }
-      }
-    }
-  } else {
-    cat("No significant pairwise comparisons found.\n")
-  }
-}
-
-# =============================================================================
-# 1. TEMPORAL_DO × TIME INTERACTION
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===")
-
-# Get simple effects of TIME within each TEMPORAL_DO
-temporal_time_simple_df <- as.data.frame(temporal_time_simple)
-calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
-# Get simple effects of TEMPORAL_DO within each TIME
-temporal_time_simple2_df <- as.data.frame(temporal_time_simple2)
-calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 2. TIME × ITEM INTERACTION
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TIME × ITEM INTERACTION ===")
-
-# Get simple effects of TIME within each ITEM
-time_item_simple2_df <- as.data.frame(time_item_simple2)
-calculate_cohens_d_for_pairs(time_item_simple2_df, long_data_clean, "TIME", "ITEM", "MEAN_DIFFERENCE")
-
-# Get simple effects of ITEM within each TIME
-time_item_simple_df <- as.data.frame(time_item_simple)
-calculate_cohens_d_for_pairs(time_item_simple_df, long_data_clean, "ITEM", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 3. TEMPORAL_DO × ITEM INTERACTION
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × ITEM INTERACTION ===")
-
-# Get simple effects of TEMPORAL_DO within each ITEM
-temporal_item_simple2_df <- as.data.frame(temporal_item_simple2)
-calculate_cohens_d_for_pairs(temporal_item_simple2_df, long_data_clean, "TEMPORAL_DO", "ITEM", "MEAN_DIFFERENCE")
-
-# Get simple effects of ITEM within each TEMPORAL_DO
-temporal_item_simple_df <- as.data.frame(temporal_item_simple)
-calculate_cohens_d_for_pairs(temporal_item_simple_df, long_data_clean, "ITEM", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
diff --git a/.history/eohi1/mixed anova - preferences_20250916113702.r b/.history/eohi1/mixed anova - preferences_20250916113702.r
deleted file mode 100644
index 8f0c8d8..0000000
--- a/.history/eohi1/mixed anova - preferences_20250916113702.r
+++ /dev/null
@@ -1,836 +0,0 @@
-# Mixed ANOVA Analysis for Preference Items
-# EOHI Experiment Data Analysis - Item Level Analysis
-# Variables: NPastDiff_pref_read, NPastDiff_pref_music, NPastDiff_pref_tv, NPastDiff_pref_nap, NPastDiff_pref_travel
-#            NFutDiff_pref_read, NFutDiff_pref_music, NFutDiff_pref_tv, NFutDiff_pref_nap, NFutDiff_pref_travel
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex)       # For aov_ez (cleaner ANOVA output)
-library(nortest)    # For normality tests
-library(emmeans)    # For post-hoc comparisons
-library(purrr)      # For map functions
-library(effsize)    # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x ")))
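-
-# A cheap integrity check (illustrative addition): each participant should
-# contribute exactly one row of the wide file; duplicated pIDs would silently
-# inflate the long data built below.
-dup_ids <- unique(data$pID[duplicated(data$pID)])
-if (length(dup_ids) > 0) {
-  print(paste("Warning: duplicated pIDs:", paste(dup_ids, collapse = ", ")))
-}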
-print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required preference item variables found!") -} - -# Define item mapping -item_mapping <- data.frame( - variable = c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel"), - time = c(rep("Past", 5), rep("Future", 5)), - item = rep(c("read", "music", "tv", "nap", "travel"), 2), - stringsAsFactors = FALSE -) - -# Item mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(item_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'item' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - ITEM = factor(item, levels = c("read", "music", "tv", "nap", "travel")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and ITEM -desc_stats <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and ITEM:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and ITEM:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption 
testing
-long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ]
-print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x")))
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, ITEM) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(MEAN_DIFFERENCE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME and ITEM:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, ITEM) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test (streamlined)
-normality_results <- long_data_clean %>%
-  group_by(TIME, ITEM) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic,
-    ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# Round only the numeric columns
-normality_results_rounded <- normality_results %>%
-  mutate(across(where(is.numeric), ~ round(.x, 5)))
-print(normality_results_rounded)
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each ITEM
-homogeneity_time <- long_data_clean %>%
-  group_by(ITEM) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each ITEM:")
-print(homogeneity_time)
-
-# Test homogeneity across ITEM within each TIME
-homogeneity_item <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across ITEM within each TIME:")
-print(homogeneity_item)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × ITEM)
-
-# First, let's check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × ITEM combination ===")
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, ITEM) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this TIME × ITEM
combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, ITEM, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × ITEM combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, ITEM, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + ITEM, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of ITEM levels:", 
length(levels(long_data_clean$ITEM))))
-print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO))))
-
-# Check for complete cases
-complete_cases <- long_data_clean[complete.cases(long_data_clean), ]
-print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean)))
-
-# Check if design is balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM)
-
-print(summary(as.vector(design_balance)))
-
-# Check for any participants with missing combinations
-missing_combos <- long_data_clean %>%
-  group_by(pID) %>%
-  summarise(
-    n_combinations = n(),
-    expected_combinations = 10, # 2 TIME × 5 ITEM = 10
-    missing_combinations = 10 - n_combinations,
-    .groups = 'drop'
-  )
-
-print("Missing combinations per participant:")
-print(missing_combos[missing_combos$missing_combinations > 0, ])
-
-# Mixed ANOVA using aov() - Traditional approach
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × ITEM (5 levels: read, music, tv, nap, travel)
-
-mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * ITEM + Error(pID/(TIME * ITEM)),
-                         data = long_data_clean)
-
-print("Mixed ANOVA Results (aov):")
-print(summary(mixed_anova_model))
-
-# Alternative: Using afex::aov_ez for cleaner output (optional)
-print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===")
-mixed_anova_afex <- aov_ez(id = "pID",
-                           dv = "MEAN_DIFFERENCE",
-                           data = long_data_clean,
-                           between = "TEMPORAL_DO",
-                           within = c("TIME", "ITEM"))
-
-print("Mixed ANOVA Results (afex):")
-print(mixed_anova_afex)
-
-# =============================================================================
-# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS
-# =============================================================================
-
-# Sphericity tests using ezANOVA (library already loaded)
-
-print("\n=== SPHERICITY TESTS ===")
-
-# Test sphericity for ITEM (5 levels - within-subjects)
-print("Mauchly's Test of Sphericity for ITEM:")
-tryCatch({
-  # Create a temporary data frame for ezANOVA
-  temp_data <- long_data_clean
-  temp_data$id <- as.numeric(as.factor(temp_data$pID))
-
-  # Run ezANOVA to get sphericity tests
-  ez_item <- ezANOVA(data = temp_data,
-                     dv = MEAN_DIFFERENCE,
-                     wid = id,
-                     between = TEMPORAL_DO,
-                     within = ITEM,
-                     type = 3,
-                     detailed = TRUE)
-
-  print("ITEM Sphericity Test:")
-  print(ez_item$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in ITEM sphericity test:", e$message))
-})
-
-# Test sphericity for TIME (2 levels - within-subjects)
-print("\nMauchly's Test of Sphericity for TIME:")
-tryCatch({
-  ez_time <- ezANOVA(data = temp_data,
-                     dv = MEAN_DIFFERENCE,
-                     wid = id,
-                     between = TEMPORAL_DO,
-                     within = TIME,
-                     type = 3,
-                     detailed = TRUE)
-
-  print("TIME Sphericity Test:")
-  print(ez_time$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in TIME sphericity test:", e$message))
-})
-
-# Test sphericity for TIME × ITEM interaction
-print("\nMauchly's Test of Sphericity for TIME × ITEM Interaction:")
-tryCatch({
-  ez_interaction <- ezANOVA(data = temp_data,
-                            dv = MEAN_DIFFERENCE,
-                            wid = id,
-                            between = TEMPORAL_DO,
-                            within = .(TIME, ITEM),
-                            type = 3,
-                            detailed = TRUE)
-
-  print("TIME × ITEM Sphericity Test:")
-  print(ez_interaction$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in TIME × ITEM sphericity test:", e$message))
-})
-
-# =============================================================================
-# 
CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Get corrected results from ezANOVA
-ez_corrected <- ezANOVA(data = long_data_clean,
-                        dv = MEAN_DIFFERENCE,
-                        wid = pID,
-                        between = TEMPORAL_DO,
-                        within = .(TIME, ITEM),
-                        type = 3,
-                        detailed = TRUE)
-
-print("Corrected ANOVA Results with Sphericity Corrections:")
-print(ez_corrected$ANOVA)
-
-# Show epsilon values for sphericity corrections
-print("\nEpsilon Values for Sphericity Corrections:")
-print(ez_corrected$Mauchly)
-
-# Show sphericity-corrected results
-print("\nSphericity-Corrected Results:")
-print("Available elements in ez_corrected object:")
-print(names(ez_corrected))
-
-# Check if sphericity corrections are available
-if(!is.null(ez_corrected$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(ez_corrected$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  # Get the sphericity corrections
-  sphericity_corr <- ez_corrected$`Sphericity Corrections`
-
-  # Extract original degrees of freedom from ANOVA table
-  anova_table <- ez_corrected$ANOVA
-
-  # Calculate corrected degrees of freedom
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  # Also show the corrected F-values and p-values with degrees of freedom
-  cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n")
-  for(i in 1:nrow(corrected_df)) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original: F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  print("\nNote: Sphericity corrections may not be displayed if sphericity is met")
-  print("Check the Mauchly's test p-values above to determine if corrections are needed")
-}
-
-# =============================================================================
-# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE
-# =============================================================================
-
-print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===")
-
-# Create a wide-format data for car package (library already loaded)
-
-tryCatch({
-  # Convert to wide format for car package
-  wide_data <- long_data_clean %>%
-    select(pID, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>%
-    pivot_wider(names_from = c(TIME, ITEM),
-                values_from = MEAN_DIFFERENCE,
-                names_sep = "_")
-
-  # Create the repeated measures design
-  within_vars <- c("Past_read", "Past_music", "Past_tv", "Past_nap", "Past_travel",
-                   "Future_read", "Future_music", "Future_tv", "Future_nap", "Future_travel")
-
-  # Check if all columns exist
-  missing_cols <- within_vars[!within_vars %in% colnames(wide_data)]
-  if(length(missing_cols) > 0) {
-    print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", ")))
-  } else {
-    # Create the repeated measures design
-    rm_design <- as.matrix(wide_data[, within_vars])
-
-    # Calculate epsilon values
-    # NOTE: epsilon() assumes a helper that returns sphericity epsilons for the
-    # repeated-measures matrix; it may not be available in the installed car
-    # version, in which case the idesign route works (see the sketch below)
-    print("Epsilon Values from car package:")
-    epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser")
-    epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt")
-
-    print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4)))
-    print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4)))
-
-    # Interpretation
-    if(epsilon_gg < 0.75) {
-      print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)")
-    } else if(epsilon_hf > 0.75) {
-      print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)")
-    } else {
-      print("Recommendation: Use Greenhouse-Geisser correction (conservative)")
-    }
-
-    # =============================================================================
-    # MANUAL SPHERICITY CORRECTIONS
-    # =============================================================================
-
-    print("\n=== MANUAL SPHERICITY CORRECTIONS ===")
-
-    # Apply corrections to the original ANOVA results
-    print("Applying Greenhouse-Geisser corrections to ITEM effects:")
-
-    # Pull the uncorrected degrees of freedom from the ezANOVA table rather
-    # than hardcoding them
-    anova_tab <- ez_corrected$ANOVA
-    for(eff in c("ITEM", "TIME:ITEM")) {
-      eff_row <- anova_tab[anova_tab$Effect == eff, ]
-      if(nrow(eff_row) == 1) {
-        print(paste(eff, ": Original df =", eff_row$DFn, ",", eff_row$DFd))
-        print(paste(eff, ": GG corrected df =", round(eff_row$DFn * epsilon_gg, 2), ",", round(eff_row$DFd * epsilon_gg, 2)))
-        print(paste(eff, ": HF corrected df =", round(eff_row$DFn * epsilon_hf, 2), ",", round(eff_row$DFd * epsilon_hf, 2)))
-      }
-    }
-
-    # Note: You would need to recalculate p-values with these corrected dfs
-    print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom")
-    print("The ezANOVA function should handle this automatically, but may not display the corrections")
-  }
-
-}, error = function(e) {
-  print(paste("Error in manual epsilon calculation:", e$message))
-})
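-
-# A sketch of the idesign route mentioned in the NOTE above (illustrative
-# addition; assumes rm_design was built with the 10 Past_/Future_ item
-# columns in the order given). summary() reports Mauchly's test plus the
-# GG and HF epsilons and corrected p-values.
-tryCatch({
-  idata <- expand.grid(ITEM = c("read", "music", "tv", "nap", "travel"),
-                       TIME = c("Past", "Future"))  # row order matches within_vars
-  mlm_fit <- lm(rm_design ~ 1)
-  rm_anova <- car::Anova(mlm_fit, idata = idata, idesign = ~ TIME * ITEM, type = 3)
-  print(summary(rm_anova, multivariate = FALSE))
-}, error = function(e) {
-  print(paste("car::Anova sketch failed:", e$message))
-})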
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-# Effect size calculations (library already loaded)
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Calculate generalized eta squared for the aov model
-print("Effect Sizes from aov() model:")
-tryCatch({
-  # Extract effect sizes from aov model; round only the numeric columns
-  aov_effects <- as.data.frame(eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE))
-  print(mutate(aov_effects, across(where(is.numeric), ~ round(.x, 5))))
-}, error = function(e) {
-  print(paste("Error calculating effect sizes from aov:", e$message))
-})
-
-# Calculate effect sizes for ezANOVA model
-print("\nEffect Sizes from ezANOVA model:")
-tryCatch({
-  # ezANOVA already reports generalized eta squared in its 'ges' column
-  ez_effects <- ez_corrected$ANOVA
-  print("Generalized Eta Squared from ezANOVA:")
-  print(data.frame(Effect = ez_effects$Effect, ges = round(ez_effects$ges, 5)))
-}, error = function(e) {
-  print(paste("Error extracting effect sizes from ezANOVA:", e$message))
-})
-
-# Extract effect sizes (generalized eta squared)
-# For aov() objects, we need to extract from the summary
-anova_summary <- summary(mixed_anova_model)
-
-# =============================================================================
-# NOTE: MIXED MODELS (LMER) NOT NEEDED
-# =============================================================================
-
-# For this balanced repeated measures design, Type III ANOVA with proper
-# sphericity corrections (implemented above) is the most appropriate approach.
-# Mixed models (lmer) are typically used for:
-# - Unbalanced designs
-# - Missing data patterns
-# - Nested random effects
-# - Large, complex datasets
-#
-# This design is balanced and complete, making Type III ANOVA a reasonable default.
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(mixed_anova_model, ~ TIME)
-print("Estimated Marginal Means:")
-print(time_emmeans)
-print("\nPairwise Contrasts:")
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of ITEM
-print("\nMain Effect of ITEM:")
-item_emmeans <- emmeans(mixed_anova_model, ~ ITEM)
-print("Estimated Marginal Means:")
-print(item_emmeans)
-print("\nPairwise Contrasts:")
-item_contrasts <- pairs(item_emmeans, adjust = "bonferroni")
-print(item_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TEMPORAL_DO × TIME Interaction (check significance in the ANOVA table above)
-print("\n=== TEMPORAL_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-print("\nSimple Effects of TIME within each TEMPORAL_DO:")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-
-print("\nSimple Effects of TEMPORAL_DO within each TIME:")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# TIME × ITEM Interaction (check significance in the ANOVA table above)
-print("\n=== TIME × ITEM INTERACTION ===")
-time_item_emmeans <- emmeans(mixed_anova_model, ~ TIME * ITEM)
-print("Estimated Marginal Means:")
-print(time_item_emmeans)
-
-print("\nSimple Effects of ITEM within each TIME:")
-time_item_simple <- pairs(time_item_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_item_simple)
-
-print("\nSimple Effects of TIME within each ITEM:")
-time_item_simple2 <- pairs(time_item_emmeans, by = "ITEM", adjust = "bonferroni")
-print(time_item_simple2)
"bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TEMPORAL_DO × DOMAIN Interaction (Marginal: p = 0.058) -print("\n=== TEMPORAL_DO × DOMAIN INTERACTION ===") -temporal_domain_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * DOMAIN) -print("Estimated Marginal Means:") -print(temporal_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") -temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_domain_simple) - -print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") -temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(temporal_domain_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - 
cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each DOMAIN -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TIME -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. 
TEMPORAL_DO × DOMAIN INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × DOMAIN INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each DOMAIN -temporal_domain_simple2_df <- as.data.frame(temporal_domain_simple2) -calculate_cohens_d_for_pairs(temporal_domain_simple2_df, long_data_clean, "TEMPORAL_DO", "DOMAIN", "MEAN_DIFFERENCE") - -# Get simple effects of DOMAIN within each TEMPORAL_DO -temporal_domain_simple_df <- as.data.frame(temporal_domain_simple) -calculate_cohens_d_for_pairs(temporal_domain_simple_df, long_data_clean, "DOMAIN", "TEMPORAL_DO", "MEAN_DIFFERENCE") - diff --git a/.history/eohi1/mixed anova - preferences_20250916113720.r b/.history/eohi1/mixed anova - preferences_20250916113720.r deleted file mode 100644 index 1b494b0..0000000 --- a/.history/eohi1/mixed anova - preferences_20250916113720.r +++ /dev/null @@ -1,836 +0,0 @@ -# Mixed ANOVA Analysis for Preference Items -# EOHI Experiment Data Analysis - Item Level Analysis -# Variables: NPastDiff_pref_read, NPastDiff_pref_music, NPastDiff_pref_tv, NPastDiff_pref_nap, NPastDiff_pref_travel -# NFutDiff_pref_read, NFutDiff_pref_music, NFutDiff_pref_tv, NFutDiff_pref_nap, NFutDiff_pref_travel - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required preference item variables found!") -} - -# Define item mapping -item_mapping <- data.frame( - variable = c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel"), - time = c(rep("Past", 5), rep("Future", 5)), - item = rep(c("read", "music", "tv", "nap", "travel"), 2), - stringsAsFactors = FALSE -) - -# Item mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(item_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'item' from mapping) - mutate( - TIME = 
factor(time, levels = c("Past", "Future")), - ITEM = factor(item, levels = c("read", "music", "tv", "nap", "travel")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and ITEM -desc_stats <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and ITEM:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and ITEM:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and ITEM:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each ITEM -homogeneity_time <- long_data_clean %>% - group_by(ITEM) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each ITEM:") -print(homogeneity_time) - -# Test homogeneity across ITEM within each TIME -homogeneity_item <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across ITEM within each TIME:") -print(homogeneity_item) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × ITEM) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × ITEM combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, ITEM, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate 
observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × ITEM combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, ITEM, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + ITEM, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of ITEM levels:", length(levels(long_data_clean$ITEM)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 10, # 2 TIME × 5 ITEM = 10 - missing_combinations = 10 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME 
(2 levels: Past, Future) × ITEM (5 levels: read, music, tv, nap, travel) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * ITEM + Error(pID/(TIME * ITEM)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = "MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "ITEM")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for ITEM (5 levels - within-subjects) -print("Mauchly's Test of Sphericity for ITEM:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_item <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = ITEM, - type = 3, - detailed = TRUE) - - print("ITEM Sphericity Test:") - print(ez_item$Mauchly) - -}, error = function(e) { - print(paste("Error in ITEM sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × ITEM interaction -print("\nMauchly's Test of Sphericity for TIME × ITEM Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - - print("TIME × ITEM Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × ITEM sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) - - # Extract and display corrected degrees 
of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - # Get the sphericity corrections - sphericity_corr <- ez_corrected$`Sphericity Corrections` - - # Extract original degrees of freedom from ANOVA table - anova_table <- ez_corrected$ANOVA - - # Calculate corrected degrees of freedom - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Also show the corrected F-values and p-values with degrees of freedom - cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n") - for(i in 1:nrow(corrected_df)) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -}
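# Editor's aside (hedged sketch, not part of the original script): afex can
# print the sphericity-corrected table directly, avoiding the manual df
# bookkeeping above. The correction argument ("GG", "HF", "none") is per the
# afex documentation; mixed_anova_afex was fitted earlier with aov_ez().
print(anova(mixed_anova_afex, correction = "GG"))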
# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, ITEM), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design (columns produced by pivot_wider above) - within_vars <- c("Past_read", "Past_music", "Past_tv", "Past_nap", "Past_travel", - "Future_read", "Future_music", "Future_tv", "Future_nap", "Future_travel") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to ITEM effects:") - - # ITEM main effect (DFn = 4, DFd = 4244) - item_df_corrected_gg <- 4 * epsilon_gg - item_df_corrected_hf <- 4 * epsilon_hf - - print(paste("ITEM: Original df = 4, 4244")) - print(paste("ITEM: GG corrected df =", round(item_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2))) - print(paste("ITEM: HF corrected df =", round(item_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2))) - - # TIME × ITEM interaction (DFn = 4, DFd = 4244) - interaction_df_corrected_gg <- 4 * epsilon_gg - interaction_df_corrected_hf <- 4 * epsilon_hf - - print(paste("TIME × ITEM: Original df = 4, 4244")) - print(paste("TIME × ITEM: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2))) - print(paste("TIME × ITEM: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA reports generalized eta squared (ges) directly in its ANOVA table - ez_effects <- ez_corrected$ANOVA - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach.
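# Editor's aside (hedged sketch, not part of the original script): for
# reference, the mixed-model equivalent would be a random-intercept fit.
# lme4 is assumed to be installed; the model below is illustrative only.
# library(lme4)
# lmer_alternative <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * ITEM + (1 | pID), data = long_data_clean)
# summary(lmer_alternative)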
-# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of ITEM -print("\nMain Effect of ITEM:") -item_emmeans <- emmeans(mixed_anova_model, ~ ITEM) -print("Estimated Marginal Means:") -print(item_emmeans) -print("\nPairwise Contrasts:") -item_contrasts <- pairs(item_emmeans, adjust = "bonferroni") -print(item_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × ITEM Interaction -print("\n=== TIME × ITEM INTERACTION ===") -time_item_emmeans <- emmeans(mixed_anova_model, ~ TIME * ITEM) -print("Estimated Marginal Means:") -print(time_item_emmeans) - -print("\nSimple Effects of ITEM within each TIME:") -time_item_simple <- pairs(time_item_emmeans, by = "TIME", adjust = "bonferroni") -print(time_item_simple) - -print("\nSimple Effects of TIME within each ITEM:") -time_item_simple2 <- pairs(time_item_emmeans, by = "ITEM", adjust = "bonferroni") -print(time_item_simple2) - -# TEMPORAL_DO × ITEM Interaction -print("\n=== TEMPORAL_DO × ITEM INTERACTION ===") -temporal_item_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * ITEM) -print("Estimated Marginal Means:") -print(temporal_item_emmeans) - -print("\nSimple Effects of ITEM within each TEMPORAL_DO:") -temporal_item_simple <- pairs(temporal_item_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_item_simple) - -print("\nSimple Effects of TEMPORAL_DO within each ITEM:") -temporal_item_simple2 <- pairs(temporal_item_emmeans, by = "ITEM", adjust = "bonferroni") -print(temporal_item_simple2)
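# Editor's aside (hedged sketch, not part of the original script): an
# interaction plot often makes the simple effects above easier to read.
# emmip() ships with emmeans, already loaded; CIs = TRUE overlays confidence
# intervals. Behavior on aov()/Error() models is assumed per emmeans support.
print(emmip(mixed_anova_model, TEMPORAL_DO ~ TIME, CIs = TRUE))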
# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * ITEM) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of ITEM within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -}
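# Editor's aside (hedged sketch, not part of the original script): a quick
# cross-check on the helper above. For two independent groups with roughly
# equal variances, a contrast's t.ratio converts to d as t * sqrt(1/n1 + 1/n2).
d_from_t <- function(t_ratio, n1, n2) t_ratio * sqrt(1 / n1 + 1 / n2)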
# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × ITEM INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × ITEM INTERACTION ===") - -# Get simple effects of TIME within each ITEM -time_item_simple2_df <- as.data.frame(time_item_simple2) -calculate_cohens_d_for_pairs(time_item_simple2_df, long_data_clean, "TIME", "ITEM", "MEAN_DIFFERENCE") - -# Get simple effects of ITEM within each TIME -time_item_simple_df <- as.data.frame(time_item_simple) -calculate_cohens_d_for_pairs(time_item_simple_df, long_data_clean, "ITEM", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × ITEM INTERACTION -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × ITEM INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each ITEM -temporal_item_simple2_df <- as.data.frame(temporal_item_simple2) -calculate_cohens_d_for_pairs(temporal_item_simple2_df, long_data_clean, "TEMPORAL_DO", "ITEM", "MEAN_DIFFERENCE") - -# Get simple effects of ITEM within each TEMPORAL_DO -temporal_item_simple_df <- as.data.frame(temporal_item_simple) -calculate_cohens_d_for_pairs(temporal_item_simple_df, long_data_clean, "ITEM", "TEMPORAL_DO", "MEAN_DIFFERENCE") - diff --git a/.history/eohi1/mixed anova - preferences_20250916113743.r b/.history/eohi1/mixed anova - preferences_20250916113743.r deleted file mode 100644 index 9acfb2a..0000000 --- a/.history/eohi1/mixed anova - preferences_20250916113743.r +++ /dev/null @@ -1,836 +0,0 @@ -# Mixed ANOVA Analysis for Preference Items -# EOHI Experiment Data Analysis - Item Level Analysis -# Variables: NPastDiff_pref_read, NPastDiff_pref_music, NPastDiff_pref_tv, NPastDiff_pref_nap, NPastDiff_pref_travel -# NFutDiff_pref_read, NFutDiff_pref_music, NFutDiff_pref_tv, NFutDiff_pref_nap, NFutDiff_pref_travel - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x")))
-print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required preference item variables found!") -} - -# Define item mapping -item_mapping <- data.frame( - variable = c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel"), - time = c(rep("Past", 5), rep("Future", 5)), - item = rep(c("read", "music", "tv", "nap", "travel"), 2), - stringsAsFactors = FALSE -) - -# Item mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(item_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'item' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - ITEM = factor(item, levels = c("read", "music", "tv", "nap", "travel")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and ITEM -desc_stats <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and ITEM:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and ITEM:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption 
testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and ITEM:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each ITEM -homogeneity_time <- long_data_clean %>% - group_by(ITEM) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each ITEM:") -print(homogeneity_time) - -# Test homogeneity across ITEM within each TIME -homogeneity_item <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across ITEM within each TIME:") -print(homogeneity_item) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × ITEM) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × ITEM combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN 
combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, ITEM, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × ITEM combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, ITEM, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + ITEM, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of ITEM levels:", 
length(levels(long_data_clean$ITEM)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 10, # 2 TIME × 5 ITEM = 10 - missing_combinations = 10 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × ITEM (5 levels: read, music, tv, nap, travel) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * ITEM + Error(pID/(TIME * ITEM)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = "MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "ITEM")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for ITEM (5 levels - within-subjects) -print("Mauchly's Test of Sphericity for ITEM:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_item <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = ITEM, - type = 3, - detailed = TRUE) - - print("ITEM Sphericity Test:") - print(ez_item$Mauchly) - -}, error = function(e) { - print(paste("Error in ITEM sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × ITEM interaction -print("\nMauchly's Test of Sphericity for TIME × ITEM Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - - print("TIME × ITEM Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × ITEM sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY 
CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - # Get the sphericity corrections - sphericity_corr <- ez_corrected$`Sphericity Corrections` - - # Extract original degrees of freedom from ANOVA table - anova_table <- ez_corrected$ANOVA - - # Calculate corrected degrees of freedom - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Also show the corrected F-values and p-values with degrees of freedom - cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n") - for(i in 1:nrow(corrected_df)) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, ITEM), - values_from = 
MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design (columns produced by pivot_wider above) - within_vars <- c("Past_read", "Past_music", "Past_tv", "Past_nap", "Past_travel", - "Future_read", "Future_music", "Future_tv", "Future_nap", "Future_travel") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to ITEM effects:") - - # ITEM main effect (DFn = 4, DFd = 4244) - item_df_corrected_gg <- 4 * epsilon_gg - item_df_corrected_hf <- 4 * epsilon_hf - - print(paste("ITEM: Original df = 4, 4244")) - print(paste("ITEM: GG corrected df =", round(item_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2))) - print(paste("ITEM: HF corrected df =", round(item_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2))) - - # TIME × ITEM interaction (DFn = 4, DFd = 4244) - interaction_df_corrected_gg <- 4 * epsilon_gg - interaction_df_corrected_hf <- 4 * epsilon_hf - - print(paste("TIME × ITEM: Original df = 4, 4244")) - print(paste("TIME × ITEM: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2))) - print(paste("TIME × ITEM: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -})
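# Editor's aside (hedged sketch, not part of the original script): the
# corrected p-value the note above describes can be recomputed with base R's
# pf(). f_item is a placeholder; substitute the observed F for ITEM from the
# ANOVA table. Assumes epsilon_gg was computed in the block above.
f_item <- 2.50  # placeholder value, illustrative only
p_item_gg <- pf(f_item, 4 * epsilon_gg, 4244 * epsilon_gg, lower.tail = FALSE)
print(round(p_item_gg, 6))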
model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. -# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of ITEM -print("\nMain Effect of ITEM:") -item_emmeans <- emmeans(mixed_anova_model, ~ ITEM) -print("Estimated Marginal Means:") -print(item_emmeans) -print("\nPairwise Contrasts:") -item_contrasts <- pairs(item_emmeans, adjust = "bonferroni") -print(item_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × ITEM Interaction -print("\n=== TIME × ITEM INTERACTION ===") -time_item_emmeans <- emmeans(mixed_anova_model, ~ TIME * ITEM) -print("Estimated Marginal Means:") -print(time_item_emmeans) - -print("\nSimple Effects of ITEM within each TIME:") -time_item_simple <- pairs(time_item_emmeans, by = "TIME", adjust = "bonferroni") -print(time_item_simple) - -print("\nSimple Effects of TIME within each ITEM:") -time_item_simple2 <- pairs(time_item_emmeans, by = 
"ITEM", adjust = "bonferroni") -print(time_item_simple2) - -# TEMPORAL_DO × ITEM Interaction -print("\n=== TEMPORAL_DO × ITEM INTERACTION ===") -temporal_item_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * ITEM) -print("Estimated Marginal Means:") -print(temporal_item_emmeans) - -print("\nSimple Effects of ITEM within each TEMPORAL_DO:") -temporal_item_simple <- pairs(temporal_item_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_item_simple) - -print("\nSimple Effects of TEMPORAL_DO within each ITEM:") -temporal_item_simple2 <- pairs(temporal_item_emmeans, by = "ITEM", adjust = "bonferroni") -print(temporal_item_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * DOMAIN) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of DOMAIN within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - 
} - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × ITEM INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × ITEM INTERACTION ===") - -# Get simple effects of TIME within each ITEM -time_item_simple2_df <- as.data.frame(time_item_simple2) -calculate_cohens_d_for_pairs(time_item_simple2_df, long_data_clean, "TIME", "ITEM", "MEAN_DIFFERENCE") - -# Get simple effects of ITEM within each TIME -time_item_simple_df <- as.data.frame(time_item_simple) -calculate_cohens_d_for_pairs(time_item_simple_df, long_data_clean, "ITEM", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 3. TEMPORAL_DO × ITEM INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × ITEM INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each ITEM -temporal_item_simple2_df <- as.data.frame(temporal_item_simple2) -calculate_cohens_d_for_pairs(temporal_item_simple2_df, long_data_clean, "TEMPORAL_DO", "ITEM", "MEAN_DIFFERENCE") - -# Get simple effects of ITEM within each TEMPORAL_DO -temporal_item_simple_df <- as.data.frame(temporal_item_simple) -calculate_cohens_d_for_pairs(temporal_item_simple_df, long_data_clean, "ITEM", "TEMPORAL_DO", "MEAN_DIFFERENCE") - diff --git a/.history/eohi1/mixed anova - preferences_20250916113752.r b/.history/eohi1/mixed anova - preferences_20250916113752.r deleted file mode 100644 index 3bed516..0000000 --- a/.history/eohi1/mixed anova - preferences_20250916113752.r +++ /dev/null @@ -1,836 +0,0 @@ -# Mixed ANOVA Analysis for Preference Items -# EOHI Experiment Data Analysis - Item Level Analysis -# Variables: NPastDiff_pref_read, NPastDiff_pref_music, NPastDiff_pref_tv, NPastDiff_pref_nap, NPastDiff_pref_travel -# NFutDiff_pref_read, NFutDiff_pref_music, NFutDiff_pref_tv, NFutDiff_pref_nap, NFutDiff_pref_travel - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -
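-
-# Hedged aside (illustrative, not from the original analysis): the contr.sum
-# option above matters because Type III tests evaluate each effect averaged
-# over the levels of the other factors, which requires sum-to-zero coding.
-# Base R shows the two codings directly:
-contr.sum(2)       # sum-to-zero coding for a 2-level factor: +1 / -1
-contr.treatment(2) # dummy coding: 0 / 1 (Type III SS become baseline-specific)
-
-#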
Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required preference item variables found!") -} - -# Define item mapping -item_mapping <- data.frame( - variable = c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel"), - time = c(rep("Past", 5), rep("Future", 5)), - item = rep(c("read", "music", "tv", "nap", "travel"), 2), - stringsAsFactors = FALSE -) - -# Item mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(item_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'item' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - ITEM = factor(item, levels = c("read", "music", "tv", "nap", "travel")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and ITEM -desc_stats <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and ITEM:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and ITEM:") -print(desc_stats_by_temporal) - - -# 
============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and ITEM:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each ITEM -homogeneity_time <- long_data_clean %>% - group_by(ITEM) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each ITEM:") -print(homogeneity_time) - -# Test homogeneity across ITEM within each TIME -homogeneity_item <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across ITEM within each TIME:") -print(homogeneity_item) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × ITEM) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: 
TEMPORAL_DO within each TIME × ITEM combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × ITEM combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, ITEM, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × ITEM combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, ITEM, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + ITEM, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), 
"rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of ITEM levels:", length(levels(long_data_clean$ITEM)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 10, # 2 TIME × 5 ITEM = 10 - missing_combinations = 10 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × ITEM (5 levels: read, music, tv, nap, travel) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * ITEM + Error(pID/(TIME * ITEM)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = "MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "ITEM")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for ITEM (5 levels - within-subjects) -print("Mauchly's Test of Sphericity for ITEM:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_item <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = ITEM, - type = 3, - detailed = TRUE) - - print("ITEM Sphericity Test:") - print(ez_item$Mauchly) - -}, error = function(e) { - print(paste("Error in ITEM sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × ITEM interaction -print("\nMauchly's Test of Sphericity for TIME × ITEM Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - - print("TIME × ITEM Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - 
print(paste("Error in TIME × ITEM sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - # Get the sphericity corrections - sphericity_corr <- ez_corrected$`Sphericity Corrections` - - # Extract original degrees of freedom from ANOVA table - anova_table <- ez_corrected$ANOVA - - # Calculate corrected degrees of freedom - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Also show the corrected F-values and p-values with degrees of freedom - cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n") - for(i in 1:nrow(corrected_df)) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # 
Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, ITEM), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_read", "Past_music", "Past_tv", "Past_nap", "Past_travel", - "Future_read", "Future_music", "Future_tv", "Future_nap", "Future_travel") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to ITEM effects:") - - # ITEM main effect (DFn = 4, DFd = 4244) - item_df_corrected_gg <- 4 * epsilon_gg - item_df_corrected_hf <- 4 * epsilon_hf - - print(paste("ITEM: Original df = 4, 4244")) - print(paste("ITEM: GG corrected df =", round(item_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2))) - print(paste("ITEM: HF corrected df =", round(item_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2))) - - # TIME × ITEM interaction (DFn = 4, DFd = 4244) - interaction_df_corrected_gg <- 4 * epsilon_gg - interaction_df_corrected_hf <- 4 * epsilon_hf - - print(paste("TIME × ITEM: Original df = 4, 4244")) - print(paste("TIME × ITEM: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2))) - print(paste("TIME × ITEM: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = 
TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. -# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of ITEM -print("\nMain Effect of ITEM:") -item_emmeans <- emmeans(mixed_anova_model, ~ ITEM) -print("Estimated Marginal Means:") -print(item_emmeans) -print("\nPairwise Contrasts:") -item_contrasts <- pairs(item_emmeans, adjust = "bonferroni") -print(item_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × ITEM Interaction -print("\n=== TIME × ITEM INTERACTION ===") -time_item_emmeans <- emmeans(mixed_anova_model, ~ TIME * ITEM) -print("Estimated Marginal Means:") -print(time_item_emmeans) - -print("\nSimple Effects of ITEM within each TIME:") -time_item_simple <- 
pairs(time_item_emmeans, by = "TIME", adjust = "bonferroni") -print(time_item_simple) - -print("\nSimple Effects of TIME within each ITEM:") -time_item_simple2 <- pairs(time_item_emmeans, by = "ITEM", adjust = "bonferroni") -print(time_item_simple2) - -# TEMPORAL_DO × ITEM Interaction -print("\n=== TEMPORAL_DO × ITEM INTERACTION ===") -temporal_item_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * ITEM) -print("Estimated Marginal Means:") -print(temporal_item_emmeans) - -print("\nSimple Effects of ITEM within each TEMPORAL_DO:") -temporal_item_simple <- pairs(temporal_item_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_item_simple) - -print("\nSimple Effects of TEMPORAL_DO within each ITEM:") -temporal_item_simple2 <- pairs(temporal_item_emmeans, by = "ITEM", adjust = "bonferroni") -print(temporal_item_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * ITEM) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of ITEM within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: 
%.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. TIME × ITEM INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × ITEM INTERACTION ===") - -# Get simple effects of TIME within each ITEM -time_item_simple2_df <- as.data.frame(time_item_simple2) -calculate_cohens_d_for_pairs(time_item_simple2_df, long_data_clean, "TIME", "ITEM", "MEAN_DIFFERENCE") - -# Get simple effects of ITEM within each TIME -time_item_simple_df <- as.data.frame(time_item_simple) -calculate_cohens_d_for_pairs(time_item_simple_df, long_data_clean, "ITEM", "TIME", "MEAN_DIFFERENCE") -
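-
-# Hedged self-check (toy vectors, not study data): effsize::cohen.d() returns
-# the estimate, a 95% CI and a magnitude label, which is what the helper
-# above reports for each significant contrast.
-x_demo <- c(1.2, 0.8, 1.5, 1.1)
-y_demo <- c(0.4, 0.9, 0.6, 0.5)
-cohen.d(x_demo, y_demo) # d estimate with magnitude ("negligible" .. "large")
-
-# =============================================================================
-# 3.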
TEMPORAL_DO × ITEM INTERACTION (MARGINAL: p = 0.058) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × ITEM INTERACTION ===") - -# Get simple effects of TEMPORAL_DO within each ITEM -temporal_item_simple2_df <- as.data.frame(temporal_item_simple2) -calculate_cohens_d_for_pairs(temporal_item_simple2_df, long_data_clean, "TEMPORAL_DO", "ITEM", "MEAN_DIFFERENCE") - -# Get simple effects of ITEM within each TEMPORAL_DO -temporal_item_simple_df <- as.data.frame(temporal_item_simple) -calculate_cohens_d_for_pairs(temporal_item_simple_df, long_data_clean, "ITEM", "TEMPORAL_DO", "MEAN_DIFFERENCE") - diff --git a/.history/eohi1/mixed anova - preferences_20250916113803.r b/.history/eohi1/mixed anova - preferences_20250916113803.r deleted file mode 100644 index 3bed516..0000000 --- a/.history/eohi1/mixed anova - preferences_20250916113803.r +++ /dev/null @@ -1,836 +0,0 @@ -# Mixed ANOVA Analysis for Preference Items -# EOHI Experiment Data Analysis - Item Level Analysis -# Variables: NPastDiff_pref_read, NPastDiff_pref_music, NPastDiff_pref_tv, NPastDiff_pref_nap, NPastDiff_pref_travel -# NFutDiff_pref_read, NFutDiff_pref_music, NFutDiff_pref_tv, NFutDiff_pref_nap, NFutDiff_pref_travel - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required preference item variables found!") -} - -# Define item mapping -item_mapping <- data.frame( - variable = c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel"), - time = c(rep("Past", 5), rep("Future", 5)), - item = rep(c("read", "music", "tv", "nap", "travel"), 2), - stringsAsFactors = FALSE -) - -# Item mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(item_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'item' from mapping) - mutate( - TIME = 
factor(time, levels = c("Past", "Future")), - ITEM = factor(item, levels = c("read", "music", "tv", "nap", "travel")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and ITEM -desc_stats <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and ITEM:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and ITEM:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and ITEM:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each ITEM -homogeneity_time <- long_data_clean %>% - group_by(ITEM) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each ITEM:") -print(homogeneity_time) - -# Test homogeneity across ITEM within each TIME -homogeneity_item <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across ITEM within each TIME:") -print(homogeneity_item) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × ITEM) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × ITEM combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × ITEM combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, ITEM, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate 
observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × ITEM combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, ITEM, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + ITEM, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of ITEM levels:", length(levels(long_data_clean$ITEM)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 10, # 2 TIME × 5 ITEM = 10 - missing_combinations = 10 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME 
(2 levels: Past, Future) × ITEM (5 levels: read, music, tv, nap, travel) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * ITEM + Error(pID/(TIME * ITEM)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = "MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "ITEM")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for ITEM (5 levels - within-subjects) -print("Mauchly's Test of Sphericity for ITEM:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_item <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = ITEM, - type = 3, - detailed = TRUE) - - print("ITEM Sphericity Test:") - print(ez_item$Mauchly) - -}, error = function(e) { - print(paste("Error in ITEM sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × ITEM interaction -print("\nMauchly's Test of Sphericity for TIME × ITEM Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - - print("TIME × ITEM Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × ITEM sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) - - # Extract and display corrected degrees 
of freedom
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  # Get the sphericity corrections
-  sphericity_corr <- ez_corrected$`Sphericity Corrections`
-
-  # Extract original degrees of freedom from ANOVA table
-  anova_table <- ez_corrected$ANOVA
-
-  # Calculate corrected degrees of freedom
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  # Also show the corrected F-values and p-values with degrees of freedom
-  cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n")
-  for(i in 1:nrow(corrected_df)) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original:     F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  print("\nNote: Sphericity corrections may not be displayed if sphericity is met")
-  print("Check the Mauchly's test p-values above to determine if corrections are needed")
-}
-
-# =============================================================================
-# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE
-# =============================================================================
-
-print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===")
-
-# Create a wide-format data set for the epsilon calculation (car already loaded)
-
-tryCatch({
-  # Convert to wide format (one column per TIME × ITEM cell)
-  wide_data <- long_data_clean %>%
-    select(pID, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>%
-    pivot_wider(names_from = c(TIME, ITEM),
-                values_from = MEAN_DIFFERENCE,
-                names_sep = "_")
-
-  # Create the repeated measures design
-  # (column names follow pivot_wider's TIME_ITEM pattern for this script's factors)
-  within_vars <- c("Past_read", "Past_music", "Past_tv", "Past_nap", "Past_travel",
-                   "Future_read", "Future_music", "Future_tv", "Future_nap", "Future_travel")
-
-  # Check if all columns exist
-  missing_cols <- within_vars[!within_vars %in% colnames(wide_data)]
-  if(length(missing_cols) > 0) {
-    print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", ")))
-  } else {
-    # Create the repeated measures design
-    rm_design <- as.matrix(wide_data[, within_vars])
-
-    # Calculate epsilon values
-    # NOTE: car has no exported epsilon() function, so the Greenhouse-Geisser
-    # (Box) epsilon is computed directly from the covariance matrix of the
-    # repeated measures; Huynh-Feldt is then derived from it
-    gg_epsilon <- function(X) {
-      S <- cov(X, use = "pairwise.complete.obs")
-      k <- ncol(S)
-      s_bar <- mean(S)          # grand mean of the covariance matrix
-      d_bar <- mean(diag(S))    # mean of the variances
-      r_bar <- rowMeans(S)      # row means
-      (k * (d_bar - s_bar))^2 /
-        ((k - 1) * (sum(S^2) - 2 * k * sum(r_bar^2) + k^2 * s_bar^2))
-    }
-    epsilon_gg <- gg_epsilon(rm_design)
-    N <- nrow(rm_design)
-    g <- length(unique(wide_data$TEMPORAL_DO))
-    k <- ncol(rm_design)
-    # Huynh-Feldt epsilon for a design with g between-subjects groups, capped at 1
-    epsilon_hf <- min(1, (N * (k - 1) * epsilon_gg - 2) /
-                        ((k - 1) * (N - g - (k - 1) * epsilon_gg)))
-
-    print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4)))
-    print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4)))
-
-    # Interpretation (common rule of thumb keyed on the GG estimate)
-    if(epsilon_gg < 0.75) {
-      print("Recommendation: Use Greenhouse-Geisser correction (GG epsilon < 0.75)")
-    } else
if(epsilon_gg > 0.75) {
-      print("Recommendation: Use Huynh-Feldt correction (GG epsilon > 0.75)")
-    } else {
-      print("Recommendation: Use Greenhouse-Geisser correction (conservative default)")
-    }
-
-    # =============================================================================
-    # MANUAL SPHERICITY CORRECTIONS
-    # =============================================================================
-
-    print("\n=== MANUAL SPHERICITY CORRECTIONS ===")
-
-    # Apply corrections to the original ANOVA results
-    print("Applying Greenhouse-Geisser corrections to ITEM effects:")
-
-    # ITEM main effect (DFn = 4, DFd = 4244, taken from the aov output above)
-    item_df_corrected_gg <- 4 * epsilon_gg
-    item_df_corrected_hf <- 4 * epsilon_hf
-
-    print(paste("ITEM: Original df = 4, 4244"))
-    print(paste("ITEM: GG corrected df =", round(item_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2)))
-    print(paste("ITEM: HF corrected df =", round(item_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2)))
-
-    # TIME × ITEM interaction (DFn = 4, DFd = 4244)
-    interaction_df_corrected_gg <- 4 * epsilon_gg
-    interaction_df_corrected_hf <- 4 * epsilon_hf
-
-    print(paste("TIME × ITEM: Original df = 4, 4244"))
-    print(paste("TIME × ITEM: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2)))
-    print(paste("TIME × ITEM: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2)))
-
-    # Note: corrected p-values require re-evaluating the F statistics against the corrected dfs
-    print("\nNote: To get corrected p-values, recalculate the F-tests with the corrected degrees of freedom")
-    print("The ezANOVA output above reports these corrections when sphericity is violated")
-  }
-
-}, error = function(e) {
-  print(paste("Error in manual epsilon calculation:", e$message))
-})
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-# Effect size calculations (library already loaded)
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Calculate generalized eta squared for the aov model
-print("Effect Sizes from aov() model:")
-tryCatch({
-  # Extract effect sizes from the aov model (effectsize::eta_squared;
-  # generalized = TRUE requests generalized rather than partial eta squared)
-  aov_effects <- eta_squared(mixed_anova_model, generalized = TRUE)
-  print(aov_effects)
-}, error = function(e) {
-  print(paste("Error calculating effect sizes from aov:", e$message))
-})
-
-# Calculate effect sizes for ezANOVA model
-print("\nEffect Sizes from ezANOVA model:")
-tryCatch({
-  # ezANOVA already reports generalized eta squared in the "ges" column
-  ez_effects <- ez_corrected$ANOVA
-  ez_effects$ges <- round(ez_effects$ges, 5)
-  print("Generalized Eta Squared from ezANOVA:")
-  print(ez_effects[, c("Effect", "ges")])
-}, error = function(e) {
-  print(paste("Error extracting effect sizes from ezANOVA:", e$message))
-})
-
-# Extract effect sizes (generalized eta squared)
-# For aov() objects, we need to extract from the summary
-anova_summary <- summary(mixed_anova_model)
-
-# =============================================================================
-# NOTE: MIXED MODELS (LMER) NOT NEEDED
-# =============================================================================
-
-# For this balanced repeated measures design, Type III ANOVA with proper
-# sphericity corrections (implemented above) is the most appropriate approach.
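# For comparison only, a minimal lme4 sketch of the same design (hedged: a
# random intercept per participant is one reasonable structure, not part of
# the original analysis, and lme4 is not loaded with the libraries above):
library(lme4)
lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * ITEM + (1 | pID),
                   data = long_data_clean)
print(summary(lmer_model))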
-# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of ITEM -print("\nMain Effect of ITEM:") -item_emmeans <- emmeans(mixed_anova_model, ~ ITEM) -print("Estimated Marginal Means:") -print(item_emmeans) -print("\nPairwise Contrasts:") -item_contrasts <- pairs(item_emmeans, adjust = "bonferroni") -print(item_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × ITEM Interaction -print("\n=== TIME × ITEM INTERACTION ===") -time_item_emmeans <- emmeans(mixed_anova_model, ~ TIME * ITEM) -print("Estimated Marginal Means:") -print(time_item_emmeans) - -print("\nSimple Effects of ITEM within each TIME:") -time_item_simple <- pairs(time_item_emmeans, by = "TIME", adjust = "bonferroni") -print(time_item_simple) - -print("\nSimple Effects of TIME within each ITEM:") -time_item_simple2 <- pairs(time_item_emmeans, by = "ITEM", adjust = "bonferroni") -print(time_item_simple2) - -# TEMPORAL_DO × ITEM Interaction -print("\n=== TEMPORAL_DO × ITEM INTERACTION ===") -temporal_item_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * ITEM) -print("Estimated Marginal Means:") -print(temporal_item_emmeans) - -print("\nSimple Effects of ITEM within each TEMPORAL_DO:") -temporal_item_simple <- pairs(temporal_item_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_item_simple) - -print("\nSimple Effects of TEMPORAL_DO within each ITEM:") -temporal_item_simple2 <- pairs(temporal_item_emmeans, by = "ITEM", adjust = "bonferroni") -print(temporal_item_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY 
INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * ITEM) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of ITEM within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001) -# ============================================================================= - -print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===") - -# Get simple effects of TIME within each TEMPORAL_DO -temporal_time_simple_df <- as.data.frame(temporal_time_simple) -calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE") - -# Get simple effects of TEMPORAL_DO within each TIME -temporal_time_simple2_df <- as.data.frame(temporal_time_simple2) -calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE") - -# ============================================================================= -# 2. 
TIME × ITEM INTERACTION (SIGNIFICANT: p = 0.012)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TIME × ITEM INTERACTION ===")
-
-# Get simple effects of TIME within each ITEM
-time_item_simple2_df <- as.data.frame(time_item_simple2)
-calculate_cohens_d_for_pairs(time_item_simple2_df, long_data_clean, "TIME", "ITEM", "MEAN_DIFFERENCE")
-
-# Get simple effects of ITEM within each TIME
-time_item_simple_df <- as.data.frame(time_item_simple)
-calculate_cohens_d_for_pairs(time_item_simple_df, long_data_clean, "ITEM", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 3. TEMPORAL_DO × ITEM INTERACTION (MARGINAL: p = 0.058)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × ITEM INTERACTION ===")
-
-# Get simple effects of TEMPORAL_DO within each ITEM
-temporal_item_simple2_df <- as.data.frame(temporal_item_simple2)
-calculate_cohens_d_for_pairs(temporal_item_simple2_df, long_data_clean, "TEMPORAL_DO", "ITEM", "MEAN_DIFFERENCE")
-
-# Get simple effects of ITEM within each TEMPORAL_DO
-temporal_item_simple_df <- as.data.frame(temporal_item_simple)
-calculate_cohens_d_for_pairs(temporal_item_simple_df, long_data_clean, "ITEM", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
diff --git a/.history/eohi1/mixed anova - preferences_20250916113806.r b/.history/eohi1/mixed anova - preferences_20250916113806.r
deleted file mode 100644
index 3bed516..0000000
--- a/.history/eohi1/mixed anova - preferences_20250916113806.r
+++ /dev/null
@@ -1,836 +0,0 @@
-# Mixed ANOVA Analysis for Preference Items
-# EOHI Experiment Data Analysis - Item Level Analysis
-# Variables: NPastDiff_pref_read, NPastDiff_pref_music, NPastDiff_pref_tv, NPastDiff_pref_nap, NPastDiff_pref_travel
-# NFutDiff_pref_read, NFutDiff_pref_music, NFutDiff_pref_tv, NFutDiff_pref_nap, NFutDiff_pref_travel
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex)       # For aov_ez (cleaner ANOVA output)
-library(nortest)    # For normality tests
-library(emmeans)    # For post-hoc comparisons
-library(purrr)      # For map functions
-library(effsize)    # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x")))
-print(paste("Number of participants:", length(unique(data$pID))))
-
-# Verify the specific variables we need
-required_vars <- c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel",
-                   "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required preference item variables found!")
-}
-
-# Define item mapping
-item_mapping <- data.frame(
-  variable = c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv",
"NPastDiff_pref_nap", "NPastDiff_pref_travel", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel"), - time = c(rep("Past", 5), rep("Future", 5)), - item = rep(c("read", "music", "tv", "nap", "travel"), 2), - stringsAsFactors = FALSE -) - -# Item mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(item_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'item' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - ITEM = factor(item, levels = c("read", "music", "tv", "nap", "travel")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and ITEM -desc_stats <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and ITEM:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and ITEM:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and ITEM:") -print(missing_summary) - -# 2. 
Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, ITEM) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test (streamlined)
-normality_results <- long_data_clean %>%
-  group_by(TIME, ITEM) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic,
-    ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# Round only the numeric columns
-normality_results_rounded <- normality_results %>%
-  mutate(across(where(is.numeric), ~ round(.x, 5)))
-print(normality_results_rounded)
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each ITEM
-homogeneity_time <- long_data_clean %>%
-  group_by(ITEM) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each ITEM:")
-print(homogeneity_time)
-
-# Test homogeneity across ITEM within each TIME
-homogeneity_item <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across ITEM within each TIME:")
-print(homogeneity_item)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × ITEM)
-
-# First, let's check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × ITEM combination ===")
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, ITEM) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this TIME × ITEM combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # Calculate F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, ITEM, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# More efficient bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var,
response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × ITEM combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, ITEM, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + ITEM, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of ITEM levels:", length(levels(long_data_clean$ITEM)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM) - -print(summary(as.vector(design_balance))) - -# Check for 
any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 10, # 2 TIME × 5 ITEM = 10 - missing_combinations = 10 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × ITEM (5 levels: read, music, tv, nap, travel) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * ITEM + Error(pID/(TIME * ITEM)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = "MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "ITEM")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for ITEM (5 levels - within-subjects) -print("Mauchly's Test of Sphericity for ITEM:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_item <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = ITEM, - type = 3, - detailed = TRUE) - - print("ITEM Sphericity Test:") - print(ez_item$Mauchly) - -}, error = function(e) { - print(paste("Error in ITEM sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × ITEM interaction -print("\nMauchly's Test of Sphericity for TIME × ITEM Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - - print("TIME × ITEM Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × ITEM sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity 
corrections
-print("\nEpsilon Values for Sphericity Corrections:")
-print(ez_corrected$Mauchly)
-
-# Show sphericity-corrected results
-print("\nSphericity-Corrected Results:")
-print("Available elements in ez_corrected object:")
-print(names(ez_corrected))
-
-# Check if sphericity corrections are available
-if(!is.null(ez_corrected$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(ez_corrected$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  # Get the sphericity corrections
-  sphericity_corr <- ez_corrected$`Sphericity Corrections`
-
-  # Extract original degrees of freedom from ANOVA table
-  anova_table <- ez_corrected$ANOVA
-
-  # Calculate corrected degrees of freedom
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  # Also show the corrected F-values and p-values with degrees of freedom
-  cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n")
-  for(i in 1:nrow(corrected_df)) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original:     F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  print("\nNote: Sphericity corrections may not be displayed if sphericity is met")
-  print("Check the Mauchly's test p-values above to determine if corrections are needed")
-}
-
-# =============================================================================
-# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE
-# =============================================================================
-
-print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===")
-
-# Create a wide-format data set for the epsilon calculation (car already loaded)
-
-tryCatch({
-  # Convert to wide format (one column per TIME × ITEM cell)
-  wide_data <- long_data_clean %>%
-    select(pID, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>%
-    pivot_wider(names_from = c(TIME, ITEM),
-                values_from = MEAN_DIFFERENCE,
-                names_sep = "_")
-
-  # Create the repeated measures design
-  # (column names follow pivot_wider's TIME_ITEM pattern for this script's factors)
-  within_vars <- c("Past_read", "Past_music", "Past_tv", "Past_nap", "Past_travel",
-                   "Future_read", "Future_music", "Future_tv", "Future_nap", "Future_travel")
-
-  # Check if all columns exist
-  missing_cols <- within_vars[!within_vars %in% colnames(wide_data)]
-  if(length(missing_cols) > 0) {
-    print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", ")))
-  } else {
-    # Create the repeated measures
design
-    rm_design <- as.matrix(wide_data[, within_vars])
-
-    # Calculate epsilon values
-    # NOTE: car has no exported epsilon() function, so the Greenhouse-Geisser
-    # (Box) epsilon is computed directly from the covariance matrix of the
-    # repeated measures; Huynh-Feldt is then derived from it
-    gg_epsilon <- function(X) {
-      S <- cov(X, use = "pairwise.complete.obs")
-      k <- ncol(S)
-      s_bar <- mean(S)          # grand mean of the covariance matrix
-      d_bar <- mean(diag(S))    # mean of the variances
-      r_bar <- rowMeans(S)      # row means
-      (k * (d_bar - s_bar))^2 /
-        ((k - 1) * (sum(S^2) - 2 * k * sum(r_bar^2) + k^2 * s_bar^2))
-    }
-    epsilon_gg <- gg_epsilon(rm_design)
-    N <- nrow(rm_design)
-    g <- length(unique(wide_data$TEMPORAL_DO))
-    k <- ncol(rm_design)
-    # Huynh-Feldt epsilon for a design with g between-subjects groups, capped at 1
-    epsilon_hf <- min(1, (N * (k - 1) * epsilon_gg - 2) /
-                        ((k - 1) * (N - g - (k - 1) * epsilon_gg)))
-
-    print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4)))
-    print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4)))
-
-    # Interpretation (common rule of thumb keyed on the GG estimate)
-    if(epsilon_gg < 0.75) {
-      print("Recommendation: Use Greenhouse-Geisser correction (GG epsilon < 0.75)")
-    } else if(epsilon_gg > 0.75) {
-      print("Recommendation: Use Huynh-Feldt correction (GG epsilon > 0.75)")
-    } else {
-      print("Recommendation: Use Greenhouse-Geisser correction (conservative default)")
-    }
-
-    # =============================================================================
-    # MANUAL SPHERICITY CORRECTIONS
-    # =============================================================================
-
-    print("\n=== MANUAL SPHERICITY CORRECTIONS ===")
-
-    # Apply corrections to the original ANOVA results
-    print("Applying Greenhouse-Geisser corrections to ITEM effects:")
-
-    # ITEM main effect (DFn = 4, DFd = 4244, taken from the aov output above)
-    item_df_corrected_gg <- 4 * epsilon_gg
-    item_df_corrected_hf <- 4 * epsilon_hf
-
-    print(paste("ITEM: Original df = 4, 4244"))
-    print(paste("ITEM: GG corrected df =", round(item_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2)))
-    print(paste("ITEM: HF corrected df =", round(item_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2)))
-
-    # TIME × ITEM interaction (DFn = 4, DFd = 4244)
-    interaction_df_corrected_gg <- 4 * epsilon_gg
-    interaction_df_corrected_hf <- 4 * epsilon_hf
-
-    print(paste("TIME × ITEM: Original df = 4, 4244"))
-    print(paste("TIME × ITEM: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2)))
-    print(paste("TIME × ITEM: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2)))
-
-    # Note: corrected p-values require re-evaluating the F statistics against the corrected dfs
-    print("\nNote: To get corrected p-values, recalculate the F-tests with the corrected degrees of freedom")
-    print("The ezANOVA output above reports these corrections when sphericity is violated")
-  }
-
-}, error = function(e) {
-  print(paste("Error in manual epsilon calculation:", e$message))
-})
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-# Effect size calculations (library already loaded)
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Calculate generalized eta squared for the aov model
-print("Effect Sizes from aov() model:")
-tryCatch({
-  # Extract effect sizes from the aov model (effectsize::eta_squared;
-  # generalized = TRUE requests generalized rather than partial eta squared)
-  aov_effects <- eta_squared(mixed_anova_model, generalized = TRUE)
-  print(aov_effects)
-}, error = function(e) {
-  print(paste("Error calculating effect sizes from aov:", e$message))
-})
-
-# Calculate effect sizes for ezANOVA model
-print("\nEffect Sizes from ezANOVA model:")
-tryCatch({
-  # ezANOVA already reports generalized eta squared in the "ges" column
-  ez_effects <- ez_corrected$ANOVA
-  ez_effects$ges <- round(ez_effects$ges, 5)
-  print("Generalized Eta Squared from ezANOVA:")
-  print(ez_effects[, c("Effect", "ges")])
-}, error = function(e) {
-  print(paste("Error extracting effect sizes from ezANOVA:", e$message))
-})
-
-# Extract effect sizes (generalized eta squared)
-# For
aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. -# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of ITEM -print("\nMain Effect of ITEM:") -item_emmeans <- emmeans(mixed_anova_model, ~ ITEM) -print("Estimated Marginal Means:") -print(item_emmeans) -print("\nPairwise Contrasts:") -item_contrasts <- pairs(item_emmeans, adjust = "bonferroni") -print(item_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - -# TEMPORAL_DO × TIME Interaction (Significant: p = 0.001) -print("\n=== TEMPORAL_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each TEMPORAL_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of TEMPORAL_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# TIME × ITEM Interaction -print("\n=== TIME × ITEM INTERACTION ===") -time_item_emmeans <- emmeans(mixed_anova_model, ~ TIME * ITEM) -print("Estimated Marginal Means:") -print(time_item_emmeans) - -print("\nSimple Effects of ITEM within each TIME:") -time_item_simple <- pairs(time_item_emmeans, by = "TIME", adjust = "bonferroni") -print(time_item_simple) - -print("\nSimple Effects of TIME within each ITEM:") -time_item_simple2 <- pairs(time_item_emmeans, by = "ITEM", adjust = "bonferroni") -print(time_item_simple2) - -# TEMPORAL_DO × ITEM Interaction -print("\n=== TEMPORAL_DO × ITEM INTERACTION ===") -temporal_item_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * ITEM) -print("Estimated Marginal Means:") -print(temporal_item_emmeans) - -print("\nSimple Effects of ITEM within each TEMPORAL_DO:") -temporal_item_simple <- pairs(temporal_item_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_item_simple) - -print("\nSimple Effects of TEMPORAL_DO 
within each ITEM:") -temporal_item_simple2 <- pairs(temporal_item_emmeans, by = "ITEM", adjust = "bonferroni") -print(temporal_item_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * ITEM) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of ITEM within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# ============================================================================= -# 1. 
TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===")
-
-# Get simple effects of TIME within each TEMPORAL_DO
-temporal_time_simple_df <- as.data.frame(temporal_time_simple)
-calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
-# Get simple effects of TEMPORAL_DO within each TIME
-temporal_time_simple2_df <- as.data.frame(temporal_time_simple2)
-calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 2. TIME × ITEM INTERACTION (SIGNIFICANT: p = 0.012)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TIME × ITEM INTERACTION ===")
-
-# Get simple effects of TIME within each ITEM
-time_item_simple2_df <- as.data.frame(time_item_simple2)
-calculate_cohens_d_for_pairs(time_item_simple2_df, long_data_clean, "TIME", "ITEM", "MEAN_DIFFERENCE")
-
-# Get simple effects of ITEM within each TIME
-time_item_simple_df <- as.data.frame(time_item_simple)
-calculate_cohens_d_for_pairs(time_item_simple_df, long_data_clean, "ITEM", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 3. TEMPORAL_DO × ITEM INTERACTION (MARGINAL: p = 0.058)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × ITEM INTERACTION ===")
-
-# Get simple effects of TEMPORAL_DO within each ITEM
-temporal_item_simple2_df <- as.data.frame(temporal_item_simple2)
-calculate_cohens_d_for_pairs(temporal_item_simple2_df, long_data_clean, "TEMPORAL_DO", "ITEM", "MEAN_DIFFERENCE")
-
-# Get simple effects of ITEM within each TEMPORAL_DO
-temporal_item_simple_df <- as.data.frame(temporal_item_simple)
-calculate_cohens_d_for_pairs(temporal_item_simple_df, long_data_clean, "ITEM", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
diff --git a/.history/eohi1/mixed anova - preferences_20250916120325.r b/.history/eohi1/mixed anova - preferences_20250916120325.r
deleted file mode 100644
index 63ad1f1..0000000
--- a/.history/eohi1/mixed anova - preferences_20250916120325.r
+++ /dev/null
@@ -1,836 +0,0 @@
-# Mixed ANOVA Analysis for Preference Items
-# EOHI Experiment Data Analysis - Item Level Analysis
-# Variables: NPastDiff_pref_read, NPastDiff_pref_music, NPastDiff_pref_tv, NPastDiff_pref_nap, NPastDiff_pref_travel
-# NFutDiff_pref_read, NFutDiff_pref_music, NFutDiff_pref_tv, NFutDiff_pref_nap, NFutDiff_pref_travel
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex)       # For aov_ez (cleaner ANOVA output)
-library(nortest)    # For normality tests
-library(emmeans)    # For post-hoc comparisons
-library(purrr)      # For map functions
-library(effsize)    # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x")))
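# Aside: a minimal sketch (not in the original script) showing that the ten
# variable names verified below follow a regular pattern and could be built
# programmatically; assumes the same item labels and prefixes used throughout:
items_sketch <- c("read", "music", "tv", "nap", "travel")
required_vars_generated <- c(paste0("NPastDiff_pref_", items_sketch),
                             paste0("NFutDiff_pref_", items_sketch))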
-print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required preference item variables found!") -} - -# Define item mapping -item_mapping <- data.frame( - variable = c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel"), - time = c(rep("Past", 5), rep("Future", 5)), - item = rep(c("read", "music", "tv", "nap", "travel"), 2), - stringsAsFactors = FALSE -) - -# Item mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(item_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'item' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - ITEM = factor(item, levels = c("read", "music", "tv", "nap", "travel")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and ITEM -desc_stats <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and ITEM:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and ITEM:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption 
testing
-long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ]
-print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x")))
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, ITEM) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(MEAN_DIFFERENCE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME and ITEM:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, ITEM) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test (streamlined)
-normality_results <- long_data_clean %>%
-  group_by(TIME, ITEM) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic,
-    ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# Round only the numeric columns
-normality_results_rounded <- normality_results %>%
-  mutate(across(where(is.numeric), ~ round(.x, 5)))
-print(normality_results_rounded)
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each ITEM
-homogeneity_time <- long_data_clean %>%
-  group_by(ITEM) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each ITEM:")
-print(homogeneity_time)
-
-# Test homogeneity across ITEM within each TIME
-homogeneity_item <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across ITEM within each TIME:")
-print(homogeneity_item)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# =============================================================================
-# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA
-# =============================================================================
-
-# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO)
-# within each combination of within-subjects factors (TIME × ITEM)
-
-# First, let's check what values TEMPORAL_DO actually has
-print("=== CHECKING TEMPORAL_DO VALUES ===")
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("TEMPORAL_DO value counts:")
-print(table(long_data_clean$TEMPORAL_DO))
-
-print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × ITEM combination ===")
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, ITEM) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this TIME × ITEM
combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, ITEM, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × ITEM combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, ITEM, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + ITEM, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of ITEM levels:", 
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check for missing data patterns
-table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM, useNA = "ifany")
-
-# Check data balance
-xtabs(~ pID + TIME + ITEM, data = long_data_clean)
-
-# Check data dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$pID))))
-print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME))))
-print(paste("Number of ITEM levels:", length(levels(long_data_clean$ITEM))))
-print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO))))
-
-# Check for complete cases
-complete_cases <- long_data_clean[complete.cases(long_data_clean), ]
-print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean)))
-
-# Check whether the design is balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM)
-print(summary(as.vector(design_balance)))
-
-# Check for participants with missing cells
-missing_combos <- long_data_clean %>%
-  group_by(pID) %>%
-  summarise(
-    n_combinations = n(),
-    expected_combinations = 10,  # 2 TIME × 5 ITEM = 10
-    missing_combinations = 10 - n_combinations,
-    .groups = 'drop'
-  )
-
-print("Missing combinations per participant:")
-print(missing_combos[missing_combos$missing_combinations > 0, ])
-
-# Mixed ANOVA using aov() - traditional approach
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × ITEM (5 levels: read, music, tv, nap, travel)
-mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * ITEM + Error(pID / (TIME * ITEM)),
-                         data = long_data_clean)
-
-print("Mixed ANOVA Results (aov):")
-print(summary(mixed_anova_model))
-
-# Alternative: afex::aov_ez for cleaner output (optional)
-print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===")
-mixed_anova_afex <- aov_ez(id = "pID",
-                           dv = "MEAN_DIFFERENCE",
-                           data = long_data_clean,
-                           between = "TEMPORAL_DO",
-                           within = c("TIME", "ITEM"))
-
-print("Mixed ANOVA Results (afex):")
-print(mixed_anova_afex)
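# Illustrative sketch, not part of the deleted snapshot: the afex object can
# also report sphericity-corrected tests directly, which cross-checks the
# ezANOVA output below. The correction argument is from afex as I recall it
# documented; treat this as a sketch rather than output verified on this data.
summary(mixed_anova_afex)                   # Mauchly tests plus GG/HF-corrected p-values
anova(mixed_anova_afex, correction = "GG")  # ANOVA table under Greenhouse-Geisser correction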
-# =============================================================================
-# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS
-# =============================================================================
-
-print("\n=== SPHERICITY TESTS ===")
-
-# Test sphericity for ITEM (5 levels - within-subjects)
-print("Mauchly's Test of Sphericity for ITEM:")
-tryCatch({
-  # Temporary data frame with a numeric id column for ezANOVA
-  temp_data <- long_data_clean
-  temp_data$id <- as.numeric(as.factor(temp_data$pID))
-
-  ez_item <- ezANOVA(data = temp_data,
-                     dv = MEAN_DIFFERENCE,
-                     wid = id,
-                     between = TEMPORAL_DO,
-                     within = ITEM,
-                     type = 3,
-                     detailed = TRUE)
-
-  print("ITEM Sphericity Test:")
-  print(ez_item$Mauchly)
-}, error = function(e) {
-  print(paste("Error in ITEM sphericity test:", e$message))
-})
-
-# Test sphericity for TIME (2 levels - within-subjects)
-# With only 2 levels sphericity holds trivially, so no Mauchly table is expected here
-print("\nMauchly's Test of Sphericity for TIME:")
-tryCatch({
-  ez_time <- ezANOVA(data = temp_data,
-                     dv = MEAN_DIFFERENCE,
-                     wid = id,
-                     between = TEMPORAL_DO,
-                     within = TIME,
-                     type = 3,
-                     detailed = TRUE)
-
-  print("TIME Sphericity Test:")
-  print(ez_time$Mauchly)
-}, error = function(e) {
-  print(paste("Error in TIME sphericity test:", e$message))
-})
-
-# Test sphericity for the TIME × ITEM interaction
-print("\nMauchly's Test of Sphericity for TIME × ITEM Interaction:")
-tryCatch({
-  ez_interaction <- ezANOVA(data = temp_data,
-                            dv = MEAN_DIFFERENCE,
-                            wid = id,
-                            between = TEMPORAL_DO,
-                            within = .(TIME, ITEM),
-                            type = 3,
-                            detailed = TRUE)
-
-  print("TIME × ITEM Sphericity Test:")
-  print(ez_interaction$Mauchly)
-}, error = function(e) {
-  print(paste("Error in TIME × ITEM sphericity test:", e$message))
-})
-
-# =============================================================================
-# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===")
-
-ez_corrected <- ezANOVA(data = long_data_clean,
-                        dv = MEAN_DIFFERENCE,
-                        wid = pID,
-                        between = TEMPORAL_DO,
-                        within = .(TIME, ITEM),
-                        type = 3,
-                        detailed = TRUE)
-
-print("Corrected ANOVA Results with Sphericity Corrections:")
-print(ez_corrected$ANOVA)
-
-# Mauchly's tests (W and p); the epsilon values themselves are in the
-# `Sphericity Corrections` element used below
-print("\nMauchly's Tests of Sphericity:")
-print(ez_corrected$Mauchly)
-
-print("\nSphericity-Corrected Results:")
-print("Available elements in ez_corrected object:")
-print(names(ez_corrected))
-
-# Check whether sphericity corrections are available
-if (!is.null(ez_corrected$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(ez_corrected$`Sphericity Corrections`)
-
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  sphericity_corr <- ez_corrected$`Sphericity Corrections`
-  anova_table <- ez_corrected$ANOVA
-
-  # Corrected df = original df scaled by epsilon
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  # Corrected F-tests with their degrees of freedom
-  cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n")
-  for (i in 1:nrow(corrected_df)) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original:     F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  print("\nNote: Sphericity corrections may not be displayed if sphericity is met")
-  print("Check the Mauchly's test p-values above to determine if corrections are needed")
-}
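# Illustrative sketch, not part of the deleted snapshot: a corrected p-value is
# just the upper tail of the F distribution at epsilon-scaled degrees of
# freedom, so it can be recomputed by hand with base R's pf().
p_corrected <- function(f_value, df_n, df_d, eps) {
  pf(f_value, df_n * eps, df_d * eps, lower.tail = FALSE)
}
p_corrected(3.1, 4, 4244, 0.85)  # made-up F and epsilon, for illustration only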
-# =============================================================================
-# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE
-# =============================================================================
-
-print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===")
-
-tryCatch({
-  # Convert to wide format: one column per TIME × ITEM cell
-  wide_data <- long_data_clean %>%
-    select(pID, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>%
-    pivot_wider(names_from = c(TIME, ITEM),
-                values_from = MEAN_DIFFERENCE,
-                names_sep = "_")
-
-  # Repeated-measures columns as produced by the pivot above (TIME_ITEM);
-  # the domain-level names carried over from the earlier script did not match
-  # this item-level analysis and always triggered the missing-column branch below
-  within_vars <- c("Past_read", "Past_music", "Past_tv", "Past_nap", "Past_travel",
-                   "Future_read", "Future_music", "Future_tv", "Future_nap", "Future_travel")
-
-  # Check that all columns exist
-  missing_cols <- within_vars[!within_vars %in% colnames(wide_data)]
-  if (length(missing_cols) > 0) {
-    print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", ")))
-  } else {
-    # Repeated-measures design matrix
-    rm_design <- as.matrix(wide_data[, within_vars])
-
-    # NB: car does not appear to export an epsilon() helper; if these calls
-    # error, the tryCatch reports it (see the manual computation after this block)
-    print("Epsilon Values:")
-    epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser")
-    epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt")
-
-    print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4)))
-    print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4)))
-
-    # Interpretation
-    if (epsilon_gg < 0.75) {
-      print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)")
-    } else if (epsilon_hf > 0.75) {
-      print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)")
-    } else {
-      print("Recommendation: Use Greenhouse-Geisser correction (conservative)")
-    }
-
-    # ===========================================================================
-    # MANUAL SPHERICITY CORRECTIONS
-    # ===========================================================================
-
-    print("\n=== MANUAL SPHERICITY CORRECTIONS ===")
-
-    print("Applying Greenhouse-Geisser corrections to ITEM effects:")
-
-    # ITEM main effect (DFn = 4, DFd = 4244)
-    item_df_corrected_gg <- 4 * epsilon_gg
-    item_df_corrected_hf <- 4 * epsilon_hf
-
-    print(paste("ITEM: Original df = 4, 4244"))
-    print(paste("ITEM: GG corrected df =", round(item_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2)))
-    print(paste("ITEM: HF corrected df =", round(item_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2)))
-
-    # TIME × ITEM interaction (DFn = 4, DFd = 4244)
-    interaction_df_corrected_gg <- 4 * epsilon_gg
-    interaction_df_corrected_hf <- 4 * epsilon_hf
-
-    print(paste("TIME × ITEM: Original df = 4, 4244"))
-    print(paste("TIME × ITEM: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2)))
-    print(paste("TIME × ITEM: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2)))
-
-    # To get corrected p-values, recalculate the F-tests with these corrected
-    # degrees of freedom; ezANOVA handles this automatically above
-  }
-}, error = function(e) {
-  print(paste("Error in manual epsilon calculation:", e$message))
-})
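# Illustrative sketch, not part of the deleted snapshot: Greenhouse-Geisser
# (Box) epsilon computed by hand from the covariance matrix of the repeated
# measures, plus the standard one-group Huynh-Feldt adjustment.
gg_epsilon <- function(X) {
  S <- cov(X, use = "pairwise.complete.obs")
  k <- ncol(S)
  # Double-center the covariance matrix
  D <- S - outer(rowMeans(S), rep(1, k)) - outer(rep(1, k), colMeans(S)) + mean(S)
  sum(diag(D))^2 / ((k - 1) * sum(D^2))
}
hf_epsilon <- function(X) {
  n <- nrow(X); k <- ncol(X); e <- gg_epsilon(X)
  # One-group Huynh-Feldt formula, capped at 1
  min(1, (n * (k - 1) * e - 2) / ((k - 1) * (n - 1 - (k - 1) * e)))
}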
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Generalized eta squared for the aov model
-print("Effect Sizes from aov() model:")
-tryCatch({
-  aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE)
-  # round() fails on the character Effect column, so round at print time
-  print(aov_effects, digits = 5)
-}, error = function(e) {
-  print(paste("Error calculating effect sizes from aov:", e$message))
-})
-
-# Effect sizes from the ezANOVA model
-print("\nEffect Sizes from ezANOVA model:")
-tryCatch({
-  # ezANOVA already reports generalized eta squared in the 'ges' column
-  ez_effects <- ez_corrected$ANOVA
-  ez_effects$ges <- round(ez_effects$ges, 5)
-  print("Generalized Eta Squared from ezANOVA:")
-  print(ez_effects[, c("Effect", "ges")])
-}, error = function(e) {
-  print(paste("Error extracting effect sizes from ezANOVA:", e$message))
-})
-
-# Keep the aov summary around for reference when reporting effect sizes
-anova_summary <- summary(mixed_anova_model)
-
-# =============================================================================
-# NOTE: MIXED MODELS (LMER) NOT NEEDED
-# =============================================================================
-
-# For this balanced repeated measures design, Type III ANOVA with proper
-# sphericity corrections (implemented above) is the most appropriate approach.
-# Mixed models (lmer) are typically used for:
-# - Unbalanced designs
-# - Missing data patterns
-# - Nested random effects
-# - Large, complex datasets
-#
-# This design is balanced and complete, so Type III ANOVA is the better fit here.
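# Illustrative sketch, not part of the deleted snapshot: should the design ever
# become unbalanced, the same model can be fit as a mixed model instead.
# lme4/lmerTest are not loaded above, so this stays an optional path.
library(lmerTest)  # loads lme4 and adds Satterthwaite F-tests
fit_lmm <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * ITEM + (1 | pID),
                data = long_data_clean)
anova(fit_lmm)  # F-tests with Satterthwaite degrees of freedom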
"ITEM", adjust = "bonferroni") -print(time_item_simple2) - -# TEMPORAL_DO × ITEM Interaction -print("\n=== TEMPORAL_DO × ITEM INTERACTION ===") -temporal_item_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * ITEM) -print("Estimated Marginal Means:") -print(temporal_item_emmeans) - -print("\nSimple Effects of ITEM within each TEMPORAL_DO:") -temporal_item_simple <- pairs(temporal_item_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") -print(temporal_item_simple) - -print("\nSimple Effects of TEMPORAL_DO within each ITEM:") -temporal_item_simple2 <- pairs(temporal_item_emmeans, by = "ITEM", adjust = "bonferroni") -print(temporal_item_simple2) - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - -# All pairwise comparisons for the three-way interaction -print("\n=== COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS ===") -three_way_emmeans <- emmeans(mixed_anova_model, ~ TEMPORAL_DO * TIME * ITEM) -print("Estimated Marginal Means for all combinations:") -print(three_way_emmeans) - -# Pairwise comparisons within each TEMPORAL_DO × TIME combination -print("\nPairwise comparisons of ITEM within each TEMPORAL_DO × TIME combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("TEMPORAL_DO", "TIME"), adjust = "bonferroni") -print(three_way_contrasts) - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - 
-# =============================================================================
-# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS
-# =============================================================================
-
-print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===")
-
-# Cohen's d for every significant pairwise comparison in a pairs() data frame
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if (nrow(significant_pairs) > 0) {
-    cat("Significant pairwise comparisons (p < 0.05):\n")
-    print(significant_pairs)
-
-    cat("\nCohen's d calculated from raw data:\n")
-
-    for (i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- as.character(comparison$contrast)
-
-      # Parse the contrast label, e.g. "read - music"
-      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-      if (length(contrast_parts) == 2) {
-        level1 <- trimws(contrast_parts[1])
-        level2 <- trimws(contrast_parts[2])
-
-        # Raw data for both conditions, restricted to the 'by' level if present
-        if (group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][data[[group1_var]] == level1 &
-                                          data[[group2_var]] == group2_level]
-          data2 <- data[[response_var]][data[[group1_var]] == level2 &
-                                          data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if (length(data1) > 0 && length(data2) > 0) {
-          # Cohen's d via the effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          cat(sprintf("Comparison: %s", contrast_name))
-          if (group2_var %in% colnames(comparison)) {
-            cat(sprintf(" | %s", group2_level))
-          }
-          cat(sprintf("\n  n1 = %d, n2 = %d\n", length(data1), length(data2)))
-          cat(sprintf("  Cohen's d: %.5f\n", cohens_d_result$estimate))
-          cat(sprintf("  Effect size interpretation: %s\n", cohens_d_result$magnitude))
-          cat(sprintf("  p-value: %.5f\n", comparison$p.value))
-          cat("\n")
-        }
-      }
-    }
-  } else {
-    cat("No significant pairwise comparisons found.\n")
-  }
-}
-
-# =============================================================================
-# 1. TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT: p = 0.001)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × TIME INTERACTION ===")
-
-# Simple effects of TIME within each TEMPORAL_DO
-temporal_time_simple_df <- as.data.frame(temporal_time_simple)
-calculate_cohens_d_for_pairs(temporal_time_simple_df, long_data_clean, "TIME", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
-# Simple effects of TEMPORAL_DO within each TIME
-temporal_time_simple2_df <- as.data.frame(temporal_time_simple2)
-calculate_cohens_d_for_pairs(temporal_time_simple2_df, long_data_clean, "TEMPORAL_DO", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 2. TIME × ITEM INTERACTION (SIGNIFICANT: p = 0.012)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TIME × ITEM INTERACTION ===")
-
-# Simple effects of TIME within each ITEM
-time_item_simple2_df <- as.data.frame(time_item_simple2)
-calculate_cohens_d_for_pairs(time_item_simple2_df, long_data_clean, "TIME", "ITEM", "MEAN_DIFFERENCE")
-
-# Simple effects of ITEM within each TIME
-time_item_simple_df <- as.data.frame(time_item_simple)
-calculate_cohens_d_for_pairs(time_item_simple_df, long_data_clean, "ITEM", "TIME", "MEAN_DIFFERENCE")
-
-# =============================================================================
-# 3. TEMPORAL_DO × ITEM INTERACTION (MARGINAL: p = 0.058)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TEMPORAL_DO × ITEM INTERACTION ===")
-
-# Simple effects of TEMPORAL_DO within each ITEM
-temporal_item_simple2_df <- as.data.frame(temporal_item_simple2)
-calculate_cohens_d_for_pairs(temporal_item_simple2_df, long_data_clean, "TEMPORAL_DO", "ITEM", "MEAN_DIFFERENCE")
-
-# Simple effects of ITEM within each TEMPORAL_DO
-temporal_item_simple_df <- as.data.frame(temporal_item_simple)
-calculate_cohens_d_for_pairs(temporal_item_simple_df, long_data_clean, "ITEM", "TEMPORAL_DO", "MEAN_DIFFERENCE")
-
diff --git a/.history/eohi1/mixed anova - preferences_20250916120409.r b/.history/eohi1/mixed anova - preferences_20250916120409.r
deleted file mode 100644
index 832ad6c..0000000
diff --git a/.history/eohi1/mixed anova - preferences_20250916120419.r b/.history/eohi1/mixed anova - preferences_20250916120419.r
deleted file mode 100644
index 34d2b75..0000000
(2 levels: Past, Future) × ITEM (5 levels: read, music, tv, nap, travel) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * ITEM + Error(pID/(TIME * ITEM)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = "MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "ITEM")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for ITEM (5 levels - within-subjects) -print("Mauchly's Test of Sphericity for ITEM:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_item <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = ITEM, - type = 3, - detailed = TRUE) - - print("ITEM Sphericity Test:") - print(ez_item$Mauchly) - -}, error = function(e) { - print(paste("Error in ITEM sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × ITEM interaction -print("\nMauchly's Test of Sphericity for TIME × ITEM Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - - print("TIME × ITEM Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × ITEM sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) - - # Extract and display corrected degrees 
of freedom
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  # Get the sphericity corrections
-  sphericity_corr <- ez_corrected$`Sphericity Corrections`
-
-  # Extract original degrees of freedom from ANOVA table
-  anova_table <- ez_corrected$ANOVA
-
-  # Calculate corrected degrees of freedom
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  # Also show the corrected F-values and p-values with degrees of freedom
-  cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n")
-  for(i in 1:nrow(corrected_df)) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    # %.0f rather than %d: the df columns are doubles, and sprintf("%d", ...) errors on doubles
-    cat(sprintf("  Original: F(%.0f, %.0f) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  print("\nNote: Sphericity corrections may not be displayed if sphericity is met")
-  print("Check the Mauchly's test p-values above to determine if corrections are needed")
-}
-
-# =============================================================================
-# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE
-# =============================================================================
-
-print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===")
-
-# Create wide-format data for the car package (library already loaded)
-
-tryCatch({
-  # Convert to wide format
-  wide_data <- long_data_clean %>%
-    select(pID, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>%
-    pivot_wider(names_from = c(TIME, ITEM),
-                values_from = MEAN_DIFFERENCE,
-                names_sep = "_")
-
-  # Columns produced by the pivot are TIME_ITEM combinations; the domain-level
-  # names copied from the earlier script would never match this item-level data
-  within_vars <- c("Past_read", "Past_music", "Past_tv", "Past_nap", "Past_travel",
-                   "Future_read", "Future_music", "Future_tv", "Future_nap", "Future_travel")
-
-  # Check if all columns exist
-  missing_cols <- within_vars[!within_vars %in% colnames(wide_data)]
-  if(length(missing_cols) > 0) {
-    print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", ")))
-  } else {
-    # Create the repeated measures design
-    rm_design <- as.matrix(wide_data[, within_vars])
-
-    # Calculate epsilon values
-    # (NOTE: epsilon() is not a standard car export; this call only succeeds if
-    # a helper named epsilon() is defined elsewhere, otherwise the enclosing
-    # tryCatch reports the error)
-    print("Epsilon Values from car package:")
-    epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser")
-    epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt")
-
-    print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4)))
-    print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4)))
-
-    # Interpretation
-    if(epsilon_gg < 0.75) {
-      print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)")
-    } else
if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to ITEM effects:") - - # ITEM main effect (DFn = 4, DFd = 4244) - item_df_corrected_gg <- 4 * epsilon_gg - item_df_corrected_hf <- 4 * epsilon_hf - - print(paste("ITEM: Original df = 4, 4244")) - print(paste("ITEM: GG corrected df =", round(item_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2))) - print(paste("ITEM: HF corrected df =", round(item_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2))) - - # TIME × ITEM interaction (DFn = 4, DFd = 4244) - interaction_df_corrected_gg <- 4 * epsilon_gg - interaction_df_corrected_hf <- 4 * epsilon_hf - - print(paste("TIME × ITEM: Original df = 4, 4244")) - print(paste("TIME × ITEM: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2))) - print(paste("TIME × ITEM: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. 
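For contrast with the situations listed next, here is a minimal sketch of the lmer alternative. It is hypothetical: lme4 and lmerTest are not among this script's libraries, the random-effects structure shown is the simplest possible choice, and a real analysis might add random slopes for the within-subjects factors.

# Hypothetical mixed-model alternative (sketch only, not part of this script):
library(lme4)
library(lmerTest)  # adds Satterthwaite df and p-values to anova() on lmer fits

# Random intercept per participant; contrasts are already contr.sum above
lmm <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * ITEM + (1 | pID),
            data = long_data_clean)
anova(lmm, type = "III")  # Type III F-tests with Satterthwaite approximation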
-# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of ITEM -print("\nMain Effect of ITEM:") -item_emmeans <- emmeans(mixed_anova_model, ~ ITEM) -print("Estimated Marginal Means:") -print(item_emmeans) -print("\nPairwise Contrasts:") -item_contrasts <- pairs(item_emmeans, adjust = "bonferroni") -print(item_contrasts) - - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - - -# TIME × ITEM Interaction -print("\n=== TIME × ITEM INTERACTION ===") -time_item_emmeans <- emmeans(mixed_anova_model, ~ TIME * ITEM) -print("Estimated Marginal Means:") -print(time_item_emmeans) - -print("\nSimple Effects of ITEM within each TIME:") -time_item_simple <- pairs(time_item_emmeans, by = "TIME", adjust = "bonferroni") -print(time_item_simple) - -print("\nSimple Effects of TIME within each ITEM:") -time_item_simple2 <- pairs(time_item_emmeans, by = "ITEM", adjust = "bonferroni") -print(time_item_simple2) - - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - 
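        # NOTE (effsize package defaults, not from the original file):
        # cohen.d(data1, data2) computes d = (mean(data1) - mean(data2)) / s_pooled
        # using the pooled SD (pooled = TRUE), and its $magnitude label applies
        # Cohen's conventional cutoffs: |d| < 0.2 negligible, < 0.5 small,
        # < 0.8 medium, otherwise large.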
- if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each ITEM -time_item_simple2_df <- as.data.frame(time_item_simple2) -calculate_cohens_d_for_pairs(time_item_simple2_df, long_data_clean, "TIME", "ITEM", "MEAN_DIFFERENCE") - -# Get simple effects of ITEM within each TIME -time_item_simple_df <- as.data.frame(time_item_simple) -calculate_cohens_d_for_pairs(time_item_simple_df, long_data_clean, "ITEM", "TIME", "MEAN_DIFFERENCE") - - diff --git a/.history/eohi1/mixed anova - preferences_20250916120427.r b/.history/eohi1/mixed anova - preferences_20250916120427.r deleted file mode 100644 index f440f30..0000000 --- a/.history/eohi1/mixed anova - preferences_20250916120427.r +++ /dev/null @@ -1,767 +0,0 @@ -# Mixed ANOVA Analysis for Preference Items -# EOHI Experiment Data Analysis - Item Level Analysis -# Variables: NPastDiff_pref_read, NPastDiff_pref_music, NPastDiff_pref_tv, NPastDiff_pref_nap, NPastDiff_pref_travel -# NFutDiff_pref_read, NFutDiff_pref_music, NFutDiff_pref_tv, NFutDiff_pref_nap, NFutDiff_pref_travel - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required preference item variables found!") -} - -# Define item mapping -item_mapping <- data.frame( - variable = c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel", - "NFutDiff_pref_read", 
"NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel"), - time = c(rep("Past", 5), rep("Future", 5)), - item = rep(c("read", "music", "tv", "nap", "travel"), 2), - stringsAsFactors = FALSE -) - -# Item mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(item_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'item' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - ITEM = factor(item, levels = c("read", "music", "tv", "nap", "travel")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and ITEM -desc_stats <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and ITEM:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and ITEM:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and ITEM:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each ITEM -homogeneity_time <- long_data_clean %>% - group_by(ITEM) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each ITEM:") -print(homogeneity_time) - -# Test homogeneity across ITEM within each TIME -homogeneity_item <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across ITEM within each TIME:") -print(homogeneity_item) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × ITEM) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × ITEM combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, ITEM, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, 
response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × ITEM combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, ITEM, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + ITEM, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of ITEM levels:", length(levels(long_data_clean$ITEM)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM) - -print(summary(as.vector(design_balance))) - -# Check for 
any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 10, # 2 TIME × 5 ITEM = 10 - missing_combinations = 10 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Repeated Measures ANOVA using aov() - Traditional approach -# Within-subjects: TIME (2 levels: Past, Future) × ITEM (5 levels: read, music, tv, nap, travel) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TIME * ITEM + Error(pID/(TIME * ITEM)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = "MEAN_DIFFERENCE", - data = long_data_clean, - within = c("TIME", "ITEM")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for ITEM (5 levels - within-subjects) -print("Mauchly's Test of Sphericity for ITEM:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_item <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = ITEM, - type = 3, - detailed = TRUE) - - print("ITEM Sphericity Test:") - print(ez_item$Mauchly) - -}, error = function(e) { - print(paste("Error in ITEM sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × ITEM interaction -print("\nMauchly's Test of Sphericity for TIME × ITEM Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - between = TEMPORAL_DO, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - - print("TIME × ITEM Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × ITEM sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - between = TEMPORAL_DO, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") 
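# NOTE: ez_corrected$Mauchly partial-matches the list element
# "Mauchly's Test for Sphericity", which holds Mauchly's W and its p-values;
# the epsilon estimates themselves (GGe, HFe) live in
# ez_corrected$`Sphericity Corrections`, used in the next section.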
-print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - # Get the sphericity corrections - sphericity_corr <- ez_corrected$`Sphericity Corrections` - - # Extract original degrees of freedom from ANOVA table - anova_table <- ez_corrected$ANOVA - - # Calculate corrected degrees of freedom - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Also show the corrected F-values and p-values with degrees of freedom - cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n") - for(i in 1:nrow(corrected_df)) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, ITEM), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # 
Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to ITEM effects:") - - # ITEM main effect (DFn = 4, DFd = 4244) - item_df_corrected_gg <- 4 * epsilon_gg - item_df_corrected_hf <- 4 * epsilon_hf - - print(paste("ITEM: Original df = 4, 4244")) - print(paste("ITEM: GG corrected df =", round(item_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2))) - print(paste("ITEM: HF corrected df =", round(item_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2))) - - # TIME × ITEM interaction (DFn = 4, DFd = 4244) - interaction_df_corrected_gg <- 4 * epsilon_gg - interaction_df_corrected_hf <- 4 * epsilon_hf - - print(paste("TIME × ITEM: Original df = 4, 4244")) - print(paste("TIME × ITEM: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2))) - print(paste("TIME × ITEM: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary 
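# (aside) partial eta^2 = SS_effect / (SS_effect + SS_error_for_that_effect),
# while generalized eta^2 also counts subject variance and every error term in
# the denominator, which is roughly what makes ges comparable across mixed designs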
-anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. -# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of ITEM -print("\nMain Effect of ITEM:") -item_emmeans <- emmeans(mixed_anova_model, ~ ITEM) -print("Estimated Marginal Means:") -print(item_emmeans) -print("\nPairwise Contrasts:") -item_contrasts <- pairs(item_emmeans, adjust = "bonferroni") -print(item_contrasts) - - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - - -# TIME × ITEM Interaction -print("\n=== TIME × ITEM INTERACTION ===") -time_item_emmeans <- emmeans(mixed_anova_model, ~ TIME * ITEM) -print("Estimated Marginal Means:") -print(time_item_emmeans) - -print("\nSimple Effects of ITEM within each TIME:") -time_item_simple <- pairs(time_item_emmeans, by = "TIME", adjust = "bonferroni") -print(time_item_simple) - -print("\nSimple Effects of TIME within each ITEM:") -time_item_simple2 <- pairs(time_item_emmeans, by = "ITEM", adjust = "bonferroni") -print(time_item_simple2) - - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - 
group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each ITEM -time_item_simple2_df <- as.data.frame(time_item_simple2) -calculate_cohens_d_for_pairs(time_item_simple2_df, long_data_clean, "TIME", "ITEM", "MEAN_DIFFERENCE") - -# Get simple effects of ITEM within each TIME -time_item_simple_df <- as.data.frame(time_item_simple) -calculate_cohens_d_for_pairs(time_item_simple_df, long_data_clean, "ITEM", "TIME", "MEAN_DIFFERENCE") - - diff --git a/.history/eohi1/mixed anova - preferences_20250916120436.r b/.history/eohi1/mixed anova - preferences_20250916120436.r deleted file mode 100644 index 406272e..0000000 --- a/.history/eohi1/mixed anova - preferences_20250916120436.r +++ /dev/null @@ -1,763 +0,0 @@ -# Mixed ANOVA Analysis for Preference Items -# EOHI Experiment Data Analysis - Item Level Analysis -# Variables: NPastDiff_pref_read, NPastDiff_pref_music, NPastDiff_pref_tv, NPastDiff_pref_nap, NPastDiff_pref_travel -# NFutDiff_pref_read, NFutDiff_pref_music, NFutDiff_pref_tv, NFutDiff_pref_nap, NFutDiff_pref_travel - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if 
(length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required preference item variables found!") -} - -# Define item mapping -item_mapping <- data.frame( - variable = c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel"), - time = c(rep("Past", 5), rep("Future", 5)), - item = rep(c("read", "music", "tv", "nap", "travel"), 2), - stringsAsFactors = FALSE -) - -# Item mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(item_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'item' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - ITEM = factor(item, levels = c("read", "music", "tv", "nap", "travel")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and ITEM -desc_stats <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and ITEM:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and ITEM:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. 
Missing values check -missing_summary <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and ITEM:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each ITEM -homogeneity_time <- long_data_clean %>% - group_by(ITEM) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each ITEM:") -print(homogeneity_time) - -# Test homogeneity across ITEM within each TIME -homogeneity_item <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across ITEM within each TIME:") -print(homogeneity_item) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × ITEM) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × ITEM combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - 
f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, ITEM, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × ITEM combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, ITEM, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + ITEM, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of ITEM levels:", length(levels(long_data_clean$ITEM)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- 
long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 10, # 2 TIME × 5 ITEM = 10 - missing_combinations = 10 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Repeated Measures ANOVA using aov() - Traditional approach -# Within-subjects: TIME (2 levels: Past, Future) × ITEM (5 levels: read, music, tv, nap, travel) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TIME * ITEM + Error(pID/(TIME * ITEM)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = "MEAN_DIFFERENCE", - data = long_data_clean, - within = c("TIME", "ITEM")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for ITEM (5 levels - within-subjects) -print("Mauchly's Test of Sphericity for ITEM:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_item <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - within = ITEM, - type = 3, - detailed = TRUE) - - print("ITEM Sphericity Test:") - print(ez_item$Mauchly) - -}, error = function(e) { - print(paste("Error in ITEM sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × ITEM interaction -print("\nMauchly's Test of Sphericity for TIME × ITEM Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - - print("TIME × ITEM Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × ITEM sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - 
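The manual-corrections note further down in this script says corrected p-values "would need to be recalculated" from the corrected degrees of freedom. As a minimal editorial sketch (not part of the deleted snapshot), that recalculation just scales both degrees of freedom by epsilon and re-evaluates the F distribution; `recompute_corrected_p` is an illustrative helper name, assuming a row of the ezANOVA table produced above with F, DFn, and DFd columns:

# Recompute a sphericity-corrected p-value by hand: the F statistic is
# unchanged, only the degrees of freedom shrink by the factor epsilon.
recompute_corrected_p <- function(anova_row, eps) {
  pf(anova_row$F, anova_row$DFn * eps, anova_row$DFd * eps, lower.tail = FALSE)
}

# e.g. GG-corrected p for the ITEM effect (illustrative indexing):
# recompute_corrected_p(ez_corrected$ANOVA[ez_corrected$ANOVA$Effect == "ITEM", ],
#                       ez_corrected$`Sphericity Corrections`$GGe[1])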
-print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - # Get the sphericity corrections - sphericity_corr <- ez_corrected$`Sphericity Corrections` - - # Extract original degrees of freedom from ANOVA table - anova_table <- ez_corrected$ANOVA - - # Calculate corrected degrees of freedom - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Also show the corrected F-values and p-values with degrees of freedom - cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n") - for(i in 1:nrow(corrected_df)) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, ITEM), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - 
print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to ITEM effects:") - - # ITEM main effect (DFn = 4, DFd = 4244) - item_df_corrected_gg <- 4 * epsilon_gg - item_df_corrected_hf <- 4 * epsilon_hf - - print(paste("ITEM: Original df = 4, 4244")) - print(paste("ITEM: GG corrected df =", round(item_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2))) - print(paste("ITEM: HF corrected df =", round(item_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2))) - - # TIME × ITEM interaction (DFn = 4, DFd = 4244) - interaction_df_corrected_gg <- 4 * epsilon_gg - interaction_df_corrected_hf <- 4 * epsilon_hf - - print(paste("TIME × ITEM: Original df = 4, 4244")) - print(paste("TIME × ITEM: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2))) - print(paste("TIME × ITEM: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - 
print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. -# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of ITEM -print("\nMain Effect of ITEM:") -item_emmeans <- emmeans(mixed_anova_model, ~ ITEM) -print("Estimated Marginal Means:") -print(item_emmeans) -print("\nPairwise Contrasts:") -item_contrasts <- pairs(item_emmeans, adjust = "bonferroni") -print(item_contrasts) - - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - - -# TIME × ITEM Interaction -print("\n=== TIME × ITEM INTERACTION ===") -time_item_emmeans <- emmeans(mixed_anova_model, ~ TIME * ITEM) -print("Estimated Marginal Means:") -print(time_item_emmeans) - -print("\nSimple Effects of ITEM within each TIME:") -time_item_simple <- pairs(time_item_emmeans, by = "TIME", adjust = "bonferroni") -print(time_item_simple) - -print("\nSimple Effects of TIME within each ITEM:") -time_item_simple2 <- pairs(time_item_emmeans, by = "ITEM", adjust = "bonferroni") -print(time_item_simple2) - - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - 
if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each ITEM -time_item_simple2_df <- as.data.frame(time_item_simple2) -calculate_cohens_d_for_pairs(time_item_simple2_df, long_data_clean, "TIME", "ITEM", "MEAN_DIFFERENCE") - -# Get simple effects of ITEM within each TIME -time_item_simple_df <- as.data.frame(time_item_simple) -calculate_cohens_d_for_pairs(time_item_simple_df, long_data_clean, "ITEM", "TIME", "MEAN_DIFFERENCE") - - diff --git a/.history/eohi1/mixed anova - preferences_20250916120437.r b/.history/eohi1/mixed anova - preferences_20250916120437.r deleted file mode 100644 index 406272e..0000000 --- a/.history/eohi1/mixed anova - preferences_20250916120437.r +++ /dev/null @@ -1,763 +0,0 @@ -# Mixed ANOVA Analysis for Preference Items -# EOHI Experiment Data Analysis - Item Level Analysis -# Variables: NPastDiff_pref_read, NPastDiff_pref_music, NPastDiff_pref_tv, NPastDiff_pref_nap, NPastDiff_pref_travel -# NFutDiff_pref_read, NFutDiff_pref_music, NFutDiff_pref_tv, NFutDiff_pref_nap, NFutDiff_pref_travel - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", 
"NPastDiff_pref_travel", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required preference item variables found!") -} - -# Define item mapping -item_mapping <- data.frame( - variable = c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel"), - time = c(rep("Past", 5), rep("Future", 5)), - item = rep(c("read", "music", "tv", "nap", "travel"), 2), - stringsAsFactors = FALSE -) - -# Item mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(item_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'item' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - ITEM = factor(item, levels = c("read", "music", "tv", "nap", "travel")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and ITEM -desc_stats <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and ITEM:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and ITEM:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. 
Missing values check -missing_summary <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and ITEM:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each ITEM -homogeneity_time <- long_data_clean %>% - group_by(ITEM) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each ITEM:") -print(homogeneity_time) - -# Test homogeneity across ITEM within each TIME -homogeneity_item <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across ITEM within each TIME:") -print(homogeneity_item) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × ITEM) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × ITEM combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - 
f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, ITEM, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × ITEM combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, ITEM, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + ITEM, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of ITEM levels:", length(levels(long_data_clean$ITEM)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- 
long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 10, # 2 TIME × 5 ITEM = 10 - missing_combinations = 10 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Repeated Measures ANOVA using aov() - Traditional approach -# Within-subjects: TIME (2 levels: Past, Future) × ITEM (5 levels: read, music, tv, nap, travel) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TIME * ITEM + Error(pID/(TIME * ITEM)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = "MEAN_DIFFERENCE", - data = long_data_clean, - within = c("TIME", "ITEM")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for ITEM (5 levels - within-subjects) -print("Mauchly's Test of Sphericity for ITEM:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_item <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - within = ITEM, - type = 3, - detailed = TRUE) - - print("ITEM Sphericity Test:") - print(ez_item$Mauchly) - -}, error = function(e) { - print(paste("Error in ITEM sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × ITEM interaction -print("\nMauchly's Test of Sphericity for TIME × ITEM Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - - print("TIME × ITEM Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × ITEM sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - 
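An editorial cross-check, not part of the deleted snapshot: afex bundles these sphericity diagnostics, so the model fitted earlier in this script can be used to validate the piecewise ezANOVA calls. A minimal sketch, assuming the `mixed_anova_afex` object from the afex section above:

# summary() on an afex_aov object prints Mauchly's tests together with the
# Greenhouse-Geisser and Huynh-Feldt corrected tables in one step.
summary(mixed_anova_afex)

# The ANOVA table with a chosen correction applied explicitly:
anova(mixed_anova_afex, correction = "GG")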
-print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - # Get the sphericity corrections - sphericity_corr <- ez_corrected$`Sphericity Corrections` - - # Extract original degrees of freedom from ANOVA table - anova_table <- ez_corrected$ANOVA - - # Calculate corrected degrees of freedom - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Also show the corrected F-values and p-values with degrees of freedom - cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n") - for(i in 1:nrow(corrected_df)) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, ITEM), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - 
print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to ITEM effects:") - - # ITEM main effect (DFn = 4, DFd = 4244) - item_df_corrected_gg <- 4 * epsilon_gg - item_df_corrected_hf <- 4 * epsilon_hf - - print(paste("ITEM: Original df = 4, 4244")) - print(paste("ITEM: GG corrected df =", round(item_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2))) - print(paste("ITEM: HF corrected df =", round(item_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2))) - - # TIME × ITEM interaction (DFn = 4, DFd = 4244) - interaction_df_corrected_gg <- 4 * epsilon_gg - interaction_df_corrected_hf <- 4 * epsilon_hf - - print(paste("TIME × ITEM: Original df = 4, 4244")) - print(paste("TIME × ITEM: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2))) - print(paste("TIME × ITEM: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - 
print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. -# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of ITEM -print("\nMain Effect of ITEM:") -item_emmeans <- emmeans(mixed_anova_model, ~ ITEM) -print("Estimated Marginal Means:") -print(item_emmeans) -print("\nPairwise Contrasts:") -item_contrasts <- pairs(item_emmeans, adjust = "bonferroni") -print(item_contrasts) - - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - - -# TIME × ITEM Interaction -print("\n=== TIME × ITEM INTERACTION ===") -time_item_emmeans <- emmeans(mixed_anova_model, ~ TIME * ITEM) -print("Estimated Marginal Means:") -print(time_item_emmeans) - -print("\nSimple Effects of ITEM within each TIME:") -time_item_simple <- pairs(time_item_emmeans, by = "TIME", adjust = "bonferroni") -print(time_item_simple) - -print("\nSimple Effects of TIME within each ITEM:") -time_item_simple2 <- pairs(time_item_emmeans, by = "ITEM", adjust = "bonferroni") -print(time_item_simple2) - - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - 
if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each ITEM -time_item_simple2_df <- as.data.frame(time_item_simple2) -calculate_cohens_d_for_pairs(time_item_simple2_df, long_data_clean, "TIME", "ITEM", "MEAN_DIFFERENCE") - -# Get simple effects of ITEM within each TIME -time_item_simple_df <- as.data.frame(time_item_simple) -calculate_cohens_d_for_pairs(time_item_simple_df, long_data_clean, "ITEM", "TIME", "MEAN_DIFFERENCE") - - diff --git a/.history/eohi1/mixed anova - preferences_20250916120515.r b/.history/eohi1/mixed anova - preferences_20250916120515.r deleted file mode 100644 index 57d9e96..0000000 --- a/.history/eohi1/mixed anova - preferences_20250916120515.r +++ /dev/null @@ -1,765 +0,0 @@ -# Mixed ANOVA Analysis for Preference Items -# EOHI Experiment Data Analysis - Item Level Analysis -# Variables: NPastDiff_pref_read, NPastDiff_pref_music, NPastDiff_pref_tv, NPastDiff_pref_nap, NPastDiff_pref_travel -# NFutDiff_pref_read, NFutDiff_pref_music, NFutDiff_pref_tv, NFutDiff_pref_nap, NFutDiff_pref_travel - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", 
"NPastDiff_pref_travel", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required preference item variables found!") -} - -# Define item mapping -item_mapping <- data.frame( - variable = c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel"), - time = c(rep("Past", 5), rep("Future", 5)), - item = rep(c("read", "music", "tv", "nap", "travel"), 2), - stringsAsFactors = FALSE -) - -# Item mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(item_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'item' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - ITEM = factor(item, levels = c("read", "music", "tv", "nap", "travel")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and ITEM -desc_stats <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and ITEM:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and ITEM:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. 
Missing values check -missing_summary <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and ITEM:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each ITEM -homogeneity_time <- long_data_clean %>% - group_by(ITEM) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each ITEM:") -print(homogeneity_time) - -# Test homogeneity across ITEM within each TIME -homogeneity_item <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across ITEM within each TIME:") -print(homogeneity_item) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × ITEM) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × ITEM combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - 
f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, ITEM, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × ITEM combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, ITEM, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + ITEM, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of ITEM levels:", length(levels(long_data_clean$ITEM)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- 
long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 10, # 2 TIME × 5 ITEM = 10 - missing_combinations = 10 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × ITEM (5 levels: read, music, tv, nap, travel) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * ITEM + Error(pID/(TIME * ITEM)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = "MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "ITEM")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for ITEM (5 levels - within-subjects) -print("Mauchly's Test of Sphericity for ITEM:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_item <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - within = ITEM, - type = 3, - detailed = TRUE) - - print("ITEM Sphericity Test:") - print(ez_item$Mauchly) - -}, error = function(e) { - print(paste("Error in ITEM sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × ITEM interaction -print("\nMauchly's Test of Sphericity for TIME × ITEM Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - - print("TIME × ITEM Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × ITEM sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = 
-# =============================================================================
-# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Get corrected results from ezANOVA
-ez_corrected <- ezANOVA(data = long_data_clean,
-                        dv = MEAN_DIFFERENCE,
-                        wid = pID,
-                        within = .(TIME, ITEM),
-                        type = 3,
-                        detailed = TRUE)
-
-print("Corrected ANOVA Results with Sphericity Corrections:")
-print(ez_corrected$ANOVA)
-
-# Show Mauchly's tests for the within-subjects terms (epsilon values follow below)
-print("\nMauchly's Tests of Sphericity:")
-print(ez_corrected$Mauchly)
-
-# Show sphericity-corrected results
-print("\nSphericity-Corrected Results:")
-print("Available elements in ez_corrected object:")
-print(names(ez_corrected))
-
-# Check if sphericity corrections are available
-if(!is.null(ez_corrected$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(ez_corrected$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  # Get the sphericity corrections
-  sphericity_corr <- ez_corrected$`Sphericity Corrections`
-
-  # Extract original degrees of freedom from ANOVA table
-  anova_table <- ez_corrected$ANOVA
-
-  # Calculate corrected degrees of freedom
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  # Also show the corrected F-values and p-values with degrees of freedom
-  cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n")
-  for(i in 1:nrow(corrected_df)) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf(" Original: F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  print("\nNote: Sphericity corrections may not be displayed if sphericity is met")
-  print("Check the Mauchly's test p-values above to determine if corrections are needed")
-}
-
-# =============================================================================
-# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE
-# =============================================================================
-
-print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===")
-
-# Create a wide-format data for car package (library already loaded)
-
-tryCatch({
-  # Convert to wide format for car package
-  wide_data <- long_data_clean %>%
-    select(pID, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>%
-    pivot_wider(names_from = c(TIME, ITEM),
-                values_from = MEAN_DIFFERENCE,
-                names_sep = "_")
-
-  # Create the repeated measures design; column names follow
-  # pivot_wider(names_from = c(TIME, ITEM), names_sep = "_")
-  within_vars <- c("Past_read", "Past_music", "Past_tv", "Past_nap", "Past_travel",
-                   "Future_read", "Future_music", "Future_tv", "Future_nap", "Future_travel")
-
-  # Check if all columns exist
-  missing_cols <- within_vars[!within_vars %in% colnames(wide_data)]
-  if(length(missing_cols) > 0) {
-    print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", ")))
-  } else {
-    # Create the repeated measures design
-    rm_design <- as.matrix(wide_data[, within_vars])
-
-    # Calculate epsilon values
-    # NOTE: car does not export an epsilon() function; these calls assume a
-    # custom epsilon() helper is defined elsewhere, otherwise the error
-    # handler below reports the failure
-    print("Epsilon Values from car package:")
-    epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser")
-    epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt")
-
-    print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4)))
-    print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4)))
-
-    # Interpretation
-    if(epsilon_gg < 0.75) {
-      print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)")
-    } else if(epsilon_hf > 0.75) {
-      print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)")
-    } else {
-      print("Recommendation: Use Greenhouse-Geisser correction (conservative)")
-    }
-
-    # =============================================================================
-    # MANUAL SPHERICITY CORRECTIONS
-    # =============================================================================
-
-    print("\n=== MANUAL SPHERICITY CORRECTIONS ===")
-
-    # Apply corrections to the original ANOVA results
-    print("Applying Greenhouse-Geisser corrections to ITEM effects:")
-
-    # ITEM main effect (DFn = 4, DFd = 4244)
-    item_df_corrected_gg <- 4 * epsilon_gg
-    item_df_corrected_hf <- 4 * epsilon_hf
-
-    print(paste("ITEM: Original df = 4, 4244"))
-    print(paste("ITEM: GG corrected df =", round(item_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2)))
-    print(paste("ITEM: HF corrected df =", round(item_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2)))
-
-    # TIME × ITEM interaction (DFn = 4, DFd = 4244)
-    interaction_df_corrected_gg <- 4 * epsilon_gg
-    interaction_df_corrected_hf <- 4 * epsilon_hf
-
-    print(paste("TIME × ITEM: Original df = 4, 4244"))
-    print(paste("TIME × ITEM: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2)))
-    print(paste("TIME × ITEM: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2)))
-
-    # Note: You would need to recalculate p-values with these corrected dfs
-    print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom")
-    print("The ezANOVA function should handle this automatically, but may not display the corrections")
-  }
-
-}, error = function(e) {
-  print(paste("Error in manual epsilon calculation:", e$message))
-})
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-# Effect size calculations (library already loaded)
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Calculate generalized eta squared for the aov model
-print("Effect Sizes from aov() model:")
-tryCatch({
-  # Extract effect sizes from aov model
-  aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE)
-  # round() fails on the non-numeric columns of the effect-size table,
-  # so control the display through print() instead
-  print(aov_effects, digits = 5)
-}, error = function(e) {
-  print(paste("Error calculating effect sizes from aov:", e$message))
-})
-
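# ---------------------------------------------------------------------------
# Companion note to the ges values printed below: the effectsize package also
# ships an interpretation helper with Field-style cutoffs (0.01 / 0.06 / 0.14).
# An illustrative call on made-up values, assuming effectsize is loaded as above:
interpret_eta_squared(c(0.005, 0.06, 0.20))  # "very small" / "medium" / "large"
# ---------------------------------------------------------------------------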
-# Calculate effect sizes for ezANOVA model
-print("\nEffect Sizes from ezANOVA model:")
-tryCatch({
-  # ezANOVA already reports generalized eta squared in the ges column
-  ez_effects <- ez_corrected$ANOVA
-  ez_effects$ges <- round(ez_effects$ges, 5)
-  print("Generalized Eta Squared from ezANOVA:")
-  print(ez_effects[, c("Effect", "ges")])
-}, error = function(e) {
-  print(paste("Error extracting effect sizes from ezANOVA:", e$message))
-})
-
-# Extract effect sizes (generalized eta squared)
-# For aov() objects, we need to extract from the summary
-anova_summary <- summary(mixed_anova_model)
-
-# =============================================================================
-# NOTE: MIXED MODELS (LMER) NOT NEEDED
-# =============================================================================
-
-# For this balanced repeated measures design, Type III ANOVA with proper
-# sphericity corrections (implemented above) is the most appropriate approach.
-# Mixed models (lmer) are typically used for:
-# - Unbalanced designs
-# - Missing data patterns
-# - Nested random effects
-# - Large, complex datasets
-#
-# Your design is balanced and complete, making Type III ANOVA optimal.
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(mixed_anova_model, ~ TIME)
-print("Estimated Marginal Means:")
-print(time_emmeans)
-print("\nPairwise Contrasts:")
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of ITEM
-print("\nMain Effect of ITEM:")
-item_emmeans <- emmeans(mixed_anova_model, ~ ITEM)
-print("Estimated Marginal Means:")
-print(item_emmeans)
-print("\nPairwise Contrasts:")
-item_contrasts <- pairs(item_emmeans, adjust = "bonferroni")
-print(item_contrasts)
-
-
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-
-# TIME × ITEM Interaction
-print("\n=== TIME × ITEM INTERACTION ===")
-time_item_emmeans <- emmeans(mixed_anova_model, ~ TIME * ITEM)
-print("Estimated Marginal Means:")
-print(time_item_emmeans)
-
-print("\nSimple Effects of ITEM within each TIME:")
-time_item_simple <- pairs(time_item_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_item_simple)
-
-print("\nSimple Effects of TIME within each ITEM:")
-time_item_simple2 <- pairs(time_item_emmeans, by = "ITEM", adjust = "bonferroni")
-print(time_item_simple2)
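# ---------------------------------------------------------------------------
# Alongside the raw-data Cohen's d helper defined below, emmeans can
# standardize the same contrasts via eff_size(). Choosing the standardizer
# (sigma) and its degrees of freedom (edf) is nontrivial for a multi-stratum
# aov model; the pooled raw SD used here is a crude placeholder for
# illustration, not a recommendation.
d_time_item <- eff_size(time_item_emmeans,
                        sigma = sd(long_data_clean$MEAN_DIFFERENCE),
                        edf = nrow(long_data_clean) - 1)
print(d_time_item)
# ---------------------------------------------------------------------------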
-
-
-# =============================================================================
-# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS
-# =============================================================================
-
-
-# =============================================================================
-# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS
-# =============================================================================
-
-# Cohen's d calculations (library already loaded)
-
-print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===")
-
-# Function to calculate Cohen's d for pairwise comparisons
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if(nrow(significant_pairs) > 0) {
-    cat("Significant pairwise comparisons (p < 0.05):\n")
-    print(significant_pairs)
-
-    cat("\nCohen's d calculated from raw data:\n")
-
-    for(i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- as.character(comparison$contrast)
-
-      # Parse the contrast
-      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-      if(length(contrast_parts) == 2) {
-        level1 <- trimws(contrast_parts[1])
-        level2 <- trimws(contrast_parts[2])
-
-        # Get raw data for both conditions
-        if(group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][
-            data[[group1_var]] == level1 &
-            data[[group2_var]] == group2_level]
-
-          data2 <- data[[response_var]][
-            data[[group1_var]] == level2 &
-            data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if(length(data1) > 0 && length(data2) > 0) {
-          # Calculate Cohen's d using effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          cat(sprintf("Comparison: %s", contrast_name))
-          if(group2_var %in% colnames(comparison)) {
-            cat(sprintf(" | %s", group2_level))
-          }
-          cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2)))
-          cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate))
-          cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude))
-          cat(sprintf(" p-value: %.5f\n", comparison$p.value))
-          cat("\n")
-        }
-      }
-    }
-  } else {
-    cat("No significant pairwise comparisons found.\n")
-  }
-}
-
-
-# =============================================================================
-# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012)
-# =============================================================================
-
-print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===")
-
-# Get simple effects of TIME within each ITEM
-time_item_simple2_df <- as.data.frame(time_item_simple2)
-calculate_cohens_d_for_pairs(time_item_simple2_df, long_data_clean, "TIME", "ITEM", "MEAN_DIFFERENCE")
-
-# Get simple effects of ITEM within each TIME
-time_item_simple_df <- as.data.frame(time_item_simple)
-calculate_cohens_d_for_pairs(time_item_simple_df, long_data_clean, "ITEM", "TIME", "MEAN_DIFFERENCE")
-
-
diff --git a/.history/eohi1/mixed anova - preferences_20250916120522.r b/.history/eohi1/mixed anova - preferences_20250916120522.r
deleted file mode 100644
index 57d9e96..0000000
--- a/.history/eohi1/mixed anova - preferences_20250916120522.r
+++ /dev/null
@@ -1,765 +0,0 @@
-# Mixed ANOVA Analysis for Preference Items
-# EOHI Experiment Data Analysis - Item Level Analysis
-# Variables: NPastDiff_pref_read, NPastDiff_pref_music, NPastDiff_pref_tv, NPastDiff_pref_nap, NPastDiff_pref_travel
-# NFutDiff_pref_read, NFutDiff_pref_music, NFutDiff_pref_tv, NFutDiff_pref_nap, NFutDiff_pref_travel
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex) # For aov_ez (cleaner ANOVA output)
-library(nortest) # For normality tests
-library(emmeans) # For post-hoc comparisons
-library(purrr) # For map functions
-library(effsize) # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Display basic information about the dataset
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x")))
-print(paste("Number of participants:", length(unique(data$pID))))
-
-# Verify the specific variables we need
-required_vars <- c("NPastDiff_pref_read", 
"NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required preference item variables found!") -} - -# Define item mapping -item_mapping <- data.frame( - variable = c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel"), - time = c(rep("Past", 5), rep("Future", 5)), - item = rep(c("read", "music", "tv", "nap", "travel"), 2), - stringsAsFactors = FALSE -) - -# Item mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(item_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'item' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - ITEM = factor(item, levels = c("read", "music", "tv", "nap", "travel")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and ITEM -desc_stats <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and ITEM:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and ITEM:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", 
paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and ITEM:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each ITEM -homogeneity_time <- long_data_clean %>% - group_by(ITEM) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each ITEM:") -print(homogeneity_time) - -# Test homogeneity across ITEM within each TIME -homogeneity_item <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across ITEM within each TIME:") -print(homogeneity_item) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × ITEM) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × ITEM combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO 
== "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, ITEM, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × ITEM combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, ITEM, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + ITEM, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of ITEM levels:", length(levels(long_data_clean$ITEM)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for 
complete cases
-complete_cases <- long_data_clean[complete.cases(long_data_clean), ]
-print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean)))
-
-# Check if design is balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM)
-
-print(summary(as.vector(design_balance)))
-
-# Check for any participants with missing combinations
-missing_combos <- long_data_clean %>%
-  group_by(pID) %>%
-  summarise(
-    n_combinations = n(),
-    expected_combinations = 10, # 2 TIME × 5 ITEM = 10
-    missing_combinations = 10 - n_combinations,
-    .groups = 'drop'
-  )
-
-print("Missing combinations per participant:")
-print(missing_combos[missing_combos$missing_combinations > 0, ])
-
-# Mixed ANOVA using aov() - Traditional approach
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × ITEM (5 levels: read, music, tv, nap, travel)
-
-mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * ITEM + Error(pID/(TIME * ITEM)),
-                         data = long_data_clean)
-
-print("Mixed ANOVA Results (aov):")
-print(summary(mixed_anova_model))
-
-# Alternative: Using afex::aov_ez for cleaner output (optional)
-print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===")
-mixed_anova_afex <- aov_ez(id = "pID",
-                           dv = "MEAN_DIFFERENCE",
-                           data = long_data_clean,
-                           between = "TEMPORAL_DO",
-                           within = c("TIME", "ITEM"))
-
-print("Mixed ANOVA Results (afex):")
-print(mixed_anova_afex)
-
-# =============================================================================
-# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS
-# =============================================================================
-
-# Sphericity tests using ezANOVA (library already loaded)
-
-print("\n=== SPHERICITY TESTS ===")
-
-# Test sphericity for ITEM (5 levels - within-subjects)
-print("Mauchly's Test of Sphericity for ITEM:")
-tryCatch({
-  # Create a temporary data frame for ezANOVA
-  temp_data <- long_data_clean
-  temp_data$id <- as.numeric(as.factor(temp_data$pID))
-
-  # Run ezANOVA to get sphericity tests
-  ez_item <- ezANOVA(data = temp_data,
-                     dv = MEAN_DIFFERENCE,
-                     wid = id,
-                     within = ITEM,
-                     type = 3,
-                     detailed = TRUE)
-
-  print("ITEM Sphericity Test:")
-  print(ez_item$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in ITEM sphericity test:", e$message))
-})
-
-# Test sphericity for TIME (2 levels - within-subjects)
-print("\nMauchly's Test of Sphericity for TIME:")
-tryCatch({
-  ez_time <- ezANOVA(data = temp_data,
-                     dv = MEAN_DIFFERENCE,
-                     wid = id,
-                     within = TIME,
-                     type = 3,
-                     detailed = TRUE)
-
-  print("TIME Sphericity Test:")
-  print(ez_time$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in TIME sphericity test:", e$message))
-})
-
-# Test sphericity for TIME × ITEM interaction
-print("\nMauchly's Test of Sphericity for TIME × ITEM Interaction:")
-tryCatch({
-  ez_interaction <- ezANOVA(data = temp_data,
-                            dv = MEAN_DIFFERENCE,
-                            wid = id,
-                            within = .(TIME, ITEM),
-                            type = 3,
-                            detailed = TRUE)
-
-  print("TIME × ITEM Sphericity Test:")
-  print(ez_interaction$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in TIME × ITEM sphericity test:", e$message))
-})
-
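# ---------------------------------------------------------------------------
# Cross-check for the ezANOVA corrections that follow: anova() on the
# afex_aov object fitted above accepts a documented correction argument
# ("GG", "HF", or "none"), so the corrected tables can be obtained without
# manual df bookkeeping.
print(anova(mixed_anova_afex, correction = "GG", es = "ges"))
print(anova(mixed_anova_afex, correction = "HF", es = "ges"))
# ---------------------------------------------------------------------------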
-# =============================================================================
-# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Get corrected results from ezANOVA
-ez_corrected <- ezANOVA(data = long_data_clean,
-                        dv = MEAN_DIFFERENCE,
-                        wid = pID,
-                        within = .(TIME, ITEM),
-                        type = 3,
-                        detailed = TRUE)
-
-print("Corrected ANOVA Results with Sphericity Corrections:")
-print(ez_corrected$ANOVA)
-
-# Show Mauchly's tests for the within-subjects terms (epsilon values follow below)
-print("\nMauchly's Tests of Sphericity:")
-print(ez_corrected$Mauchly)
-
-# Show sphericity-corrected results
-print("\nSphericity-Corrected Results:")
-print("Available elements in ez_corrected object:")
-print(names(ez_corrected))
-
-# Check if sphericity corrections are available
-if(!is.null(ez_corrected$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(ez_corrected$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  # Get the sphericity corrections
-  sphericity_corr <- ez_corrected$`Sphericity Corrections`
-
-  # Extract original degrees of freedom from ANOVA table
-  anova_table <- ez_corrected$ANOVA
-
-  # Calculate corrected degrees of freedom
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  # Also show the corrected F-values and p-values with degrees of freedom
-  cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n")
-  for(i in 1:nrow(corrected_df)) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf(" Original: F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  print("\nNote: Sphericity corrections may not be displayed if sphericity is met")
-  print("Check the Mauchly's test p-values above to determine if corrections are needed")
-}
-
-# =============================================================================
-# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE
-# =============================================================================
-
-print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===")
-
-# Create a wide-format data for car package (library already loaded)
-
-tryCatch({
-  # Convert to wide format for car package
-  wide_data <- long_data_clean %>%
-    select(pID, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>%
-    pivot_wider(names_from = c(TIME, ITEM),
-                values_from = MEAN_DIFFERENCE,
-                names_sep = "_")
-
-  # Create the repeated measures design; column names follow
-  # pivot_wider(names_from = c(TIME, ITEM), names_sep = "_")
-  within_vars <- c("Past_read", "Past_music", "Past_tv", "Past_nap", "Past_travel",
-                   "Future_read", "Future_music", "Future_tv", "Future_nap", "Future_travel")
-
-  # Check if all columns exist
-  missing_cols <- within_vars[!within_vars %in% colnames(wide_data)]
-  if(length(missing_cols) > 0) {
-    print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", ")))
-  } else {
-    # Create the repeated measures design
-    rm_design <- as.matrix(wide_data[, within_vars])
-
-    # Calculate epsilon values
-    # NOTE: car does not export an epsilon() function; these calls assume a
-    # custom epsilon() helper is defined elsewhere, otherwise the error
-    # handler below reports the failure
-    print("Epsilon Values from car package:")
-    epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser")
-    epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt")
-
-    print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4)))
-    print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4)))
-
-    # Interpretation
-    if(epsilon_gg < 0.75) {
-      print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)")
-    } else if(epsilon_hf > 0.75) {
-      print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)")
-    } else {
-      print("Recommendation: Use Greenhouse-Geisser correction (conservative)")
-    }
-
-    # =============================================================================
-    # MANUAL SPHERICITY CORRECTIONS
-    # =============================================================================
-
-    print("\n=== MANUAL SPHERICITY CORRECTIONS ===")
-
-    # Apply corrections to the original ANOVA results
-    print("Applying Greenhouse-Geisser corrections to ITEM effects:")
-
-    # ITEM main effect (DFn = 4, DFd = 4244)
-    item_df_corrected_gg <- 4 * epsilon_gg
-    item_df_corrected_hf <- 4 * epsilon_hf
-
-    print(paste("ITEM: Original df = 4, 4244"))
-    print(paste("ITEM: GG corrected df =", round(item_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2)))
-    print(paste("ITEM: HF corrected df =", round(item_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2)))
-
-    # TIME × ITEM interaction (DFn = 4, DFd = 4244)
-    interaction_df_corrected_gg <- 4 * epsilon_gg
-    interaction_df_corrected_hf <- 4 * epsilon_hf
-
-    print(paste("TIME × ITEM: Original df = 4, 4244"))
-    print(paste("TIME × ITEM: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2)))
-    print(paste("TIME × ITEM: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2)))
-
-    # Note: You would need to recalculate p-values with these corrected dfs
-    print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom")
-    print("The ezANOVA function should handle this automatically, but may not display the corrections")
-  }
-
-}, error = function(e) {
-  print(paste("Error in manual epsilon calculation:", e$message))
-})
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-# Effect size calculations (library already loaded)
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Calculate generalized eta squared for the aov model
-print("Effect Sizes from aov() model:")
-tryCatch({
-  # Extract effect sizes from aov model
-  aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE)
-  # round() fails on the non-numeric columns of the effect-size table,
-  # so control the display through print() instead
-  print(aov_effects, digits = 5)
-}, error = function(e) {
-  print(paste("Error calculating effect sizes from aov:", e$message))
-})
-
-# Calculate effect sizes for ezANOVA model
-print("\nEffect Sizes from ezANOVA model:")
-tryCatch({
-  # ezANOVA already reports generalized eta squared in the ges column
-  ez_effects <- ez_corrected$ANOVA
-  ez_effects$ges <- round(ez_effects$ges, 5)
-  print("Generalized Eta Squared from ezANOVA:")
-  print(ez_effects[, c("Effect", "ges")])
-}, error = function(e) {
-  print(paste("Error extracting effect sizes from ezANOVA:", e$message))
-})
-
-# Extract effect sizes (generalized eta squared)
-# For aov() objects, we need to extract from the summary
-anova_summary <- summary(mixed_anova_model)
-
-# =============================================================================
-# NOTE: MIXED MODELS (LMER) NOT NEEDED
-# =============================================================================
-
-# For this balanced repeated measures design, Type III ANOVA with proper
-# sphericity corrections (implemented above) is the most appropriate approach.
-# Mixed models (lmer) are typically used for:
-# - Unbalanced designs
-# - Missing data patterns
-# - Nested random effects
-# - Large, complex datasets
-#
-# Your design is balanced and complete, making Type III ANOVA optimal.
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(mixed_anova_model, ~ TIME)
-print("Estimated Marginal Means:")
-print(time_emmeans)
-print("\nPairwise Contrasts:")
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of ITEM
-print("\nMain Effect of ITEM:")
-item_emmeans <- emmeans(mixed_anova_model, ~ ITEM)
-print("Estimated Marginal Means:")
-print(item_emmeans)
-print("\nPairwise Contrasts:")
-item_contrasts <- pairs(item_emmeans, adjust = "bonferroni")
-print(item_contrasts)
-
-
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-
-# TIME × ITEM Interaction
-print("\n=== TIME × ITEM INTERACTION ===")
-time_item_emmeans <- emmeans(mixed_anova_model, ~ TIME * ITEM)
-print("Estimated Marginal Means:")
-print(time_item_emmeans)
-
-print("\nSimple Effects of ITEM within each TIME:")
-time_item_simple <- pairs(time_item_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_item_simple)
-
-print("\nSimple Effects of TIME within each ITEM:")
-time_item_simple2 <- pairs(time_item_emmeans, by = "ITEM", adjust = "bonferroni")
-print(time_item_simple2)
-
-
-# =============================================================================
-# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS
-# =============================================================================
-
-
-# =============================================================================
-# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS
-# =============================================================================
-
-# Cohen's d calculations (library already loaded)
-
-print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===")
-
-# Function to calculate Cohen's d for pairwise comparisons
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if(nrow(significant_pairs) > 0) {
-    cat("Significant pairwise comparisons (p < 0.05):\n")
-    print(significant_pairs)
-
-    cat("\nCohen's d calculated from raw data:\n")
-
-    for(i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- 
as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each ITEM -time_item_simple2_df <- as.data.frame(time_item_simple2) -calculate_cohens_d_for_pairs(time_item_simple2_df, long_data_clean, "TIME", "ITEM", "MEAN_DIFFERENCE") - -# Get simple effects of ITEM within each TIME -time_item_simple_df <- as.data.frame(time_item_simple) -calculate_cohens_d_for_pairs(time_item_simple_df, long_data_clean, "ITEM", "TIME", "MEAN_DIFFERENCE") - - diff --git a/.history/eohi1/mixed anova - preferences_20250916120534.r b/.history/eohi1/mixed anova - preferences_20250916120534.r deleted file mode 100644 index 57d9e96..0000000 --- a/.history/eohi1/mixed anova - preferences_20250916120534.r +++ /dev/null @@ -1,765 +0,0 @@ -# Mixed ANOVA Analysis for Preference Items -# EOHI Experiment Data Analysis - Item Level Analysis -# Variables: NPastDiff_pref_read, NPastDiff_pref_music, NPastDiff_pref_tv, NPastDiff_pref_nap, NPastDiff_pref_travel -# NFutDiff_pref_read, NFutDiff_pref_music, NFutDiff_pref_tv, NFutDiff_pref_nap, NFutDiff_pref_travel - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need 
-required_vars <- c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required preference item variables found!") -} - -# Define item mapping -item_mapping <- data.frame( - variable = c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", "NPastDiff_pref_nap", "NPastDiff_pref_travel", - "NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", "NFutDiff_pref_nap", "NFutDiff_pref_travel"), - time = c(rep("Past", 5), rep("Future", 5)), - item = rep(c("read", "music", "tv", "nap", "travel"), 2), - stringsAsFactors = FALSE -) - -# Item mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(item_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'item' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - ITEM = factor(item, levels = c("read", "music", "tv", "nap", "travel")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and ITEM -desc_stats <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and ITEM:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and ITEM:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing 
missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and ITEM:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each ITEM -homogeneity_time <- long_data_clean %>% - group_by(ITEM) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each ITEM:") -print(homogeneity_time) - -# Test homogeneity across ITEM within each TIME -homogeneity_item <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across ITEM within each TIME:") -print(homogeneity_item) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × ITEM) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × ITEM combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = 
var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, ITEM, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × ITEM combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, ITEM, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + ITEM, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of ITEM levels:", length(levels(long_data_clean$ITEM)))) -print(paste("Number of TEMPORAL_DO levels:", 
length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 10, # 2 TIME × 5 ITEM = 10 - missing_combinations = 10 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × ITEM (5 levels: read, music, tv, nap, travel) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * ITEM + Error(pID/(TIME * ITEM)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = "MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "ITEM")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for ITEM (5 levels - within-subjects) -print("Mauchly's Test of Sphericity for ITEM:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_item <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - within = ITEM, - type = 3, - detailed = TRUE) - - print("ITEM Sphericity Test:") - print(ez_item$Mauchly) - -}, error = function(e) { - print(paste("Error in ITEM sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × ITEM interaction -print("\nMauchly's Test of Sphericity for TIME × ITEM Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - - print("TIME × ITEM Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × ITEM sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - 
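# ---------------------------------------------------------------------------
# Reference note for the corrected table below: the Greenhouse-Geisser
# epsilon can be computed directly from the k x k covariance matrix S of the
# within-subject measures (standard textbook estimator). rm_matrix stands in
# for a subjects-by-conditions matrix such as the rm_design built in the car
# section of this script.
gg_epsilon <- function(S) {
  k <- nrow(S)
  C <- diag(k) - 1 / k   # centering matrix I - J/k
  D <- C %*% S %*% C     # double-centered covariance matrix
  sum(diag(D))^2 / ((k - 1) * sum(D^2))
}
# e.g. gg_epsilon(cov(rm_matrix))
# ---------------------------------------------------------------------------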
-# Get corrected results from ezANOVA
-ez_corrected <- ezANOVA(data = long_data_clean,
-                        dv = MEAN_DIFFERENCE,
-                        wid = pID,
-                        within = .(TIME, ITEM),
-                        type = 3,
-                        detailed = TRUE)
-
-print("Corrected ANOVA Results with Sphericity Corrections:")
-print(ez_corrected$ANOVA)
-
-# Show Mauchly's tests for the within-subjects terms (epsilon values follow below)
-print("\nMauchly's Tests of Sphericity:")
-print(ez_corrected$Mauchly)
-
-# Show sphericity-corrected results
-print("\nSphericity-Corrected Results:")
-print("Available elements in ez_corrected object:")
-print(names(ez_corrected))
-
-# Check if sphericity corrections are available
-if(!is.null(ez_corrected$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(ez_corrected$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  # Get the sphericity corrections
-  sphericity_corr <- ez_corrected$`Sphericity Corrections`
-
-  # Extract original degrees of freedom from ANOVA table
-  anova_table <- ez_corrected$ANOVA
-
-  # Calculate corrected degrees of freedom
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  # Also show the corrected F-values and p-values with degrees of freedom
-  cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n")
-  for(i in 1:nrow(corrected_df)) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf(" Original: F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  print("\nNote: Sphericity corrections may not be displayed if sphericity is met")
-  print("Check the Mauchly's test p-values above to determine if corrections are needed")
-}
-
-# =============================================================================
-# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE
-# =============================================================================
-
-print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===")
-
-# Create a wide-format data for car package (library already loaded)
-
-tryCatch({
-  # Convert to wide format for car package
-  wide_data <- long_data_clean %>%
-    select(pID, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>%
-    pivot_wider(names_from = c(TIME, ITEM),
-                values_from = MEAN_DIFFERENCE,
-                names_sep = "_")
-
-  # Create the repeated measures design; column names follow
-  # pivot_wider(names_from = c(TIME, ITEM), names_sep = "_")
-  within_vars <- c("Past_read", "Past_music", "Past_tv", "Past_nap", "Past_travel",
-                   "Future_read", "Future_music", "Future_tv", "Future_nap", "Future_travel")
-
-  # Check if all columns exist
-  missing_cols <- within_vars[!within_vars %in% colnames(wide_data)]
-  if(length(missing_cols) > 0) {
-    print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", ")))
-  } else {
-    # Create the repeated measures design
-    rm_design <- as.matrix(wide_data[, within_vars])
-
-    # Calculate epsilon values
-    # NOTE: car does not export an epsilon() function; these calls assume a
-    # custom epsilon() helper is defined elsewhere, otherwise the error
-    # handler below reports the failure
-    print("Epsilon Values from car package:")
-    epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser")
-    epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt")
-
-    print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4)))
-    print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4)))
-
-    # Interpretation
-    if(epsilon_gg < 0.75) {
-      print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)")
-    } else if(epsilon_hf > 0.75) {
-      print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)")
-    } else {
-      print("Recommendation: Use Greenhouse-Geisser correction (conservative)")
-    }
-
-    # =============================================================================
-    # MANUAL SPHERICITY CORRECTIONS
-    # =============================================================================
-
-    print("\n=== MANUAL SPHERICITY CORRECTIONS ===")
-
-    # Apply corrections to the original ANOVA results
-    print("Applying Greenhouse-Geisser corrections to ITEM effects:")
-
-    # ITEM main effect (DFn = 4, DFd = 4244)
-    item_df_corrected_gg <- 4 * epsilon_gg
-    item_df_corrected_hf <- 4 * epsilon_hf
-
-    print(paste("ITEM: Original df = 4, 4244"))
-    print(paste("ITEM: GG corrected df =", round(item_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2)))
-    print(paste("ITEM: HF corrected df =", round(item_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2)))
-
-    # TIME × ITEM interaction (DFn = 4, DFd = 4244)
-    interaction_df_corrected_gg <- 4 * epsilon_gg
-    interaction_df_corrected_hf <- 4 * epsilon_hf
-
-    print(paste("TIME × ITEM: Original df = 4, 4244"))
-    print(paste("TIME × ITEM: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2)))
-    print(paste("TIME × ITEM: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2)))
-
-    # Note: You would need to recalculate p-values with these corrected dfs
-    print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom")
-    print("The ezANOVA function should handle this automatically, but may not display the corrections")
-  }
-
-}, error = function(e) {
-  print(paste("Error in manual epsilon calculation:", e$message))
-})
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-# Effect size calculations (library already loaded)
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Calculate generalized eta squared for the aov model
-print("Effect Sizes from aov() model:")
-tryCatch({
-  # Extract effect sizes from aov model
-  aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE)
-  # round() fails on the non-numeric columns of the effect-size table,
-  # so control the display through print() instead
-  print(aov_effects, digits = 5)
-}, error = function(e) {
-  print(paste("Error calculating effect sizes from aov:", e$message))
-})
-
-# Calculate effect sizes for ezANOVA model
-print("\nEffect Sizes from ezANOVA model:")
-tryCatch({
-  # ezANOVA already reports generalized eta squared in the ges column
-  ez_effects <- ez_corrected$ANOVA
-  ez_effects$ges <- round(ez_effects$ges, 5)
-  print("Generalized Eta Squared from ezANOVA:")
-  print(ez_effects[, c("Effect", "ges")])
-}, error = function(e) {
-  print(paste("Error extracting effect sizes from ezANOVA:", e$message))
-})
-
-# Extract effect sizes (generalized eta squared)
-# For aov() objects, we need to extract from the summary
-anova_summary <- summary(mixed_anova_model)
-
-# =============================================================================
-# NOTE: MIXED MODELS (LMER) NOT NEEDED
-# =============================================================================
-
-# For this balanced repeated measures design, Type III ANOVA with proper
-# sphericity corrections (implemented above) is the most appropriate approach.
-# Mixed models (lmer) are typically used for:
-# - Unbalanced designs
-# - Missing data patterns
-# - Nested random effects
-# - Large, complex datasets
-#
-# Your design is balanced and complete, making Type III ANOVA optimal.
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(mixed_anova_model, ~ TIME)
-print("Estimated Marginal Means:")
-print(time_emmeans)
-print("\nPairwise Contrasts:")
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of ITEM
-print("\nMain Effect of ITEM:")
-item_emmeans <- emmeans(mixed_anova_model, ~ ITEM)
-print("Estimated Marginal Means:")
-print(item_emmeans)
-print("\nPairwise Contrasts:")
-item_contrasts <- pairs(item_emmeans, adjust = "bonferroni")
-print(item_contrasts)
-
-
-# =============================================================================
-# INTERACTION EXPLORATIONS
-# =============================================================================
-
-
-# TIME × ITEM Interaction
-print("\n=== TIME × ITEM INTERACTION ===")
-time_item_emmeans <- emmeans(mixed_anova_model, ~ TIME * ITEM)
-print("Estimated Marginal Means:")
-print(time_item_emmeans)
-
-print("\nSimple Effects of ITEM within each TIME:")
-time_item_simple <- pairs(time_item_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_item_simple)
-
-print("\nSimple Effects of TIME within each ITEM:")
-time_item_simple2 <- pairs(time_item_emmeans, by = "ITEM", adjust = "bonferroni")
-print(time_item_simple2)
-
-
-# =============================================================================
-# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS
-# =============================================================================
-
-
-# =============================================================================
-# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS
-# =============================================================================
-
-# Cohen's d calculations (library already loaded)
-
-print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===")
-
-# Function to calculate Cohen's d for pairwise comparisons
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if(nrow(significant_pairs) > 0) {
-    cat("Significant pairwise comparisons (p < 0.05):\n")
-    print(significant_pairs)
-
-    cat("\nCohen's d calculated from raw data:\n")
-
-    for(i in 
seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each ITEM -time_item_simple2_df <- as.data.frame(time_item_simple2) -calculate_cohens_d_for_pairs(time_item_simple2_df, long_data_clean, "TIME", "ITEM", "MEAN_DIFFERENCE") - -# Get simple effects of ITEM within each TIME -time_item_simple_df <- as.data.frame(time_item_simple) -calculate_cohens_d_for_pairs(time_item_simple_df, long_data_clean, "ITEM", "TIME", "MEAN_DIFFERENCE") - - diff --git a/.history/eohi1/mixed anova - values_20250916125551.r b/.history/eohi1/mixed anova - values_20250916125551.r deleted file mode 100644 index e69de29..0000000 diff --git a/.history/eohi1/mixed anova - values_20250916125552.r b/.history/eohi1/mixed anova - values_20250916125552.r deleted file mode 100644 index 70d9919..0000000 --- a/.history/eohi1/mixed anova - values_20250916125552.r +++ /dev/null @@ -1,765 +0,0 @@ -# Mixed ANOVA Analysis for Personality Items -# EOHI Experiment Data Analysis - Item Level Analysis -# Variables: NPastDiff_pers_extravert, NPastDiff_pers_critical, NPastDiff_pers_dependable, NPastDiff_pers_anxious, NPastDiff_pers_complex -# NFutDiff_pers_extravert, NFutDiff_pers_critical, NFutDiff_pers_dependable, NFutDiff_pers_anxious, NFutDiff_pers_complex - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - 
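-# Editorial sketch: the item-level scripts in this folder differ only in their
-# variable stem ("pers" here) and item names, so the mapping below could be
-# built once and reused. make_item_mapping() is an invented helper shown for
-# illustration; it mirrors the NPastDiff_/NFutDiff_ naming used in this file.
-make_item_mapping <- function(stem, items) {
-  data.frame(
-    variable = c(paste0("NPastDiff_", stem, "_", items),
-                 paste0("NFutDiff_", stem, "_", items)),
-    time = rep(c("Past", "Future"), each = length(items)),
-    item = rep(items, 2),
-    stringsAsFactors = FALSE
-  )
-}
-# e.g. make_item_mapping("pers", c("extravert", "critical", "dependable", "anxious", "complex"))
-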
-# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPastDiff_pers_extravert", "NPastDiff_pers_critical", "NPastDiff_pers_dependable", "NPastDiff_pers_anxious", "NPastDiff_pers_complex", - "NFutDiff_pers_extravert", "NFutDiff_pers_critical", "NFutDiff_pers_dependable", "NFutDiff_pers_anxious", "NFutDiff_pers_complex") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required personality item variables found!") -} - -# Define item mapping -item_mapping <- data.frame( - variable = c("NPastDiff_pers_extravert", "NPastDiff_pers_critical", "NPastDiff_pers_dependable", "NPastDiff_pers_anxious", "NPastDiff_pers_complex", - "NFutDiff_pers_extravert", "NFutDiff_pers_critical", "NFutDiff_pers_dependable", "NFutDiff_pers_anxious", "NFutDiff_pers_complex"), - time = c(rep("Past", 5), rep("Future", 5)), - item = rep(c("extravert", "critical", "dependable", "anxious", "complex"), 2), - stringsAsFactors = FALSE -) - -# Item mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(item_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'item' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - ITEM = factor(item, levels = c("extravert", "critical", "dependable", "anxious", "complex")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and ITEM -desc_stats <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and ITEM:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics 
by TEMPORAL_DO, TIME, and ITEM:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and ITEM:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each ITEM -homogeneity_time <- long_data_clean %>% - group_by(ITEM) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each ITEM:") -print(homogeneity_time) - -# Test homogeneity across ITEM within each TIME -homogeneity_item <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across ITEM within each TIME:") -print(homogeneity_item) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × ITEM) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × ITEM combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, ITEM, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- 
sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × ITEM combination
-print("\n=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123) # For reproducibility
-
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, ITEM) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, ITEM, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check for missing data patterns
-table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM, useNA = "ifany")
-
-# Check data balance
-xtabs(~ pID + TIME + ITEM, data = long_data_clean)
-
-# Check data dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$pID))))
-print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME))))
-print(paste("Number of ITEM levels:", length(levels(long_data_clean$ITEM))))
-print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO))))
-
-# Check for complete cases
-complete_cases <- long_data_clean[complete.cases(long_data_clean), ]
-print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean)))
-
-# Check if design is balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM)
-
-print(summary(as.vector(design_balance)))
-
-# Check for any participants with missing combinations
-missing_combos <- long_data_clean %>%
-  group_by(pID) %>%
-  summarise(
-    n_combinations = n(),
-    expected_combinations = 10, # 2 TIME × 5 ITEM = 10
-    missing_combinations = 10 - n_combinations,
-    .groups = 'drop'
-  )
-
-print("Missing combinations per participant:")
-print(missing_combos[missing_combos$missing_combinations > 0, ])
-
-# Mixed ANOVA using aov() - Traditional approach
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT)
-# Within-subjects: TIME (2 levels: Past, Future) × ITEM (5 levels: extravert, critical, dependable, anxious, complex)
-
-mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * ITEM + Error(pID/(TIME * ITEM)),
-                         data = long_data_clean)
-
-print("Mixed ANOVA Results (aov):")
-print(summary(mixed_anova_model))
-
-# Alternative: Using afex::aov_ez for cleaner output (optional)
-print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===")
-mixed_anova_afex <- aov_ez(id = "pID",
-                           dv = "MEAN_DIFFERENCE",
-                           data = long_data_clean,
-                           between = "TEMPORAL_DO",
- within = c("TIME", "ITEM")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for ITEM (5 levels - within-subjects) -print("Mauchly's Test of Sphericity for ITEM:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_item <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - within = ITEM, - type = 3, - detailed = TRUE) - - print("ITEM Sphericity Test:") - print(ez_item$Mauchly) - -}, error = function(e) { - print(paste("Error in ITEM sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × ITEM interaction -print("\nMauchly's Test of Sphericity for TIME × ITEM Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - - print("TIME × ITEM Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × ITEM sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - # Get the sphericity corrections - sphericity_corr <- ez_corrected$`Sphericity Corrections` - - # Extract original degrees of freedom from ANOVA table - anova_table <- ez_corrected$ANOVA - - # Calculate corrected degrees of freedom - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * 
sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  # Also show the corrected F-values and p-values with degrees of freedom
-  cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n")
-  for(i in 1:nrow(corrected_df)) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original: F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  print("\nNote: Sphericity corrections may not be displayed if sphericity is met")
-  print("Check the Mauchly's test p-values above to determine if corrections are needed")
-}
-
-# =============================================================================
-# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE
-# =============================================================================
-
-print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===")
-
-# Create a wide-format data for car package (library already loaded)
-
-tryCatch({
-  # Convert to wide format for car package
-  wide_data <- long_data_clean %>%
-    select(pID, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>%
-    pivot_wider(names_from = c(TIME, ITEM),
-                values_from = MEAN_DIFFERENCE,
-                names_sep = "_")
-
-  # Create the repeated measures design; the column names are the TIME_ITEM
-  # combinations produced by pivot_wider() above for the personality items
-  within_vars <- c("Past_extravert", "Past_critical", "Past_dependable", "Past_anxious", "Past_complex",
-                   "Future_extravert", "Future_critical", "Future_dependable", "Future_anxious", "Future_complex")
-
-  # Check if all columns exist
-  missing_cols <- within_vars[!within_vars %in% colnames(wide_data)]
-  if(length(missing_cols) > 0) {
-    print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", ")))
-  } else {
-    # Create the repeated measures design
-    rm_design <- as.matrix(wide_data[, within_vars])
-
-    # Calculate epsilon values
-    print("Epsilon Values from car package:")
-    epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser")
-    epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt")
-
-    print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4)))
-    print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4)))
-
-    # Interpretation
-    if(epsilon_gg < 0.75) {
-      print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)")
-    } else if(epsilon_hf > 0.75) {
-      print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)")
-    } else {
-      print("Recommendation: Use Greenhouse-Geisser correction (conservative)")
-    }
-
-    # =========================================================================
-    # MANUAL SPHERICITY CORRECTIONS
-    # =========================================================================
-
-    print("\n=== MANUAL SPHERICITY CORRECTIONS ===")
-
-    # Apply corrections to the original ANOVA results
-    print("Applying Greenhouse-Geisser corrections to ITEM effects:")
-
-    # ITEM main effect (DFn = 4, DFd = 4244)
-    item_df_corrected_gg <- 4 * epsilon_gg
-    item_df_corrected_hf <- 4 * epsilon_hf
-
-    print("ITEM: Original df = 4, 4244")
-    print(paste("ITEM: GG corrected df =", round(item_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2)))
-    print(paste("ITEM: HF corrected df =", round(item_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2)))
-
-    # TIME × ITEM interaction (DFn = 4, DFd = 4244)
-    interaction_df_corrected_gg <- 4 * epsilon_gg
-    interaction_df_corrected_hf <- 4 * epsilon_hf
-
-    print("TIME × ITEM: Original df = 4, 4244")
-    print(paste("TIME × ITEM: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2)))
-    print(paste("TIME × ITEM: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2)))
-
-    # Note: corrected p-values require re-evaluating each F statistic against
-    # these epsilon-scaled degrees of freedom
-    print("\nNote: To get corrected p-values, recalculate the F-tests with the corrected degrees of freedom")
-    print("ezANOVA handles this automatically but may not display the corrections")
-  }
-
-}, error = function(e) {
-  print(paste("Error in manual epsilon calculation:", e$message))
-})
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-# Effect size calculations (library already loaded)
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Calculate generalized eta squared for the aov model
-print("Effect Sizes from aov() model:")
-tryCatch({
-  # Extract effect sizes from aov model
-  aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE)
-  print(round(aov_effects, 5))
-}, error = function(e) {
-  print(paste("Error calculating effect sizes from aov:", e$message))
-})
-
-# Calculate effect sizes for ezANOVA model
-print("\nEffect Sizes from ezANOVA model:")
-tryCatch({
-  # ezANOVA already reports generalized eta squared in its ges column,
-  # so no conversion from partial eta squared is needed
-  ez_effects <- ez_corrected$ANOVA
-  # Round only the numeric column; round() on the whole table would fail on Effect
-  ez_effects$ges <- round(ez_effects$ges, 5)
-  print("Generalized Eta Squared from ezANOVA:")
-  print(ez_effects[, c("Effect", "ges")])
-}, error = function(e) {
-  print(paste("Error extracting effect sizes from ezANOVA:", e$message))
-})
-
-# Extract effect sizes (generalized eta squared)
-# For aov() objects, we need to extract from the summary
-anova_summary <- summary(mixed_anova_model)
-
-# =============================================================================
-# NOTE: MIXED MODELS (LMER) NOT NEEDED
-# =============================================================================
-
-# For this balanced repeated measures design, Type III ANOVA with proper
-# sphericity corrections (implemented above) is the most appropriate approach.
-# Mixed models (lmer) are typically used for:
-# - Unbalanced designs
-# - Missing data patterns
-# - Nested random effects
-# - Large, complex datasets
-#
-# Your design is balanced and complete, making Type III ANOVA optimal.
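-
-# For reference only (editorial sketch, kept commented out): if the design were
-# unbalanced or had missing cells, the mixed-model counterpart of the aov() call
-# above would be a random-intercept model. Assumes lme4 is installed; not run here.
-# library(lme4)
-# lmer_model <- lmer(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * ITEM + (1 | pID),
-#                    data = long_data_clean)
-# anova(lmer_model)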
- -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of ITEM -print("\nMain Effect of ITEM:") -item_emmeans <- emmeans(mixed_anova_model, ~ ITEM) -print("Estimated Marginal Means:") -print(item_emmeans) -print("\nPairwise Contrasts:") -item_contrasts <- pairs(item_emmeans, adjust = "bonferroni") -print(item_contrasts) - - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - - -# TIME × ITEM Interaction -print("\n=== TIME × ITEM INTERACTION ===") -time_item_emmeans <- emmeans(mixed_anova_model, ~ TIME * ITEM) -print("Estimated Marginal Means:") -print(time_item_emmeans) - -print("\nSimple Effects of ITEM within each TIME:") -time_item_simple <- pairs(time_item_emmeans, by = "TIME", adjust = "bonferroni") -print(time_item_simple) - -print("\nSimple Effects of TIME within each ITEM:") -time_item_simple2 <- pairs(time_item_emmeans, by = "ITEM", adjust = "bonferroni") -print(time_item_simple2) - - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) 
{ - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each ITEM -time_item_simple2_df <- as.data.frame(time_item_simple2) -calculate_cohens_d_for_pairs(time_item_simple2_df, long_data_clean, "TIME", "ITEM", "MEAN_DIFFERENCE") - -# Get simple effects of ITEM within each TIME -time_item_simple_df <- as.data.frame(time_item_simple) -calculate_cohens_d_for_pairs(time_item_simple_df, long_data_clean, "ITEM", "TIME", "MEAN_DIFFERENCE") - - diff --git a/.history/eohi1/mixed anova - values_20250916125857.r b/.history/eohi1/mixed anova - values_20250916125857.r deleted file mode 100644 index 84de04b..0000000 --- a/.history/eohi1/mixed anova - values_20250916125857.r +++ /dev/null @@ -1,765 +0,0 @@ -# Mixed ANOVA Analysis for Values Items -# EOHI Experiment Data Analysis - Item Level Analysis -# Variables: NPastDiff_val_obey, NPastDiff_val_trad, NPastDiff_val_opinion, NPastDiff_val_performance, NPastDiff_val_justice -# NFutDiff_val_obey, NFutDiff_val_trad, NFutDiff_val_opinion, NFutDiff_val_performance, NFutDiff_val_justice - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPastDiff_val_obey", "NPastDiff_val_trad", "NPastDiff_val_opinion", "NPastDiff_val_performance", "NPastDiff_val_justice", - "NFutDiff_val_obey", "NFutDiff_val_trad", "NFutDiff_val_opinion", "NFutDiff_val_performance", "NFutDiff_val_justice") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required values item variables found!") -} - -# Define item mapping -item_mapping <- data.frame( - variable = c("NPastDiff_val_obey", "NPastDiff_val_trad", "NPastDiff_val_opinion", "NPastDiff_val_performance", "NPastDiff_val_justice", - "NFutDiff_val_obey", "NFutDiff_val_trad", "NFutDiff_val_opinion", "NFutDiff_val_performance", "NFutDiff_val_justice"), - time = c(rep("Past", 5), rep("Future", 5)), - item = rep(c("obey", "trad", "opinion", "performance", "justice"), 2), - 
stringsAsFactors = FALSE -) - -# Item mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(item_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'item' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - ITEM = factor(item, levels = c("obey", "trad", "opinion", "performance", "justice")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and ITEM -desc_stats <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and ITEM:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and ITEM:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and ITEM:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each ITEM -homogeneity_time <- long_data_clean %>% - group_by(ITEM) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each ITEM:") -print(homogeneity_time) - -# Test homogeneity across ITEM within each TIME -homogeneity_item <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across ITEM within each TIME:") -print(homogeneity_item) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × ITEM) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × ITEM combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, ITEM, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate 
observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × ITEM combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, ITEM, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + ITEM, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of ITEM levels:", length(levels(long_data_clean$ITEM)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 10, # 2 TIME × 5 ITEM = 10 - missing_combinations = 10 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME 
(2 levels: Past, Future) × ITEM (5 levels: obey, trad, opinion, performance, justice)
-
-mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * ITEM + Error(pID/(TIME * ITEM)),
-                         data = long_data_clean)
-
-print("Mixed ANOVA Results (aov):")
-print(summary(mixed_anova_model))
-
-# Alternative: Using afex::aov_ez for cleaner output (optional)
-print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===")
-mixed_anova_afex <- aov_ez(id = "pID",
-                           dv = "MEAN_DIFFERENCE",
-                           data = long_data_clean,
-                           between = "TEMPORAL_DO",
-                           within = c("TIME", "ITEM"))
-
-print("Mixed ANOVA Results (afex):")
-print(mixed_anova_afex)
-
-# =============================================================================
-# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS
-# =============================================================================
-
-# Sphericity tests using ezANOVA (library already loaded)
-
-print("\n=== SPHERICITY TESTS ===")
-
-# Test sphericity for ITEM (5 levels - within-subjects)
-print("Mauchly's Test of Sphericity for ITEM:")
-tryCatch({
-  # Create a temporary data frame for ezANOVA
-  temp_data <- long_data_clean
-  temp_data$id <- as.numeric(as.factor(temp_data$pID))
-
-  # Run ezANOVA to get sphericity tests
-  ez_item <- ezANOVA(data = temp_data,
-                     dv = MEAN_DIFFERENCE,
-                     wid = id,
-                     within = ITEM,
-                     type = 3,
-                     detailed = TRUE)
-
-  print("ITEM Sphericity Test:")
-  print(ez_item$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in ITEM sphericity test:", e$message))
-})
-
-# Test sphericity for TIME (2 levels - within-subjects; with only two levels
-# sphericity is trivially satisfied, so no Mauchly table is typically produced)
-print("\nMauchly's Test of Sphericity for TIME:")
-tryCatch({
-  ez_time <- ezANOVA(data = temp_data,
-                     dv = MEAN_DIFFERENCE,
-                     wid = id,
-                     within = TIME,
-                     type = 3,
-                     detailed = TRUE)
-
-  print("TIME Sphericity Test:")
-  print(ez_time$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in TIME sphericity test:", e$message))
-})
-
-# Test sphericity for TIME × ITEM interaction
-print("\nMauchly's Test of Sphericity for TIME × ITEM Interaction:")
-tryCatch({
-  ez_interaction <- ezANOVA(data = temp_data,
-                            dv = MEAN_DIFFERENCE,
-                            wid = id,
-                            within = .(TIME, ITEM),
-                            type = 3,
-                            detailed = TRUE)
-
-  print("TIME × ITEM Sphericity Test:")
-  print(ez_interaction$Mauchly)
-
-}, error = function(e) {
-  print(paste("Error in TIME × ITEM sphericity test:", e$message))
-})
-
-# =============================================================================
-# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Get corrected results from ezANOVA
-ez_corrected <- ezANOVA(data = long_data_clean,
-                        dv = MEAN_DIFFERENCE,
-                        wid = pID,
-                        within = .(TIME, ITEM),
-                        type = 3,
-                        detailed = TRUE)
-
-print("Corrected ANOVA Results with Sphericity Corrections:")
-print(ez_corrected$ANOVA)
-
-# Show epsilon values for sphericity corrections
-print("\nEpsilon Values for Sphericity Corrections:")
-print(ez_corrected$Mauchly)
-
-# Show sphericity-corrected results
-print("\nSphericity-Corrected Results:")
-print("Available elements in ez_corrected object:")
-print(names(ez_corrected))
-
-# Check if sphericity corrections are available
-if(!is.null(ez_corrected$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(ez_corrected$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n")
-
-  # Get the sphericity corrections
-  sphericity_corr <- ez_corrected$`Sphericity Corrections`
-
-  # Extract original degrees of freedom from ANOVA table
-  anova_table <- ez_corrected$ANOVA
-
-  # Calculate corrected degrees of freedom
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  # Also show the corrected F-values and p-values with degrees of freedom
-  cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n")
-  for(i in 1:nrow(corrected_df)) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    cat(sprintf("\n%s:\n", effect))
-    cat(sprintf("  Original: F(%d, %d) = %.3f\n",
-                corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    cat(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    cat(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n",
-                corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  print("\nNote: Sphericity corrections may not be displayed if sphericity is met")
-  print("Check the Mauchly's test p-values above to determine if corrections are needed")
-}
-
-# =============================================================================
-# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE
-# =============================================================================
-
-print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===")
-
-# Create a wide-format data for car package (library already loaded)
-
-tryCatch({
-  # Convert to wide format for car package
-  wide_data <- long_data_clean %>%
-    select(pID, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>%
-    pivot_wider(names_from = c(TIME, ITEM),
-                values_from = MEAN_DIFFERENCE,
-                names_sep = "_")
-
-  # Create the repeated measures design; the column names are the TIME_ITEM
-  # combinations produced by pivot_wider() above for the values items
-  within_vars <- c("Past_obey", "Past_trad", "Past_opinion", "Past_performance", "Past_justice",
-                   "Future_obey", "Future_trad", "Future_opinion", "Future_performance", "Future_justice")
-
-  # Check if all columns exist
-  missing_cols <- within_vars[!within_vars %in% colnames(wide_data)]
-  if(length(missing_cols) > 0) {
-    print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", ")))
-  } else {
-    # Create the repeated measures design
-    rm_design <- as.matrix(wide_data[, within_vars])
-
-    # Calculate epsilon values
-    print("Epsilon Values from car package:")
-    epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser")
-    epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt")
-
-    print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4)))
-    print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4)))
-
-    # Interpretation
-    if(epsilon_gg < 0.75) {
-      print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)")
-    } else if(epsilon_hf > 0.75) {
-      print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)")
else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to ITEM effects:") - - # ITEM main effect (DFn = 4, DFd = 4244) - item_df_corrected_gg <- 4 * epsilon_gg - item_df_corrected_hf <- 4 * epsilon_hf - - print(paste("ITEM: Original df = 4, 4244")) - print(paste("ITEM: GG corrected df =", round(item_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2))) - print(paste("ITEM: HF corrected df =", round(item_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2))) - - # TIME × ITEM interaction (DFn = 4, DFd = 4244) - interaction_df_corrected_gg <- 4 * epsilon_gg - interaction_df_corrected_hf <- 4 * epsilon_hf - - print(paste("TIME × ITEM: Original df = 4, 4244")) - print(paste("TIME × ITEM: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2))) - print(paste("TIME × ITEM: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. 
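# The manual-corrections section above notes that corrected p-values require
# re-evaluating the F distribution with epsilon-scaled degrees of freedom.
# A minimal sketch of that step, assuming a GG epsilon and an uncorrected
# F-test are already in hand (f_obs, dfn, dfd, and eps are hypothetical
# placeholders, not values from this analysis):
f_obs <- 3.21   # hypothetical uncorrected F statistic
dfn   <- 4      # hypothetical numerator df (e.g., a 5-level within factor)
dfd   <- 4244   # hypothetical denominator df
eps   <- 0.62   # hypothetical Greenhouse-Geisser epsilon
p_gg  <- pf(f_obs, dfn * eps, dfd * eps, lower.tail = FALSE)
print(p_gg)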
-# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of ITEM -print("\nMain Effect of ITEM:") -item_emmeans <- emmeans(mixed_anova_model, ~ ITEM) -print("Estimated Marginal Means:") -print(item_emmeans) -print("\nPairwise Contrasts:") -item_contrasts <- pairs(item_emmeans, adjust = "bonferroni") -print(item_contrasts) - - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - - -# TIME × ITEM Interaction -print("\n=== TIME × ITEM INTERACTION ===") -time_item_emmeans <- emmeans(mixed_anova_model, ~ TIME * ITEM) -print("Estimated Marginal Means:") -print(time_item_emmeans) - -print("\nSimple Effects of ITEM within each TIME:") -time_item_simple <- pairs(time_item_emmeans, by = "TIME", adjust = "bonferroni") -print(time_item_simple) - -print("\nSimple Effects of TIME within each ITEM:") -time_item_simple2 <- pairs(time_item_emmeans, by = "ITEM", adjust = "bonferroni") -print(time_item_simple2) - - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - 
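# data1 and data2 now hold the raw scores for the two cells named in the
# contrast; the guard below skips empty cells, and effsize::cohen.d() then
# returns the standardized mean difference ($estimate) together with a
# magnitude label ($magnitude: negligible/small/medium/large).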
- if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each ITEM -time_item_simple2_df <- as.data.frame(time_item_simple2) -calculate_cohens_d_for_pairs(time_item_simple2_df, long_data_clean, "TIME", "ITEM", "MEAN_DIFFERENCE") - -# Get simple effects of ITEM within each TIME -time_item_simple_df <- as.data.frame(time_item_simple) -calculate_cohens_d_for_pairs(time_item_simple_df, long_data_clean, "ITEM", "TIME", "MEAN_DIFFERENCE") - - diff --git a/.history/eohi1/mixed anova - values_20250916125907.r b/.history/eohi1/mixed anova - values_20250916125907.r deleted file mode 100644 index 84de04b..0000000 --- a/.history/eohi1/mixed anova - values_20250916125907.r +++ /dev/null @@ -1,765 +0,0 @@ -# Mixed ANOVA Analysis for Values Items -# EOHI Experiment Data Analysis - Item Level Analysis -# Variables: NPastDiff_val_obey, NPastDiff_val_trad, NPastDiff_val_opinion, NPastDiff_val_performance, NPastDiff_val_justice -# NFutDiff_val_obey, NFutDiff_val_trad, NFutDiff_val_opinion, NFutDiff_val_performance, NFutDiff_val_justice - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPastDiff_val_obey", "NPastDiff_val_trad", "NPastDiff_val_opinion", "NPastDiff_val_performance", "NPastDiff_val_justice", - "NFutDiff_val_obey", "NFutDiff_val_trad", "NFutDiff_val_opinion", "NFutDiff_val_performance", "NFutDiff_val_justice") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required values item variables found!") -} - -# Define item mapping -item_mapping <- data.frame( - variable = c("NPastDiff_val_obey", "NPastDiff_val_trad", "NPastDiff_val_opinion", "NPastDiff_val_performance", "NPastDiff_val_justice", - "NFutDiff_val_obey", 
"NFutDiff_val_trad", "NFutDiff_val_opinion", "NFutDiff_val_performance", "NFutDiff_val_justice"), - time = c(rep("Past", 5), rep("Future", 5)), - item = rep(c("obey", "trad", "opinion", "performance", "justice"), 2), - stringsAsFactors = FALSE -) - -# Item mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(item_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'item' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - ITEM = factor(item, levels = c("obey", "trad", "opinion", "performance", "justice")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and ITEM -desc_stats <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and ITEM:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and ITEM:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and ITEM:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each ITEM -homogeneity_time <- long_data_clean %>% - group_by(ITEM) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each ITEM:") -print(homogeneity_time) - -# Test homogeneity across ITEM within each TIME -homogeneity_item <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across ITEM within each TIME:") -print(homogeneity_item) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × ITEM) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × ITEM combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, ITEM, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, 
response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × ITEM combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, ITEM, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + ITEM, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of ITEM levels:", length(levels(long_data_clean$ITEM)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM) - -print(summary(as.vector(design_balance))) - -# Check for 
any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 10, # 2 TIME × 5 ITEM = 10 - missing_combinations = 10 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × ITEM (5 levels: obey, trad, opinion, performance, justice) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * ITEM + Error(pID/(TIME * ITEM)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = "MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "ITEM")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for ITEM (5 levels - within-subjects) -print("Mauchly's Test of Sphericity for ITEM:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_item <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - within = ITEM, - type = 3, - detailed = TRUE) - - print("ITEM Sphericity Test:") - print(ez_item$Mauchly) - -}, error = function(e) { - print(paste("Error in ITEM sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × ITEM interaction -print("\nMauchly's Test of Sphericity for TIME × ITEM Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - - print("TIME × ITEM Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × ITEM sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - # Get the sphericity corrections - sphericity_corr <- ez_corrected$`Sphericity Corrections` - - # Extract original degrees of freedom from ANOVA table - anova_table <- ez_corrected$ANOVA - - # Calculate corrected degrees of freedom - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Also show the corrected F-values and p-values with degrees of freedom - cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n") - for(i in 1:nrow(corrected_df)) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create wide-format data for the car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, ITEM), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_obey", "Past_trad", "Past_opinion", "Past_performance", "Past_justice", - "Future_obey", "Future_trad", "Future_opinion", "Future_performance", "Future_justice") - - # Check if all columns exist - missing_cols <- within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - 
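# The within_vars names above follow from pivot_wider()'s naming rule:
# names_from = c(TIME, ITEM) with names_sep = "_" yields TIME_ITEM columns.
# A tiny stand-alone illustration (toy data, two hypothetical items):
toy <- data.frame(
  pID  = rep(1:2, each = 4),
  TIME = rep(c("Past", "Future"), times = 4),
  ITEM = rep(c("obey", "trad"), each = 2, times = 2),
  MEAN_DIFFERENCE = rnorm(8)
)
tidyr::pivot_wider(toy, names_from = c(TIME, ITEM),
                   values_from = MEAN_DIFFERENCE, names_sep = "_")
# -> columns: pID, Past_obey, Future_obey, Past_trad, Future_trad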
print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to ITEM effects:") - - # ITEM main effect (DFn = 4, DFd = 4244) - item_df_corrected_gg <- 4 * epsilon_gg - item_df_corrected_hf <- 4 * epsilon_hf - - print(paste("ITEM: Original df = 4, 4244")) - print(paste("ITEM: GG corrected df =", round(item_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2))) - print(paste("ITEM: HF corrected df =", round(item_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2))) - - # TIME × ITEM interaction (DFn = 4, DFd = 4244) - interaction_df_corrected_gg <- 4 * epsilon_gg - interaction_df_corrected_hf <- 4 * epsilon_hf - - print(paste("TIME × ITEM: Original df = 4, 4244")) - print(paste("TIME × ITEM: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2))) - print(paste("TIME × ITEM: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- 
summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. -# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of ITEM -print("\nMain Effect of ITEM:") -item_emmeans <- emmeans(mixed_anova_model, ~ ITEM) -print("Estimated Marginal Means:") -print(item_emmeans) -print("\nPairwise Contrasts:") -item_contrasts <- pairs(item_emmeans, adjust = "bonferroni") -print(item_contrasts) - - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - - -# TIME × ITEM Interaction -print("\n=== TIME × ITEM INTERACTION ===") -time_item_emmeans <- emmeans(mixed_anova_model, ~ TIME * ITEM) -print("Estimated Marginal Means:") -print(time_item_emmeans) - -print("\nSimple Effects of ITEM within each TIME:") -time_item_simple <- pairs(time_item_emmeans, by = "TIME", adjust = "bonferroni") -print(time_item_simple) - -print("\nSimple Effects of TIME within each ITEM:") -time_item_simple2 <- pairs(time_item_emmeans, by = "ITEM", adjust = "bonferroni") -print(time_item_simple2) - - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- 
as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each ITEM -time_item_simple2_df <- as.data.frame(time_item_simple2) -calculate_cohens_d_for_pairs(time_item_simple2_df, long_data_clean, "TIME", "ITEM", "MEAN_DIFFERENCE") - -# Get simple effects of ITEM within each TIME -time_item_simple_df <- as.data.frame(time_item_simple) -calculate_cohens_d_for_pairs(time_item_simple_df, long_data_clean, "ITEM", "TIME", "MEAN_DIFFERENCE") - - diff --git a/.history/eohi1/mixed anova - values_20250916125940.r b/.history/eohi1/mixed anova - values_20250916125940.r deleted file mode 100644 index 84de04b..0000000 --- a/.history/eohi1/mixed anova - values_20250916125940.r +++ /dev/null @@ -1,765 +0,0 @@ -# Mixed ANOVA Analysis for Values Items -# EOHI Experiment Data Analysis - Item Level Analysis -# Variables: NPastDiff_val_obey, NPastDiff_val_trad, NPastDiff_val_opinion, NPastDiff_val_performance, NPastDiff_val_justice -# NFutDiff_val_obey, NFutDiff_val_trad, NFutDiff_val_opinion, NFutDiff_val_performance, NFutDiff_val_justice - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -# Read the data -data <- read.csv("exp1.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("NPastDiff_val_obey", "NPastDiff_val_trad", "NPastDiff_val_opinion", "NPastDiff_val_performance", "NPastDiff_val_justice", - "NFutDiff_val_obey", "NFutDiff_val_trad", "NFutDiff_val_opinion", "NFutDiff_val_performance", "NFutDiff_val_justice") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if 
(length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required values item variables found!") -} - -# Define item mapping -item_mapping <- data.frame( - variable = c("NPastDiff_val_obey", "NPastDiff_val_trad", "NPastDiff_val_opinion", "NPastDiff_val_performance", "NPastDiff_val_justice", - "NFutDiff_val_obey", "NFutDiff_val_trad", "NFutDiff_val_opinion", "NFutDiff_val_performance", "NFutDiff_val_justice"), - time = c(rep("Past", 5), rep("Future", 5)), - item = rep(c("obey", "trad", "opinion", "performance", "justice"), 2), - stringsAsFactors = FALSE -) - -# Item mapping created - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, TEMPORAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(item_mapping, by = "variable") %>% - # Convert to factors with proper levels (note: columns are 'time' and 'item' from mapping) - mutate( - TIME = factor(time, levels = c("Past", "Future")), - ITEM = factor(item, levels = c("obey", "trad", "opinion", "performance", "justice")), - pID = as.factor(pID), - TEMPORAL_DO = as.factor(TEMPORAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and ITEM -desc_stats <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and ITEM:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_temporal <- long_data %>% - group_by(TEMPORAL_DO, TIME, ITEM) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and ITEM:") -print(desc_stats_by_temporal) - - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. 
Missing values check -missing_summary <- long_data %>% - group_by(TIME, ITEM) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME and ITEM:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test (streamlined) -normality_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each ITEM -homogeneity_time <- long_data_clean %>% - group_by(ITEM) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each ITEM:") -print(homogeneity_time) - -# Test homogeneity across ITEM within each TIME -homogeneity_item <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ ITEM)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across ITEM within each TIME:") -print(homogeneity_item) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# ============================================================================= -# CALCULATE OBSERVED F-MAX RATIOS FOR MIXED ANOVA -# ============================================================================= - -# For mixed ANOVA: Test homogeneity across BETWEEN-SUBJECTS factor (TEMPORAL_DO) -# within each combination of within-subjects factors (TIME × ITEM) - -# First, let's check what values TEMPORAL_DO actually has -print("=== CHECKING TEMPORAL_DO VALUES ===") -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("TEMPORAL_DO value counts:") -print(table(long_data_clean$TEMPORAL_DO)) - -print("\n=== OBSERVED F-MAX RATIOS: TEMPORAL_DO within each TIME × ITEM combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this TIME × DOMAIN combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - 
f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, ITEM, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!sym(group_var)) %>% - dplyr::summarise(var = var(!!sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × ITEM combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, ITEM) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(TEMPORAL_DO, MEAN_DIFFERENCE), "TEMPORAL_DO", "MEAN_DIFFERENCE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, ITEM, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check for missing data patterns -table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM, useNA = "ifany") - -# Check data balance -xtabs(~ pID + TIME + ITEM, data = long_data_clean) - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Number of TIME levels:", length(levels(long_data_clean$TIME)))) -print(paste("Number of ITEM levels:", length(levels(long_data_clean$ITEM)))) -print(paste("Number of TEMPORAL_DO levels:", length(levels(long_data_clean$TEMPORAL_DO)))) - -# Check for complete cases -complete_cases <- 
long_data_clean[complete.cases(long_data_clean), ] -print(paste("Complete cases:", nrow(complete_cases), "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$ITEM) - -print(summary(as.vector(design_balance))) - -# Check for any participants with missing combinations -missing_combos <- long_data_clean %>% - group_by(pID) %>% - summarise( - n_combinations = n(), - expected_combinations = 10, # 2 TIME × 5 ITEM = 10 - missing_combinations = 10 - n_combinations, - .groups = 'drop' - ) - -print("Missing combinations per participant:") -print(missing_combos[missing_combos$missing_combinations > 0, ]) - -# Mixed ANOVA using aov() - Traditional approach -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) -# Within-subjects: TIME (2 levels: Past, Future) × ITEM (5 levels: read, music, tv, nap, travel) - -mixed_anova_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * TIME * ITEM + Error(pID/(TIME * ITEM)), - data = long_data_clean) - -print("Mixed ANOVA Results (aov):") -print(summary(mixed_anova_model)) - -# Alternative: Using afex::aov_ez for cleaner output (optional) -print("\n=== ALTERNATIVE: AFEX AOV_EZ RESULTS ===") -mixed_anova_afex <- aov_ez(id = "pID", - dv = "MEAN_DIFFERENCE", - data = long_data_clean, - between = "TEMPORAL_DO", - within = c("TIME", "ITEM")) - -print("Mixed ANOVA Results (afex):") -print(mixed_anova_afex) - -# ============================================================================= -# SPHERICITY TESTS FOR WITHIN-SUBJECTS FACTORS -# ============================================================================= - -# Sphericity tests using ezANOVA (library already loaded) - -print("\n=== SPHERICITY TESTS ===") - -# Test sphericity for ITEM (5 levels - within-subjects) -print("Mauchly's Test of Sphericity for ITEM:") -tryCatch({ - # Create a temporary data frame for ezANOVA - temp_data <- long_data_clean - temp_data$id <- as.numeric(as.factor(temp_data$pID)) - - # Run ezANOVA to get sphericity tests - ez_item <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - within = ITEM, - type = 3, - detailed = TRUE) - - print("ITEM Sphericity Test:") - print(ez_item$Mauchly) - -}, error = function(e) { - print(paste("Error in ITEM sphericity test:", e$message)) -}) - -# Test sphericity for TIME (2 levels - within-subjects) -print("\nMauchly's Test of Sphericity for TIME:") -tryCatch({ - ez_time <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - within = TIME, - type = 3, - detailed = TRUE) - - print("TIME Sphericity Test:") - print(ez_time$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME sphericity test:", e$message)) -}) - -# Test sphericity for TIME × ITEM interaction -print("\nMauchly's Test of Sphericity for TIME × ITEM Interaction:") -tryCatch({ - ez_interaction <- ezANOVA(data = temp_data, - dv = MEAN_DIFFERENCE, - wid = id, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - - print("TIME × ITEM Sphericity Test:") - print(ez_interaction$Mauchly) - -}, error = function(e) { - print(paste("Error in TIME × ITEM sphericity test:", e$message)) -}) - -# ============================================================================= -# CORRECTED ANOVA RESULTS WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== CORRECTED ANOVA RESULTS (with sphericity corrections) ===") - -# Get corrected results from ezANOVA -ez_corrected <- ezANOVA(data = long_data_clean, - dv = 
MEAN_DIFFERENCE, - wid = pID, - within = .(TIME, ITEM), - type = 3, - detailed = TRUE) - -print("Corrected ANOVA Results with Sphericity Corrections:") -print(ez_corrected$ANOVA) - -# Show epsilon values for sphericity corrections -print("\nEpsilon Values for Sphericity Corrections:") -print(ez_corrected$Mauchly) - -# Show sphericity-corrected results -print("\nSphericity-Corrected Results:") -print("Available elements in ez_corrected object:") -print(names(ez_corrected)) - -# Check if sphericity corrections are available -if(!is.null(ez_corrected$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(ez_corrected$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - # Get the sphericity corrections - sphericity_corr <- ez_corrected$`Sphericity Corrections` - - # Extract original degrees of freedom from ANOVA table - anova_table <- ez_corrected$ANOVA - - # Calculate corrected degrees of freedom - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - # Also show the corrected F-values and p-values with degrees of freedom - cat("\n=== CORRECTED F-TESTS WITH DEGREES OF FREEDOM ===\n") - for(i in 1:nrow(corrected_df)) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections may not be displayed if sphericity is met") - print("Check the Mauchly's test p-values above to determine if corrections are needed") -} - -# ============================================================================= -# ALTERNATIVE: SPHERICITY CORRECTIONS USING CAR PACKAGE -# ============================================================================= - -print("\n=== SPHERICITY CORRECTIONS USING CAR PACKAGE ===") - -# Create a wide-format data for car package (library already loaded) - -tryCatch({ - # Convert to wide format for car package - wide_data <- long_data_clean %>% - select(pID, TEMPORAL_DO, TIME, ITEM, MEAN_DIFFERENCE) %>% - pivot_wider(names_from = c(TIME, ITEM), - values_from = MEAN_DIFFERENCE, - names_sep = "_") - - # Create the repeated measures design - within_vars <- c("Past_Preferences", "Past_Personality", "Past_Values", "Past_Life", - "Future_Preferences", "Future_Personality", "Future_Values", "Future_Life") - - # Check if all columns exist - missing_cols <- 
within_vars[!within_vars %in% colnames(wide_data)] - if(length(missing_cols) > 0) { - print(paste("Missing columns for car analysis:", paste(missing_cols, collapse = ", "))) - } else { - # Create the repeated measures design - rm_design <- as.matrix(wide_data[, within_vars]) - - # Calculate epsilon values - print("Epsilon Values from car package:") - epsilon_gg <- epsilon(rm_design, type = "Greenhouse-Geisser") - epsilon_hf <- epsilon(rm_design, type = "Huynh-Feldt") - - print(paste("Greenhouse-Geisser epsilon:", round(epsilon_gg, 4))) - print(paste("Huynh-Feldt epsilon:", round(epsilon_hf, 4))) - - # Interpretation - if(epsilon_gg < 0.75) { - print("Recommendation: Use Greenhouse-Geisser correction (epsilon < 0.75)") - } else if(epsilon_hf > 0.75) { - print("Recommendation: Use Huynh-Feldt correction (epsilon > 0.75)") - } else { - print("Recommendation: Use Greenhouse-Geisser correction (conservative)") - } - - # ============================================================================= - # MANUAL SPHERICITY CORRECTIONS - # ============================================================================= - - print("\n=== MANUAL SPHERICITY CORRECTIONS ===") - - # Apply corrections to the original ANOVA results - print("Applying Greenhouse-Geisser corrections to ITEM effects:") - - # ITEM main effect (DFn = 4, DFd = 4244) - item_df_corrected_gg <- 4 * epsilon_gg - item_df_corrected_hf <- 4 * epsilon_hf - - print(paste("ITEM: Original df = 4, 4244")) - print(paste("ITEM: GG corrected df =", round(item_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2))) - print(paste("ITEM: HF corrected df =", round(item_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2))) - - # TIME × ITEM interaction (DFn = 4, DFd = 4244) - interaction_df_corrected_gg <- 4 * epsilon_gg - interaction_df_corrected_hf <- 4 * epsilon_hf - - print(paste("TIME × ITEM: Original df = 4, 4244")) - print(paste("TIME × ITEM: GG corrected df =", round(interaction_df_corrected_gg, 2), ",", round(4244 * epsilon_gg, 2))) - print(paste("TIME × ITEM: HF corrected df =", round(interaction_df_corrected_hf, 2), ",", round(4244 * epsilon_hf, 2))) - - # Note: You would need to recalculate p-values with these corrected dfs - print("\nNote: To get corrected p-values, you would need to recalculate F-tests with corrected degrees of freedom") - print("The ezANOVA function should handle this automatically, but may not display the corrections") - } - -}, error = function(e) { - print(paste("Error in manual epsilon calculation:", e$message)) -}) - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -# Effect size calculations (library already loaded) - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Calculate generalized eta squared for the aov model -print("Effect Sizes from aov() model:") -tryCatch({ - # Extract effect sizes from aov model - aov_effects <- eta_squared(mixed_anova_model, partial = TRUE, generalized = TRUE) - print(round(aov_effects, 5)) -}, error = function(e) { - print(paste("Error calculating effect sizes from aov:", e$message)) -}) - -# Calculate effect sizes for ezANOVA model -print("\nEffect Sizes from ezANOVA model:") -tryCatch({ - # ezANOVA provides partial eta squared, convert to generalized - ez_effects <- ez_corrected$ANOVA - ez_effects$ges <- ez_effects$ges # ezANOVA already provides generalized eta squared - print("Generalized Eta Squared from 
ezANOVA:") - print(round(ez_effects[, c("Effect", "ges")], 5)) -}, error = function(e) { - print(paste("Error extracting effect sizes from ezANOVA:", e$message)) -}) - -# Extract effect sizes (generalized eta squared) -# For aov() objects, we need to extract from the summary -anova_summary <- summary(mixed_anova_model) - -# ============================================================================= -# NOTE: MIXED MODELS (LMER) NOT NEEDED -# ============================================================================= - -# For this balanced repeated measures design, Type III ANOVA with proper -# sphericity corrections (implemented above) is the most appropriate approach. -# Mixed models (lmer) are typically used for: -# - Unbalanced designs -# - Missing data patterns -# - Nested random effects -# - Large, complex datasets -# -# Your design is balanced and complete, making Type III ANOVA optimal. - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(mixed_anova_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of ITEM -print("\nMain Effect of ITEM:") -item_emmeans <- emmeans(mixed_anova_model, ~ ITEM) -print("Estimated Marginal Means:") -print(item_emmeans) -print("\nPairwise Contrasts:") -item_contrasts <- pairs(item_emmeans, adjust = "bonferroni") -print(item_contrasts) - - -# ============================================================================= -# INTERACTION EXPLORATIONS -# ============================================================================= - - -# TIME × ITEM Interaction -print("\n=== TIME × ITEM INTERACTION ===") -time_item_emmeans <- emmeans(mixed_anova_model, ~ TIME * ITEM) -print("Estimated Marginal Means:") -print(time_item_emmeans) - -print("\nSimple Effects of ITEM within each TIME:") -time_item_simple <- pairs(time_item_emmeans, by = "TIME", adjust = "bonferroni") -print(time_item_simple) - -print("\nSimple Effects of TIME within each ITEM:") -time_item_simple2 <- pairs(time_item_emmeans, by = "ITEM", adjust = "bonferroni") -print(time_item_simple2) - - -# ============================================================================= -# COMPREHENSIVE THREE-WAY INTERACTION ANALYSIS -# ============================================================================= - - -# ============================================================================= -# COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS -# ============================================================================= - -# Cohen's d calculations (library already loaded) - -print("\n=== COHEN'S D FOR SIGNIFICANT TWO-WAY INTERACTIONS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - 
# Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - - -# ============================================================================= -# 2. TIME × DOMAIN INTERACTION (SIGNIFICANT: p = 0.012) -# ============================================================================= - -print("\n=== COHEN'S D FOR TIME × DOMAIN INTERACTION ===") - -# Get simple effects of TIME within each ITEM -time_item_simple2_df <- as.data.frame(time_item_simple2) -calculate_cohens_d_for_pairs(time_item_simple2_df, long_data_clean, "TIME", "ITEM", "MEAN_DIFFERENCE") - -# Get simple effects of ITEM within each TIME -time_item_simple_df <- as.data.frame(time_item_simple) -calculate_cohens_d_for_pairs(time_item_simple_df, long_data_clean, "ITEM", "TIME", "MEAN_DIFFERENCE") - - diff --git a/.history/eohi1/readme_domain_mixed_anova_20251002121520.txt b/.history/eohi1/readme_domain_mixed_anova_20251002121520.txt deleted file mode 100644 index 071e72cec820a1c4824c687364abe23e723f97a4..0000000000000000000000000000000000000000 GIT binary patch
[binary patch data omitted: editor-history snapshot readme_domain_mixed_anova_20251002121520.txt] diff --git a/.history/eohi1/readme_domain_mixed_anova_20251002121755.txt b/.history/eohi1/readme_domain_mixed_anova_20251002121755.txt deleted file mode 100644 index 071e72cec820a1c4824c687364abe23e723f97a4..0000000000000000000000000000000000000000 GIT binary patch [binary patch data omitted: identical snapshot readme_domain_mixed_anova_20251002121755.txt]
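The regression snapshots below build education dummy variables by hand with model.matrix() and then drop the reference column. For reference, a minimal sketch of the equivalent one-step fit, since lm() applies the same treatment coding automatically when handed a factor, using the first level as the baseline (this assumes the same data frame and column names as those scripts):

# Equivalent to the hand-built dummy regression: with demo_edu releveled so
# "High School (or equivalent)" is the first level, lm() creates the six
# non-reference dummies internally
model_auto <- lm(eohiDGEN_mean ~ demo_edu, data = data)
summary(model_auto)  # coefficients match the manual edu_* dummy model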
diff --git a/.history/eohi1/regression e1 - edu x ehi_20251020100405.r b/.history/eohi1/regression e1 - edu x ehi_20251020100405.r deleted file mode 100644 index ebf62fe..0000000 --- a/.history/eohi1/regression e1 - edu x ehi_20251020100405.r +++ /dev/null @@ -1,10 +0,0 @@ -options(scipen = 999) - -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -df <- read.csv("ehi1.csv") - -data <- df %>% - select(eohiDGEN_mean, ehi_global_mean, demo_edu) \ No newline at end of file diff --git a/.history/eohi1/regression e1 - edu x ehi_20251020100412.r b/.history/eohi1/regression e1 - edu x ehi_20251020100412.r deleted file mode 100644 index ebf62fe..0000000 --- a/.history/eohi1/regression e1 - edu x ehi_20251020100412.r +++ /dev/null @@ -1,10 +0,0 @@ -options(scipen = 999) - -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -df <- read.csv("ehi1.csv") - -data <- df %>% - select(eohiDGEN_mean, ehi_global_mean, demo_edu) \ No newline at end of file diff --git a/.history/eohi1/regression e1 - edu x ehi_20251020100550.r b/.history/eohi1/regression e1 - edu x ehi_20251020100550.r deleted file mode 100644 index d2ef1cb..0000000 --- a/.history/eohi1/regression e1 - edu x ehi_20251020100550.r +++ /dev/null @@ -1,11 +0,0 @@ -options(scipen = 999) - -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -df <- read.csv("ehi1.csv") - -data <- df %>% - select(eohiDGEN_mean, ehi_global_mean, demo_edu) - diff --git a/.history/eohi1/regression e1 - edu x ehi_20251020100952.r b/.history/eohi1/regression e1 - edu x ehi_20251020100952.r deleted file mode 100644 index 1f95024..0000000 --- a/.history/eohi1/regression e1 - edu x ehi_20251020100952.r +++ /dev/null @@ -1,16 +0,0 @@ -options(scipen = 999) - -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -df <- read.csv("ehi1.csv") - -data <- df %>% - select(eohiDGEN_mean, ehi_global_mean, demo_edu) %>% - mutate(demo_edu = as.factor(demo_edu)) - -# examine data object -str(data) -colSums(is.na(data)) -sapply(data, class) \ No newline at end of file diff --git a/.history/eohi1/regression e1 - edu x ehi_20251020101841.r b/.history/eohi1/regression e1 - edu x ehi_20251020101841.r deleted file mode 100644 index 84762dd..0000000 --- a/.history/eohi1/regression e1 - edu x ehi_20251020101841.r +++ /dev/null @@ -1,35 +0,0 @@ -options(scipen = 999) - -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -df <- read.csv("ehi1.csv") - -data <- df %>% - select(eohiDGEN_mean, ehi_global_mean, demo_edu) %>% - mutate(demo_edu = as.factor(demo_edu)) - -# examine data object -str(data) -colSums(is.na(data)) -sapply(data, class) -levels(data$demo_edu) - -# Create dummy variables -dummy_vars <- model.matrix(~ demo_edu - 1, data = data) -dummy_df <- as.data.frame(dummy_vars) - -# Rename columns with meaningful names (excluding reference level) -colnames(dummy_df) <- c( - "edu_highschool", # reference level (will be dropped) - "edu_trade", - "edu_college", - "edu_uni_undergrad", - "edu_uni_masters", - "edu_uni_phd", - "edu_prof" -) - -# Add to your data -data <- cbind(data, dummy_df) \ No newline at end of file diff --git a/.history/eohi1/regression e1 - edu x ehi_20251020103253.r b/.history/eohi1/regression e1 - edu x ehi_20251020103253.r deleted file mode 100644 index d7ad239..0000000 --- a/.history/eohi1/regression e1 - edu x ehi_20251020103253.r +++ /dev/null @@ -1,45 +0,0 @@ -options(scipen = 999) - -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -df <- read.csv("ehi1.csv") - 
-data <- df %>% - select(eohiDGEN_mean, ehi_global_mean, demo_edu) %>% - mutate(demo_edu = as.factor(demo_edu)) - -# examine data object -str(data) -colSums(is.na(data)) -sapply(data, class) -levels(data$demo_edu) - -data$demo_edu <- factor(data$demo_edu, levels = c( - "High School (or equivalent)", - "Trade School (non-military)", - "College Diploma/Certificate", - "University - Undergraduate", - "University - Graduate (Masters)", - "University - PhD", - "Professional Degree (ex. JD/MD)" -)) - -levels(data$demo_edu) -# Create dummy variables -dummy_vars <- model.matrix(~ demo_edu - 1, data = data) -dummy_df <- as.data.frame(dummy_vars) - -# Rename columns with meaningful names (excluding reference level) -colnames(dummy_df) <- c( - "edu_highschool", # reference level (will be dropped) - "edu_trade", - "edu_college", - "edu_uni_undergrad", - "edu_uni_masters", - "edu_uni_phd", - "edu_prof" -) -# Add to your data -data <- cbind(data, dummy_df) \ No newline at end of file diff --git a/.history/eohi1/regression e1 - edu x ehi_20251020103850.r b/.history/eohi1/regression e1 - edu x ehi_20251020103850.r deleted file mode 100644 index 4fa8314..0000000 --- a/.history/eohi1/regression e1 - edu x ehi_20251020103850.r +++ /dev/null @@ -1,47 +0,0 @@ -options(scipen = 999) - -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -df <- read.csv("ehi1.csv") - -data <- df %>% - select(eohiDGEN_mean, ehi_global_mean, demo_edu) %>% - mutate(demo_edu = as.factor(demo_edu)) - -# examine data object -str(data) -colSums(is.na(data)) -sapply(data, class) -levels(data$demo_edu) - -data$demo_edu <- factor(data$demo_edu, levels = c( - "High School (or equivalent)", - "Trade School (non-military)", - "College Diploma/Certificate", - "University - Undergraduate", - "University - Graduate (Masters)", - "University - PhD", - "Professional Degree (ex. JD/MD)" -)) - -levels(data$demo_edu) -# Create dummy variables -dummy_vars <- model.matrix(~ demo_edu - 1, data = data) -dummy_df <- as.data.frame(dummy_vars) - -# Rename columns with meaningful names (excluding reference level) -colnames(dummy_df) <- c( - "edu_highschool", # reference level (will be dropped) - "edu_trade", - "edu_college", - "edu_uni_undergrad", - "edu_uni_masters", - "edu_uni_phd", - "edu_prof" -) -# Add to your data -data <- cbind(data, dummy_df) - -data <- data %>% select(-starts_with("edu_highschool")) \ No newline at end of file diff --git a/.history/eohi1/regression e1 - edu x ehi_20251020113946.r b/.history/eohi1/regression e1 - edu x ehi_20251020113946.r deleted file mode 100644 index 39dd98c..0000000 --- a/.history/eohi1/regression e1 - edu x ehi_20251020113946.r +++ /dev/null @@ -1,118 +0,0 @@ -options(scipen = 999) - -library(dplyr) -library(car) -library(lmtest) -library(stargazer) -library(sandwich) -library(lmtest) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -df <- read.csv("ehi1.csv") - -data <- df %>% - select(eohiDGEN_mean, ehi_global_mean, demo_edu) %>% - mutate(demo_edu = as.factor(demo_edu)) - -# examine data object -str(data) -colSums(is.na(data)) -sapply(data, class) -levels(data$demo_edu) - -data$demo_edu <- factor(data$demo_edu, levels = c( - "High School (or equivalent)", - "Trade School (non-military)", - "College Diploma/Certificate", - "University - Undergraduate", - "University - Graduate (Masters)", - "University - PhD", - "Professional Degree (ex. 
JD/MD)" -)) - -levels(data$demo_edu) -# Create dummy variables -dummy_vars <- model.matrix(~ demo_edu - 1, data = data) -dummy_df <- as.data.frame(dummy_vars) - -# Rename columns with meaningful names (excluding reference level) -colnames(dummy_df) <- c( - "edu_highschool", # reference level (will be dropped) - "edu_trade", - "edu_college", - "edu_uni_undergrad", - "edu_uni_masters", - "edu_uni_phd", - "edu_prof" -) -# Add to your data -data <- cbind(data, dummy_df) - -data <- data %>% select(-starts_with("edu_highschool")) - -#### MODEL 1 - DGEN #### - -model_DGEN <- lm(eohiDGEN_mean ~ edu_trade + edu_college + edu_uni_undergrad + - edu_uni_masters + edu_uni_phd + edu_prof, data = data) - -# Model 1 diagnostics -par(mfrow = c(2, 2)) -plot(model_DGEN, which = 1) # Residuals vs Fitted -plot(model_DGEN, which = 2) # Normal Q-Q, normality -hist(residuals(model_DGEN), main = "Histogram of Residuals", xlab = "Residuals") -shapiro.test(residuals(model_DGEN)) - -plot(model_DGEN, which = 3) # Scale-Location -plot(model_DGEN, which = 4) # Cook's Distance - -# Model 1 specific tests -vif(model_DGEN) # Multicollinearity -dwtest(model_DGEN) # Independence -outlierTest(model_DGEN) # Outliers - -# Look at the specific influential cases -data[c(670, 388, 760), ] - -# 6 outliers: 670, 388, 760, 258, 873, 1030; acknoledge their presence but also they represent ~0.58% of total sample size, which is well below the 5% of outliers that would be considered acceptable. -# heterescedasticity: may be d/t binary vars creating discrete clusters, or d/t real heteroscedasticity. -# normality violated but sample size is robust to violation -# no multicollinearity -# no autocorrelation (samples are independent) - -#results -summary(model_DGEN) - -# Create a nice formatted table -stargazer(model_DGEN, type = "text", - title = "Regression Results: Education and EOHI-DGEN", - dep.var.labels = "EOHI-DGEN Mean", - covariate.labels = c("Trade School", "College", "University Undergrad", - "University Masters", "University PhD", "Professional Degree"), - report = "vcsp") # This shows coefficients, SEs, and p-values - -#### MODEL 2 - DOMAIN #### - -model_domain <- lm(ehi_global_mean ~ edu_trade + edu_college + edu_uni_undergrad + - edu_uni_masters + edu_uni_phd + edu_prof, data = data) - -# ASSUMPTION CHECKING FOR MODEL 2 (model_domain) -par(mfrow = c(2, 2)) -plot(model_domain, which = 1) # Residuals vs Fitted - -plot(model_domain, which = 2) # Normal Q-Q, normality -hist(residuals(model_domain), main = "Histogram of Residuals", xlab = "Residuals") -shapiro.test(residuals(model_domain)) - -plot(model_domain, which = 3) # Scale-Location -plot(model_domain, which = 4) # Cook's Distance - -# Model 2 specific tests -vif(model_domain) # Multicollinearity -dwtest(model_domain) # Independence -outlierTest(model_domain) # Outliers - -# Reset plotting -par(mfrow = c(1, 1)) - -summary(model_domain) \ No newline at end of file diff --git a/.history/eohi1/regression e1 - edu x ehi_20251020133831.r b/.history/eohi1/regression e1 - edu x ehi_20251020133831.r deleted file mode 100644 index fbfc999..0000000 --- a/.history/eohi1/regression e1 - edu x ehi_20251020133831.r +++ /dev/null @@ -1,159 +0,0 @@ -options(scipen = 999) - -library(dplyr) -library(car) -library(lmtest) -library(stargazer) -library(sandwich) -library(lmtest) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -df <- read.csv("ehi1.csv") - -data <- df %>% - select(eohiDGEN_mean, ehi_global_mean, demo_edu) %>% - mutate(demo_edu = as.factor(demo_edu)) - -# examine data object 
-str(data) -colSums(is.na(data)) -sapply(data, class) -levels(data$demo_edu) - -data$demo_edu <- factor(data$demo_edu, levels = c( - "High School (or equivalent)", - "Trade School (non-military)", - "College Diploma/Certificate", - "University - Undergraduate", - "University - Graduate (Masters)", - "University - PhD", - "Professional Degree (ex. JD/MD)" -)) - -levels(data$demo_edu) -# Create dummy variables -dummy_vars <- model.matrix(~ demo_edu - 1, data = data) -dummy_df <- as.data.frame(dummy_vars) - -# Rename columns with meaningful names (excluding reference level) -colnames(dummy_df) <- c( - "edu_highschool", # reference level (will be dropped) - "edu_trade", - "edu_college", - "edu_uni_undergrad", - "edu_uni_masters", - "edu_uni_phd", - "edu_prof" -) -# Add to your data -data <- cbind(data, dummy_df) - -data <- data %>% select(-starts_with("edu_highschool")) - -#### MODEL 1 - DGEN #### - -model_DGEN <- lm(eohiDGEN_mean ~ edu_trade + edu_college + edu_uni_undergrad + - edu_uni_masters + edu_uni_phd + edu_prof, data = data) - -# Model 1 diagnostics -par(mfrow = c(2, 2)) -plot(model_DGEN, which = 1) # Residuals vs Fitted -plot(model_DGEN, which = 2) # Normal Q-Q, normality -hist(residuals(model_DGEN), main = "Histogram of Residuals", xlab = "Residuals") -shapiro.test(residuals(model_DGEN)) - -plot(model_DGEN, which = 3) # Scale-Location -plot(model_DGEN, which = 4) # Cook's Distance - -# Model 1 specific tests -vif(model_DGEN) # Multicollinearity -dwtest(model_DGEN) # Independence -outlierTest(model_DGEN) # Outliers - -# Look at the specific influential cases -data[c(670, 388, 760), ] - -# 6 outliers: 670, 388, 760, 258, 873, 1030; acknowledge their presence, but they represent ~0.58% of the total sample size, well below the ~5% that would be cause for concern. -# heteroscedasticity: may be due to the binary predictors creating discrete clusters, or due to real heteroscedasticity. -# normality violated, but the sample size makes the model robust to this violation -# no multicollinearity -# no autocorrelation (observations are independent)
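The heteroscedasticity question raised in the comments above can also be tested formally rather than judged from the Scale-Location plot alone. A minimal sketch using the lmtest and car packages already loaded by this script:

# Formal tests for non-constant residual variance (small p suggests heteroscedasticity)
bptest(model_DGEN)    # Breusch-Pagan test (lmtest)
ncvTest(model_DGEN)   # score test for non-constant variance (car)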
- -#results -summary(model_DGEN) - -# Create a nice formatted table -stargazer(model_DGEN, type = "text", - title = "Regression Results: Education and EOHI-DGEN", - dep.var.labels = "EOHI-DGEN Mean", - covariate.labels = c("Trade School", "College", "University Undergrad", - "University Masters", "University PhD", "Professional Degree"), - report = "vcsp", - add.lines = list(c("AIC", round(AIC(model_DGEN), 2)))) - -# Use robust standard errors (doesn't change coefficients, just SEs) -modelDGEN_robust <- coeftest(model_DGEN, vcov = vcovHC(model_DGEN, type = "HC3")) - -stargazer(modelDGEN_robust, type = "text", - title = "Regression Results: Education and EOHI-DGEN", - dep.var.labels = "EOHI-DGEN Mean", - covariate.labels = c("Trade School", "College", "University Undergrad", - "University Masters", "University PhD", "Professional Degree"), - report = "vcsp") - - -#### MODEL 2 - DOMAIN #### - -model_domain <- lm(ehi_global_mean ~ edu_trade + edu_college + edu_uni_undergrad + - edu_uni_masters + edu_uni_phd + edu_prof, data = data) - -# ASSUMPTION CHECKING FOR MODEL 2 (model_domain) -plot(model_domain, which = 1) # Residuals vs Fitted - -plot(model_domain, which = 2) # Normal Q-Q, normality -hist(residuals(model_domain), main = "Histogram of Residuals", xlab = "Residuals") -shapiro.test(residuals(model_domain)) - -plot(model_domain, which = 3) # Scale-Location -plot(model_domain, which = 4) # Cook's Distance - -# Model 2 specific tests -vif(model_domain) # Multicollinearity -dwtest(model_domain) # Independence -outlierTest(model_domain) # Outliers - -# Check if the autocorrelation is real or artifactual -# Plot residuals against observation order -plot(residuals(model_domain), type = "l") -abline(h = 0, col = "red") - - -# 6 outliers: acknowledge their presence, but they represent ~0.58% of the total sample size, well below the ~5% that would be cause for concern. -# heteroscedasticity: may be due to the binary predictors creating discrete clusters, or due to real heteroscedasticity. -# normality violated, but the sample size makes the model robust to this violation -# no multicollinearity -# autocorrelation is significant; may be because the outcome is an aggregate of multiple repeated measures
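Since the comments above note a significant Durbin-Watson result for this model, a sensitivity check with heteroscedasticity- and autocorrelation-consistent (HAC) standard errors could complement the HC3 correction used below; a minimal sketch with the sandwich package this script already loads:

# Newey-West (HAC) standard errors; coefficients are unchanged, only the SEs
coeftest(model_domain, vcov = NeweyWest(model_domain))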
- -# Reset plotting to 1x1 -# par(mfrow = c(1, 1)) - -summary(model_domain) - -stargazer(model_domain, type = "text", - title = "Regression Results: Education and EHI Domain", - dep.var.labels = "EHI Domain Mean", - covariate.labels = c("Trade School", "College", "University Undergrad", - "University Masters", "University PhD", "Professional Degree"), - report = "vcsp", # This shows coefficients, SEs, and p-values - add.lines = list(c("AIC", round(AIC(model_DGEN), 2), round(AIC(model_domain), 2)))) - -# Use robust standard errors (doesn't change coefficients, just SEs) -modelDOMAIN_robust <- coeftest(model_domain, vcov = vcovHC(model_domain, type = "HC3")) - -stargazer(modelDOMAIN_robust, type = "text", - title = "Regression Results: Education and EHI Domain", - dep.var.labels = "EHI Domain Mean", - covariate.labels = c("Trade School", "College", "University Undergrad", - "University Masters", "University PhD", "Professional Degree"), - report = "vcsp") diff --git a/.history/eohi1/regression e1 - edu x ehi_20251020134231.r b/.history/eohi1/regression e1 - edu x ehi_20251020134231.r deleted file mode 100644 index 35fb5df..0000000 --- a/.history/eohi1/regression e1 - edu x ehi_20251020134231.r +++ /dev/null @@ -1,159 +0,0 @@ -options(scipen = 999) - -library(dplyr) -library(car) -library(lmtest) -library(stargazer) -library(sandwich) -library(lmtest) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -df <- read.csv("ehi1.csv") - -data <- df %>% - select(eohiDGEN_mean, ehi_global_mean, demo_edu) %>% - mutate(demo_edu = as.factor(demo_edu)) - -# examine data object -str(data) -colSums(is.na(data)) -sapply(data, class) -levels(data$demo_edu) - -data$demo_edu <- factor(data$demo_edu, levels = c( - "High School (or equivalent)", - "Trade School (non-military)", - "College Diploma/Certificate", - "University - Undergraduate", - "University - Graduate (Masters)", - "University - PhD", - "Professional Degree (ex. 
JD/MD)" -)) - -levels(data$demo_edu) -# Create dummy variables -dummy_vars <- model.matrix(~ demo_edu - 1, data = data) -dummy_df <- as.data.frame(dummy_vars) - -# Rename columns with meaningful names (excluding reference level) -colnames(dummy_df) <- c( - "edu_highschool", # reference level (will be dropped) - "edu_trade", - "edu_college", - "edu_uni_undergrad", - "edu_uni_masters", - "edu_uni_phd", - "edu_prof" -) -# Add to your data -data <- cbind(data, dummy_df) - -data <- data %>% select(-starts_with("edu_highschool")) - -#### MODEL 1 - DGEN #### - -model_DGEN <- lm(eohiDGEN_mean ~ edu_trade + edu_college + edu_uni_undergrad + - edu_uni_masters + edu_uni_phd + edu_prof, data = data) - -# Model 1 diagnostics -par(mfrow = c(2, 2)) -plot(model_DGEN, which = 1) # Residuals vs Fitted -plot(model_DGEN, which = 2) # Normal Q-Q, normality -hist(residuals(model_DGEN), main = "Histogram of Residuals", xlab = "Residuals") -shapiro.test(residuals(model_DGEN)) - -plot(model_DGEN, which = 3) # Scale-Location -plot(model_DGEN, which = 4) # Cook's Distance - -# Model 1 specific tests -vif(model_DGEN) # Multicollinearity -dwtest(model_DGEN) # Independence -outlierTest(model_DGEN) # Outliers - -# Look at the specific influential cases -data[c(670, 388, 760), ] - -# 6 outliers: 670, 388, 760, 258, 873, 1030; acknoledge their presence but also they represent ~0.58% of total sample size, which is well below the 5% of outliers that would be considered acceptable. -# heterescedasticity: may be d/t binary vars creating discrete clusters, or d/t real heteroscedasticity. -# normality violated but sample size is robust to violation -# no multicollinearity -# no autocorrelation (samples are independent) - -#results -summary(model_DGEN) - -# Create a nice formatted table -stargazer(model_DGEN, type = "text", - title = "Regression Results: Education and EOHI-DGEN", - dep.var.labels = "EOHI-DGEN Mean", - covariate.labels = c("Trade School", "College", "University Undergrad", - "University Masters", "University PhD", "Professional Degree"), - report = "vcsp", - add.lines = list(c("AIC", round(AIC(model_DGEN), 2)))) - -# Use robust standard errors (doesn't change coefficients, just SEs) -modelDGEN_robust <- coeftest(model_DGEN, vcov = vcovHC(model_DGEN, type = "HC3")) - -stargazer(modelDGEN_robust, type = "text", - title = "Regression Results: Education and EOHI-DGEN", - dep.var.labels = "EOHI-DGEN Mean", - covariate.labels = c("Trade School", "College", "University Undergrad", - "University Masters", "University PhD", "Professional Degree"), - report = "vcsp") - - -#### MODEL 2 - DOMAIN #### - -model_domain <- lm(ehi_global_mean ~ edu_trade + edu_college + edu_uni_undergrad + - edu_uni_masters + edu_uni_phd + edu_prof, data = data) - -# ASSUMPTION CHECKING FOR MODEL 2 (model_domain) -plot(model_domain, which = 1) # Residuals vs Fitted - -plot(model_domain, which = 2) # Normal Q-Q, normality -hist(residuals(model_domain), main = "Histogram of Residuals", xlab = "Residuals") -shapiro.test(residuals(model_domain)) - -plot(model_domain, which = 3) # Scale-Location -plot(model_domain, which = 4) # Cook's Distance - -# Model 2 specific tests -vif(model_domain) # Multicollinearity -dwtest(model_domain) # Independence -outlierTest(model_domain) # Outliers - -# Check if the autocorrelation is real or artifactual -# Plot residuals against observation order -plot(residuals(model_domain), type = "l") -abline(h = 0, col = "red") - - -# 6 outliers: acknoledge their presence but also they represent ~0.58% of total sample size, 
which is well below the 5% of outliers that would be considered acceptable. -# heterescedasticity: may be d/t binary vars creating discrete clusters, or d/t real heteroscedasticity. -# normality violated but sample size is robust to violation -# no multicollinearity -# auto correlation is significant, may be due to aggregated measure of multiple repeated measures - -# Reset plotting to 1x1 -# par(mfrow = c(1, 1)) - -summary(model_domain) - -stargazer(model_domain, type = "text", - title = "Regression Results: Education and EOHI-DGEN", - dep.var.labels = "EHI Domain Mean", - covariate.labels = c("Trade School", "College", "University Undergrad", - "University Masters", "University PhD", "Professional Degree"), - report = "vcsp", # This shows coefficients, SEs, and p-values - add.lines = list(c("AIC", round(AIC(model_DGEN), 2), round(AIC(model_domain), 2)))) - -# Use robust standard errors (doesn't change coefficients, just SEs) -modelDOMAIN_robust <- coeftest(model_domain, vcov = vcovHC(model_domain, type = "HC3")) - -stargazer(modelDOMAIN_robust, type = "text", - title = "Regression Results: Education and EOHI-DGEN", - dep.var.labels = "EHI Domain Mean", - covariate.labels = c("Trade School", "College", "University Undergrad", - "University Masters", "University PhD", "Professional Degree"), - report = "vcsp") diff --git a/.history/eohi1/regression e1 - edu x ehi_20251020173226.r b/.history/eohi1/regression e1 - edu x ehi_20251020173226.r deleted file mode 100644 index 1df9492..0000000 --- a/.history/eohi1/regression e1 - edu x ehi_20251020173226.r +++ /dev/null @@ -1,161 +0,0 @@ -options(scipen = 999) - -library(dplyr) -library(car) -library(lmtest) -library(stargazer) -library(sandwich) -library(lmtest) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -df <- read.csv("ehi1.csv") - -data <- df %>% - select(eohiDGEN_mean, ehi_global_mean, demo_edu) %>% - mutate(demo_edu = as.factor(demo_edu)) - -# examine data object -str(data) -colSums(is.na(data)) -sapply(data, class) -levels(data$demo_edu) - -data$demo_edu <- factor(data$demo_edu, levels = c( - "High School (or equivalent)", - "Trade School (non-military)", - "College Diploma/Certificate", - "University - Undergraduate", - "University - Graduate (Masters)", - "University - PhD", - "Professional Degree (ex. 
JD/MD)" -)) - -levels(data$demo_edu) -# Create dummy variables -dummy_vars <- model.matrix(~ demo_edu - 1, data = data) -dummy_df <- as.data.frame(dummy_vars) - -# Rename columns with meaningful names (excluding reference level) -colnames(dummy_df) <- c( - "edu_highschool", # reference level (will be dropped) - "edu_trade", - "edu_college", - "edu_uni_undergrad", - "edu_uni_masters", - "edu_uni_phd", - "edu_prof" -) -# Add to your data -data <- cbind(data, dummy_df) - -data <- data %>% select(-starts_with("edu_highschool")) - -#### MODEL 1 - DGEN #### - -model_DGEN <- lm(eohiDGEN_mean ~ edu_trade + edu_college + edu_uni_undergrad + - edu_uni_masters + edu_uni_phd + edu_prof, data = data) - -# Model 1 diagnostics -par(mfrow = c(2, 2)) -plot(model_DGEN, which = 1) # Residuals vs Fitted -plot(model_DGEN, which = 2) # Normal Q-Q, normality -hist(residuals(model_DGEN), main = "Histogram of Residuals", xlab = "Residuals") -shapiro.test(residuals(model_DGEN)) - -plot(model_DGEN, which = 3) # Scale-Location -plot(model_DGEN, which = 4) # Cook's Distance - -# Model 1 specific tests -vif(model_DGEN) # Multicollinearity -dwtest(model_DGEN) # Independence -outlierTest(model_DGEN) # Outliers - -# Look at the specific influential cases -data[c(670, 388, 760), ] - -# 6 outliers: 670, 388, 760, 258, 873, 1030; acknoledge their presence but also they represent ~0.58% of total sample size, which is well below the 5% of outliers that would be considered acceptable. -# heterescedasticity: may be d/t binary vars creating discrete clusters, or d/t real heteroscedasticity. -# normality violated but sample size is robust to violation -# no multicollinearity -# no autocorrelation (samples are independent) - -#results -print(summary(model_DGEN)) -print(AIC(model_DGEN)) - -# Create a nice formatted table -stargazer(model_DGEN, type = "text", - title = "Regression Results: Education and EOHI-DGEN", - dep.var.labels = "EOHI-DGEN Mean", - covariate.labels = c("Trade School", "College", "University Undergrad", - "University Masters", "University PhD", "Professional Degree"), - report = "vcsp", - add.lines = list(c("AIC", round(AIC(model_DGEN), 2)))) - -# Use robust standard errors (doesn't change coefficients, just SEs) -modelDGEN_robust <- coeftest(model_DGEN, vcov = vcovHC(model_DGEN, type = "HC3")) - -stargazer(modelDGEN_robust, type = "text", - title = "Regression Results: Education and EOHI-DGEN", - dep.var.labels = "EOHI-DGEN Mean", - covariate.labels = c("Trade School", "College", "University Undergrad", - "University Masters", "University PhD", "Professional Degree"), - report = "vcsp") - - -#### MODEL 2 - DOMAIN #### - -model_domain <- lm(ehi_global_mean ~ edu_trade + edu_college + edu_uni_undergrad + - edu_uni_masters + edu_uni_phd + edu_prof, data = data) - -# ASSUMPTION CHECKING FOR MODEL 2 (model_domain) -plot(model_domain, which = 1) # Residuals vs Fitted - -plot(model_domain, which = 2) # Normal Q-Q, normality -hist(residuals(model_domain), main = "Histogram of Residuals", xlab = "Residuals") -shapiro.test(residuals(model_domain)) - -plot(model_domain, which = 3) # Scale-Location -plot(model_domain, which = 4) # Cook's Distance - -# Model 2 specific tests -vif(model_domain) # Multicollinearity -dwtest(model_domain) # Independence -outlierTest(model_domain) # Outliers - -# Check if the autocorrelation is real or artifactual -# Plot residuals against observation order -plot(residuals(model_domain), type = "l") -abline(h = 0, col = "red") - - -# 6 outliers: acknoledge their presence but also they represent 
~0.58% of total sample size, which is well below the 5% of outliers that would be considered acceptable. -# heterescedasticity: may be d/t binary vars creating discrete clusters, or d/t real heteroscedasticity. -# normality violated but sample size is robust to violation -# no multicollinearity -# auto correlation is significant, may be due to aggregated measure of multiple repeated measures - -# Reset plotting to 1x1 -# par(mfrow = c(1, 1)) - -print(summary(model_domain)) -print(AIC(model_domain)) - -stargazer(model_domain, type = "text", - title = "Regression Results: Education and EHI Domain", - dep.var.labels = "EHI Domain Mean", - covariate.labels = c("Trade School", "College", "University Undergrad", - "University Masters", "University PhD", "Professional Degree"), - report = "vcsp", # This shows coefficients, SEs, and p-values - add.lines = list(c("AIC", round(AIC(model_domain), 2)))) - -# Use robust standard errors (doesn't change coefficients, just SEs) -modelDOMAIN_robust <- coeftest(model_domain, vcov = vcovHC(model_domain, type = "HC3")) - -stargazer(modelDOMAIN_robust, type = "text", - title = "Regression Results: Education and EHI Domain", - dep.var.labels = "EHI Domain Mean", - covariate.labels = c("Trade School", "College", "University Undergrad", - "University Masters", "University PhD", "Professional Degree"), - report = "vcsp") diff --git a/.history/eohi1/regression e1 - edu x ehi_20251020173252.r b/.history/eohi1/regression e1 - edu x ehi_20251020173252.r deleted file mode 100644 index 1df9492..0000000 --- a/.history/eohi1/regression e1 - edu x ehi_20251020173252.r +++ /dev/null @@ -1,161 +0,0 @@ -options(scipen = 999) - -library(dplyr) -library(car) -library(lmtest) -library(stargazer) -library(sandwich) -library(lmtest) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -df <- read.csv("ehi1.csv") - -data <- df %>% - select(eohiDGEN_mean, ehi_global_mean, demo_edu) %>% - mutate(demo_edu = as.factor(demo_edu)) - -# examine data object -str(data) -colSums(is.na(data)) -sapply(data, class) -levels(data$demo_edu) - -data$demo_edu <- factor(data$demo_edu, levels = c( - "High School (or equivalent)", - "Trade School (non-military)", - "College Diploma/Certificate", - "University - Undergraduate", - "University - Graduate (Masters)", - "University - PhD", - "Professional Degree (ex. 
JD/MD)" -)) - -levels(data$demo_edu) -# Create dummy variables -dummy_vars <- model.matrix(~ demo_edu - 1, data = data) -dummy_df <- as.data.frame(dummy_vars) - -# Rename columns with meaningful names (excluding reference level) -colnames(dummy_df) <- c( - "edu_highschool", # reference level (will be dropped) - "edu_trade", - "edu_college", - "edu_uni_undergrad", - "edu_uni_masters", - "edu_uni_phd", - "edu_prof" -) -# Add to your data -data <- cbind(data, dummy_df) - -data <- data %>% select(-starts_with("edu_highschool")) - -#### MODEL 1 - DGEN #### - -model_DGEN <- lm(eohiDGEN_mean ~ edu_trade + edu_college + edu_uni_undergrad + - edu_uni_masters + edu_uni_phd + edu_prof, data = data) - -# Model 1 diagnostics -par(mfrow = c(2, 2)) -plot(model_DGEN, which = 1) # Residuals vs Fitted -plot(model_DGEN, which = 2) # Normal Q-Q, normality -hist(residuals(model_DGEN), main = "Histogram of Residuals", xlab = "Residuals") -shapiro.test(residuals(model_DGEN)) - -plot(model_DGEN, which = 3) # Scale-Location -plot(model_DGEN, which = 4) # Cook's Distance - -# Model 1 specific tests -vif(model_DGEN) # Multicollinearity -dwtest(model_DGEN) # Independence -outlierTest(model_DGEN) # Outliers - -# Look at the specific influential cases -data[c(670, 388, 760), ] - -# 6 outliers: 670, 388, 760, 258, 873, 1030; acknoledge their presence but also they represent ~0.58% of total sample size, which is well below the 5% of outliers that would be considered acceptable. -# heterescedasticity: may be d/t binary vars creating discrete clusters, or d/t real heteroscedasticity. -# normality violated but sample size is robust to violation -# no multicollinearity -# no autocorrelation (samples are independent) - -#results -print(summary(model_DGEN)) -print(AIC(model_DGEN)) - -# Create a nice formatted table -stargazer(model_DGEN, type = "text", - title = "Regression Results: Education and EOHI-DGEN", - dep.var.labels = "EOHI-DGEN Mean", - covariate.labels = c("Trade School", "College", "University Undergrad", - "University Masters", "University PhD", "Professional Degree"), - report = "vcsp", - add.lines = list(c("AIC", round(AIC(model_DGEN), 2)))) - -# Use robust standard errors (doesn't change coefficients, just SEs) -modelDGEN_robust <- coeftest(model_DGEN, vcov = vcovHC(model_DGEN, type = "HC3")) - -stargazer(modelDGEN_robust, type = "text", - title = "Regression Results: Education and EOHI-DGEN", - dep.var.labels = "EOHI-DGEN Mean", - covariate.labels = c("Trade School", "College", "University Undergrad", - "University Masters", "University PhD", "Professional Degree"), - report = "vcsp") - - -#### MODEL 2 - DOMAIN #### - -model_domain <- lm(ehi_global_mean ~ edu_trade + edu_college + edu_uni_undergrad + - edu_uni_masters + edu_uni_phd + edu_prof, data = data) - -# ASSUMPTION CHECKING FOR MODEL 2 (model_domain) -plot(model_domain, which = 1) # Residuals vs Fitted - -plot(model_domain, which = 2) # Normal Q-Q, normality -hist(residuals(model_domain), main = "Histogram of Residuals", xlab = "Residuals") -shapiro.test(residuals(model_domain)) - -plot(model_domain, which = 3) # Scale-Location -plot(model_domain, which = 4) # Cook's Distance - -# Model 2 specific tests -vif(model_domain) # Multicollinearity -dwtest(model_domain) # Independence -outlierTest(model_domain) # Outliers - -# Check if the autocorrelation is real or artifactual -# Plot residuals against observation order -plot(residuals(model_domain), type = "l") -abline(h = 0, col = "red") - - -# 6 outliers: acknoledge their presence but also they represent 
~0.58% of total sample size, which is well below the 5% of outliers that would be considered acceptable. -# heterescedasticity: may be d/t binary vars creating discrete clusters, or d/t real heteroscedasticity. -# normality violated but sample size is robust to violation -# no multicollinearity -# auto correlation is significant, may be due to aggregated measure of multiple repeated measures - -# Reset plotting to 1x1 -# par(mfrow = c(1, 1)) - -print(summary(model_domain)) -print(AIC(model_domain)) - -stargazer(model_domain, type = "text", - title = "Regression Results: Education and EHI Domain", - dep.var.labels = "EHI Domain Mean", - covariate.labels = c("Trade School", "College", "University Undergrad", - "University Masters", "University PhD", "Professional Degree"), - report = "vcsp", # This shows coefficients, SEs, and p-values - add.lines = list(c("AIC", round(AIC(model_domain), 2)))) - -# Use robust standard errors (doesn't change coefficients, just SEs) -modelDOMAIN_robust <- coeftest(model_domain, vcov = vcovHC(model_domain, type = "HC3")) - -stargazer(modelDOMAIN_robust, type = "text", - title = "Regression Results: Education and EHI Domain", - dep.var.labels = "EHI Domain Mean", - covariate.labels = c("Trade School", "College", "University Undergrad", - "University Masters", "University PhD", "Professional Degree"), - report = "vcsp") diff --git a/.history/eohi1/regression e1 - edu x ehi_20251023140538.r b/.history/eohi1/regression e1 - edu x ehi_20251023140538.r deleted file mode 100644 index bcd295f..0000000 --- a/.history/eohi1/regression e1 - edu x ehi_20251023140538.r +++ /dev/null @@ -1,232 +0,0 @@ -options(scipen = 999) - -library(dplyr) -library(car) -library(lmtest) -library(stargazer) -library(sandwich) -library(lmtest) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -df <- read.csv("ehi1.csv") - -data <- df %>% - select(eohiDGEN_mean, ehi_global_mean, demo_edu) %>% - mutate(demo_edu = as.factor(demo_edu)) - -# examine data object -str(data) -colSums(is.na(data)) -sapply(data, class) -levels(data$demo_edu) - -data$demo_edu <- factor(data$demo_edu, levels = c( - "High School (or equivalent)", - "Trade School (non-military)", - "College Diploma/Certificate", - "University - Undergraduate", - "University - Graduate (Masters)", - "University - PhD", - "Professional Degree (ex. 
JD/MD)" -)) - -levels(data$demo_edu) -# Create dummy variables -dummy_vars <- model.matrix(~ demo_edu - 1, data = data) -dummy_df <- as.data.frame(dummy_vars) - -# Rename columns with meaningful names (excluding reference level) -colnames(dummy_df) <- c( - "edu_highschool", # reference level (will be dropped) - "edu_trade", - "edu_college", - "edu_uni_undergrad", - "edu_uni_masters", - "edu_uni_phd", - "edu_prof" -) -# Add to your data -data <- cbind(data, dummy_df) - -data <- data %>% select(-starts_with("edu_highschool")) - -#### MODEL 1 - DGEN #### - -model_DGEN <- lm(eohiDGEN_mean ~ edu_trade + edu_college + edu_uni_undergrad + - edu_uni_masters + edu_uni_phd + edu_prof, data = data) - -# Model 1 diagnostics -par(mfrow = c(2, 2)) -plot(model_DGEN, which = 1) # Residuals vs Fitted -plot(model_DGEN, which = 2) # Normal Q-Q, normality -hist(residuals(model_DGEN), main = "Histogram of Residuals", xlab = "Residuals") -shapiro.test(residuals(model_DGEN)) - -plot(model_DGEN, which = 3) # Scale-Location -plot(model_DGEN, which = 4) # Cook's Distance - -# Model 1 specific tests -vif(model_DGEN) # Multicollinearity -dwtest(model_DGEN) # Independence -outlierTest(model_DGEN) # Outliers - -# Look at the specific influential cases -data[c(670, 388, 760), ] - -# 6 outliers: 670, 388, 760, 258, 873, 1030; acknoledge their presence but also they represent ~0.58% of total sample size, which is well below the 5% of outliers that would be considered acceptable. -# heterescedasticity: may be d/t binary vars creating discrete clusters, or d/t real heteroscedasticity. -# normality violated but sample size is robust to violation -# no multicollinearity -# no autocorrelation (samples are independent) - -#results -print(summary(model_DGEN)) -print(AIC(model_DGEN)) - -# Create a nice formatted table -stargazer(model_DGEN, type = "text", - title = "Regression Results: Education and EOHI-DGEN", - dep.var.labels = "EOHI-DGEN Mean", - covariate.labels = c("Trade School", "College", "University Undergrad", - "University Masters", "University PhD", "Professional Degree"), - report = "vcsp", - add.lines = list(c("AIC", round(AIC(model_DGEN), 2)))) - -# Use robust standard errors (doesn't change coefficients, just SEs) -modelDGEN_robust <- coeftest(model_DGEN, vcov = vcovHC(model_DGEN, type = "HC3")) - -stargazer(modelDGEN_robust, type = "text", - title = "Regression Results: Education and EOHI-DGEN", - dep.var.labels = "EOHI-DGEN Mean", - covariate.labels = c("Trade School", "College", "University Undergrad", - "University Masters", "University PhD", "Professional Degree"), - report = "vcsp") - - -#### MODEL 2 - DOMAIN #### - -model_domain <- lm(ehi_global_mean ~ edu_trade + edu_college + edu_uni_undergrad + - edu_uni_masters + edu_uni_phd + edu_prof, data = data) - -# ASSUMPTION CHECKING FOR MODEL 2 (model_domain) -plot(model_domain, which = 1) # Residuals vs Fitted - -plot(model_domain, which = 2) # Normal Q-Q, normality -hist(residuals(model_domain), main = "Histogram of Residuals", xlab = "Residuals") -shapiro.test(residuals(model_domain)) - -plot(model_domain, which = 3) # Scale-Location -plot(model_domain, which = 4) # Cook's Distance - -# Model 2 specific tests -vif(model_domain) # Multicollinearity -dwtest(model_domain) # Independence -outlierTest(model_domain) # Outliers - -# Check if the autocorrelation is real or artifactual -# Plot residuals against observation order -plot(residuals(model_domain), type = "l") -abline(h = 0, col = "red") - - -# 6 outliers: acknoledge their presence but also they represent 
~0.58% of total sample size, which is well below the 5% of outliers that would be considered acceptable. -# heterescedasticity: may be d/t binary vars creating discrete clusters, or d/t real heteroscedasticity. -# normality violated but sample size is robust to violation -# no multicollinearity -# auto correlation is significant, may be due to aggregated measure of multiple repeated measures - -# Reset plotting to 1x1 -# par(mfrow = c(1, 1)) - -print(summary(model_domain)) -print(AIC(model_domain)) - -stargazer(model_domain, type = "text", - title = "Regression Results: Education and EHI Domain", - dep.var.labels = "EHI Domain Mean", - covariate.labels = c("Trade School", "College", "University Undergrad", - "University Masters", "University PhD", "Professional Degree"), - report = "vcsp", # This shows coefficients, SEs, and p-values - add.lines = list(c("AIC", round(AIC(model_domain), 2)))) - -# Use robust standard errors (doesn't change coefficients, just SEs) -modelDOMAIN_robust <- coeftest(model_domain, vcov = vcovHC(model_domain, type = "HC3")) - -stargazer(modelDOMAIN_robust, type = "text", - title = "Regression Results: Education and EHI Domain", - dep.var.labels = "EHI Domain Mean", - covariate.labels = c("Trade School", "College", "University Undergrad", - "University Masters", "University PhD", "Professional Degree"), - report = "vcsp") - -#### PLOTS #### - -library(ggplot2) -library(dplyr) - -# Calculate means and confidence intervals for EOHI-DGEN -edu_summary_DGEN <- data %>% - group_by(demo_edu) %>% - summarise( - mean_DGEN = mean(eohiDGEN_mean, na.rm = TRUE), - n = n(), - se_DGEN = sd(eohiDGEN_mean, na.rm = TRUE) / sqrt(n()), - ci_lower_DGEN = mean_DGEN - 1.96 * se_DGEN, - ci_upper_DGEN = mean_DGEN + 1.96 * se_DGEN - ) - -# Calculate means and confidence intervals for EHI Domain -edu_summary_domain <- data %>% - group_by(demo_edu) %>% - summarise( - mean_domain = mean(ehi_global_mean, na.rm = TRUE), - n = n(), - se_domain = sd(ehi_global_mean, na.rm = TRUE) / sqrt(n()), - ci_lower_domain = mean_domain - 1.96 * se_domain, - ci_upper_domain = mean_domain + 1.96 * se_domain - ) - -# Plot 1: EOHI-DGEN means with confidence intervals -p1 <- ggplot(edu_summary_DGEN, aes(x = demo_edu, y = mean_DGEN)) + - geom_point(size = 3, color = "steelblue") + - geom_errorbar(aes(ymin = ci_lower_DGEN, ymax = ci_upper_DGEN), - width = 0.1, color = "steelblue", linewidth = 1) + - labs( - title = "Mean EOHI-DGEN by Education Level", - subtitle = "Error bars show 95% confidence intervals", - x = "Education Level", - y = "Mean EOHI-DGEN Score" - ) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 45, hjust = 1), - plot.title = element_text(size = 14, face = "bold"), - plot.subtitle = element_text(size = 10, color = "gray60") - ) - -# Plot 2: EHI Domain means with confidence intervals -p2 <- ggplot(edu_summary_domain, aes(x = demo_edu, y = mean_domain)) + - geom_point(size = 3, color = "darkgreen") + - geom_errorbar(aes(ymin = ci_lower_domain, ymax = ci_upper_domain), - width = 0.1, color = "darkgreen", linewidth = 1) + - labs( - title = "Mean EHI Domain by Education Level", - subtitle = "Error bars show 95% confidence intervals", - x = "Education Level", - y = "Mean EHI Domain Score" - ) + - theme_minimal() + - theme( - axis.text.x = element_text(angle = 45, hjust = 1), - plot.title = element_text(size = 14, face = "bold"), - plot.subtitle = element_text(size = 10, color = "gray60") - ) - -# Display the plots -print(p1) -print(p2) - -# Save the plots 
-ggsave("education_DGEN_means.png", p1, width = 10, height = 6, dpi = 300) -ggsave("education_domain_means.png", p2, width = 10, height = 6, dpi = 300) \ No newline at end of file diff --git a/.history/eohi1/regression e1 - ehi x sex x age_20251020173352.r b/.history/eohi1/regression e1 - ehi x sex x age_20251020173352.r deleted file mode 100644 index 8b82aee..0000000 --- a/.history/eohi1/regression e1 - ehi x sex x age_20251020173352.r +++ /dev/null @@ -1,15 +0,0 @@ -options(scipen = 999) - -library(dplyr) -library(car) -library(lmtest) -library(stargazer) -library(sandwich) -library(lmtest) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -df <- read.csv("ehi1.csv") - -data <- df %>% - select(eohiDGEN_mean, ehi_global_mean, demo_edu) \ No newline at end of file diff --git a/.history/eohi1/regression e1 - ehi x sex x age_20251020173438.r b/.history/eohi1/regression e1 - ehi x sex x age_20251020173438.r deleted file mode 100644 index 998a8f6..0000000 --- a/.history/eohi1/regression e1 - ehi x sex x age_20251020173438.r +++ /dev/null @@ -1,16 +0,0 @@ -options(scipen = 999) - -library(dplyr) -library(car) -library(lmtest) -library(stargazer) -library(sandwich) -library(lmtest) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -df <- read.csv("ehi1.csv") - -data <- df %>% - select(eohiDGEN_mean, ehi_global_mean, demo_sex, demo_age_1) - diff --git a/.history/eohi1/regression e1 - ehi x sex x age_20251020174241.r b/.history/eohi1/regression e1 - ehi x sex x age_20251020174241.r deleted file mode 100644 index b93f9b5..0000000 --- a/.history/eohi1/regression e1 - ehi x sex x age_20251020174241.r +++ /dev/null @@ -1,26 +0,0 @@ -options(scipen = 999) - -library(dplyr) -library(car) -library(lmtest) -library(stargazer) -library(sandwich) -library(lmtest) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -df <- read.csv("ehi1.csv") - -data <- df %>% - select(eohiDGEN_mean, ehi_global_mean, demo_sex, demo_age_1) %>% - filter(demo_sex != "Prefer not to say") - -str(data) -colSums(is.na(data)) -sapply(data, class) - -# Create dummy variable for sex (0 = Male, 1 = Female) -data$sex_dummy <- ifelse(data$demo_sex == "Female", 1, 0) - -# Verify the dummy coding -print(table(data$demo_sex, data$sex_dummy)) diff --git a/.history/eohi1/regression e1 - ehi x sex x age_20251020174522.r b/.history/eohi1/regression e1 - ehi x sex x age_20251020174522.r deleted file mode 100644 index 1555c12..0000000 --- a/.history/eohi1/regression e1 - ehi x sex x age_20251020174522.r +++ /dev/null @@ -1,39 +0,0 @@ -options(scipen = 999) - -library(dplyr) -library(car) -library(lmtest) -library(stargazer) -library(sandwich) -library(lmtest) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -df <- read.csv("ehi1.csv") - -data <- df %>% - select(eohiDGEN_mean, ehi_global_mean, demo_sex, demo_age_1) %>% - filter(demo_sex != "Prefer not to say") - -str(data) -colSums(is.na(data)) -sapply(data, class) - -# Create dummy variable for sex (0 = Male, 1 = Female) -data$sex_dummy <- ifelse(data$demo_sex == "Female", 1, 0) - -# Verify the dummy coding -print(table(data$demo_sex, data$sex_dummy)) - -#descriptives - -# Descriptives for age -print(summary(data$demo_age_1)) -sd(data$demo_age_1, na.rm = TRUE) - -# Descriptives for sex (frequency table) -print(table(data$demo_sex)) -print(prop.table(table(data$demo_sex))) - -# Descriptives for sex dummy variable -print(table(data$sex_dummy)) \ No newline at end of file diff --git a/.history/eohi1/regression e1 - ehi x sex x age_20251020175347.r b/.history/eohi1/regression e1 - ehi 
x sex x age_20251020175347.r deleted file mode 100644 index 0463ae9..0000000 --- a/.history/eohi1/regression e1 - ehi x sex x age_20251020175347.r +++ /dev/null @@ -1,96 +0,0 @@ -options(scipen = 999) - -library(dplyr) -library(car) -library(lmtest) -library(stargazer) -library(sandwich) -library(lmtest) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -df <- read.csv("ehi1.csv") - -data <- df %>% - select(eohiDGEN_mean, ehi_global_mean, demo_sex, demo_age_1) %>% - filter(demo_sex != "Prefer not to say") - -str(data) -colSums(is.na(data)) -sapply(data, class) - -# Create dummy variable for sex (0 = Male, 1 = Female) -data$sex_dummy <- ifelse(data$demo_sex == "Female", 1, 0) - -# Verify the dummy coding -print(table(data$demo_sex, data$sex_dummy)) - -#descriptives - -# Descriptives for age -print(summary(data$demo_age_1)) -print(sd(data$demo_age_1, na.rm = TRUE)) - -# Center demo_age_1 (subtract the mean) -data$age_centered <- data$demo_age_1 - mean(data$demo_age_1, na.rm = TRUE) - -# Verify the centering -print(summary(data$age_centered)) - -# Descriptives for sex (frequency table) -print(table(data$demo_sex)) -print(prop.table(table(data$demo_sex))) - -# Descriptives for sex dummy variable -print(table(data$sex_dummy)) - -#### REGRESSION MODELS #### -# MODEL 1: Age only - EOHI -age_DGEN <- lm(eohiDGEN_mean ~ age_centered, data = data) -par(mfrow = c(2, 2)) -plot(age_DGEN) -shapiro.test(residuals(age_DGEN)) -print(summary(age_DGEN)) -print(AIC(age_DGEN)) - -# MODEL 1: Age only - EHI -age_domain <- lm(ehi_global_mean ~ age_centered, data = data) -par(mfrow = c(2, 2)) -plot(age_domain) -shapiro.test(residuals(age_domain)) -print(summary(age_domain)) -print(AIC(age_domain)) - -# MODEL 2: Sex only - EOHI -sex_DGEN <- lm(eohiDGEN_mean ~ sex_dummy, data = data) -par(mfrow = c(2, 2)) -plot(sex_DGEN) -shapiro.test(residuals(sex_DGEN)) -print(summary(sex_DGEN)) -print(AIC(sex_DGEN)) - -# MODEL 2: Sex only - EHI -sex_domain <- lm(ehi_global_mean ~ sex_dummy, data = data) -par(mfrow = c(2, 2)) -plot(sex_domain) -shapiro.test(residuals(sex_domain)) -print(summary(sex_domain)) -print(AIC(sex_domain)) - -# MODEL 3: Age + Sex + Interaction - EOHI -interaction_DGEN <- lm(eohiDGEN_mean ~ age_centered + sex_dummy + age_centered:sex_dummy, data = data) -par(mfrow = c(2, 2)) -plot(interaction_DGEN) -shapiro.test(residuals(interaction_DGEN)) -vif(interaction_DGEN) -print(summary(interaction_DGEN)) -print(AIC(interaction_DGEN)) - -# MODEL 3: Age + Sex + Interaction - EHI -interaction_domain <- lm(ehi_global_mean ~ age_centered + sex_dummy + age_centered:sex_dummy, data = data) -par(mfrow = c(2, 2)) -plot(interaction_domain) -shapiro.test(residuals(interaction_domain)) -vif(interaction_domain) -print(summary(interaction_domain)) -print(AIC(interaction_domain)) \ No newline at end of file diff --git a/.history/eohi1/regression e1 - ehi x sex x age_20251020180330.r b/.history/eohi1/regression e1 - ehi x sex x age_20251020180330.r deleted file mode 100644 index f060580..0000000 --- a/.history/eohi1/regression e1 - ehi x sex x age_20251020180330.r +++ /dev/null @@ -1,96 +0,0 @@ -options(scipen = 999) - -library(dplyr) -library(car) -library(lmtest) -library(stargazer) -library(sandwich) -library(lmtest) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -df <- read.csv("ehi1.csv") - -data <- df %>% - select(eohiDGEN_mean, ehi_global_mean, demo_sex, demo_age_1) %>% - filter(demo_sex != "Prefer not to say") - -str(data) -colSums(is.na(data)) -sapply(data, class) - -# Create dummy variable for sex (0 = Male, 1 
= Female) -data$sex_dummy <- ifelse(data$demo_sex == "Female", 1, 0) - -# Verify the dummy coding -print(table(data$demo_sex, data$sex_dummy)) - -#descriptives - -# Descriptives for age -print(summary(data$demo_age_1)) -print(sd(data$demo_age_1, na.rm = TRUE)) - -# Center demo_age_1 (subtract the mean) -data$age_centered <- data$demo_age_1 - mean(data$demo_age_1, na.rm = TRUE) - -# Verify the centering -print(summary(data$age_centered)) - -# Descriptives for sex (frequency table) -print(table(data$demo_sex)) -print(prop.table(table(data$demo_sex))) - -# Descriptives for sex dummy variable -print(table(data$sex_dummy)) - -#### REGRESSION MODELS #### -# MODEL 1: Age only - EOHI -age_DGEN <- lm(eohiDGEN_mean ~ age_centered, data = data) -par(mfrow = c(2, 2)) -plot(age_DGEN) -print(shapiro.test(residuals(age_DGEN))) -print(summary(age_DGEN)) -print(AIC(age_DGEN)) - -# MODEL 1: Age only - EHI -age_domain <- lm(ehi_global_mean ~ age_centered, data = data) -par(mfrow = c(2, 2)) -plot(age_domain) -print(shapiro.test(residuals(age_domain))) -print(summary(age_domain)) -print(AIC(age_domain)) - -# MODEL 2: Sex only - EOHI -sex_DGEN <- lm(eohiDGEN_mean ~ sex_dummy, data = data) -par(mfrow = c(2, 2)) -plot(sex_DGEN) -print(shapiro.test(residuals(sex_DGEN))) -print(summary(sex_DGEN)) -print(AIC(sex_DGEN)) - -# MODEL 2: Sex only - EHI -sex_domain <- lm(ehi_global_mean ~ sex_dummy, data = data) -par(mfrow = c(2, 2)) -plot(sex_domain) -print(shapiro.test(residuals(sex_domain))) -print(summary(sex_domain)) -print(AIC(sex_domain)) - -# MODEL 3: Age + Sex + Interaction - EOHI -interaction_DGEN <- lm(eohiDGEN_mean ~ age_centered + sex_dummy + age_centered:sex_dummy, data = data) -par(mfrow = c(2, 2)) -plot(interaction_DGEN) -print(shapiro.test(residuals(interaction_DGEN))) -vif(interaction_DGEN) -print(summary(interaction_DGEN)) -print(AIC(interaction_DGEN)) - -# MODEL 3: Age + Sex + Interaction - EHI -interaction_domain <- lm(ehi_global_mean ~ age_centered + sex_dummy + age_centered:sex_dummy, data = data) -par(mfrow = c(2, 2)) -plot(interaction_domain) -print(shapiro.test(residuals(interaction_domain))) -vif(interaction_domain) -print(summary(interaction_domain)) -print(AIC(interaction_domain)) \ No newline at end of file diff --git a/.history/eohi1/regression e1 - ehi x sex x age_20251021102925.r b/.history/eohi1/regression e1 - ehi x sex x age_20251021102925.r deleted file mode 100644 index 62a3f43..0000000 --- a/.history/eohi1/regression e1 - ehi x sex x age_20251021102925.r +++ /dev/null @@ -1,100 +0,0 @@ -options(scipen = 999) - -library(dplyr) -library(car) -library(lmtest) -library(stargazer) -library(sandwich) -library(lmtest) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -df <- read.csv("ehi1.csv") - -data <- df %>% - select(eohiDGEN_mean, ehi_global_mean, demo_sex, demo_age_1) %>% - filter(demo_sex != "Prefer not to say") - -str(data) -colSums(is.na(data)) -sapply(data, class) - -# Create dummy variable for sex (0 = Male, 1 = Female) -data$sex_dummy <- ifelse(data$demo_sex == "Female", 1, 0) - -# Verify the dummy coding -print(table(data$demo_sex, data$sex_dummy)) - -#descriptives - -# Descriptives for age -print(summary(data$demo_age_1)) -print(sd(data$demo_age_1, na.rm = TRUE)) - -# Center demo_age_1 (subtract the mean) -data$age_centered <- data$demo_age_1 - mean(data$demo_age_1, na.rm = TRUE) - -# Verify the centering -print(summary(data$age_centered)) - -# Descriptives for sex (frequency table) -print(table(data$demo_sex)) -print(prop.table(table(data$demo_sex))) - -# Descriptives for 
sex dummy variable -print(table(data$sex_dummy)) - -#### REGRESSION MODELS #### -# MODEL 1: Age only - EOHI -age_DGEN <- lm(eohiDGEN_mean ~ age_centered, data = data) -par(mfrow = c(2, 2)) -plot(age_DGEN) -print(shapiro.test(residuals(age_DGEN))) -print(summary(age_DGEN)) -print(AIC(age_DGEN)) - -# MODEL 1: Age only - EHI -age_domain <- lm(ehi_global_mean ~ age_centered, data = data) -par(mfrow = c(2, 2)) -plot(age_domain) -print(shapiro.test(residuals(age_domain))) -print(summary(age_domain)) -print(AIC(age_domain)) - -# MODEL 2: Sex only - EOHI -sex_DGEN <- lm(eohiDGEN_mean ~ sex_dummy, data = data) -par(mfrow = c(2, 2)) -plot(sex_DGEN) -print(shapiro.test(residuals(sex_DGEN))) -print(summary(sex_DGEN)) -print(AIC(sex_DGEN)) -# P1 (res vs fitted) + P3 (scale location): test for homoscedasticity. relatively flat red line = homoscedasticity. relatively scattered points = homoscedasticity. this assumption is met. -# P2 (qq plot): test for normality. points scattered around a relatively straight line = normality. this assumption is violated but large sample is robust. -# P4 (residuals vs leverage): test for outliers. high leverage points = outliers. leverage > 2p/n. - # p = parameters; for this model p = 2 (intercept + sex_dummy). n = 1061 (removed prefer not to say). threshold = 2*2/1061 = 0.00377. maximum leverage in plot is ~ 0.002 therefore no points have concerning leverage. -# across the plots, there are 3 outliers: 258, 670, 872. this represents 0.28% of the data (much less than the acceptable threshold of 5%). therefore, analysis can proceed. -# MODEL 2: Sex only - EHI -sex_domain <- lm(ehi_global_mean ~ sex_dummy, data = data) -par(mfrow = c(2, 2)) -plot(sex_domain) -print(shapiro.test(residuals(sex_domain))) -print(summary(sex_domain)) -print(AIC(sex_domain)) - -# MODEL 3: Age + Sex + Interaction - EOHI -interaction_DGEN <- lm(eohiDGEN_mean ~ age_centered + sex_dummy + age_centered:sex_dummy, data = data) -par(mfrow = c(2, 2)) -plot(interaction_DGEN) -print(shapiro.test(residuals(interaction_DGEN))) -vif(interaction_DGEN) -print(summary(interaction_DGEN)) -print(AIC(interaction_DGEN)) - -# MODEL 3: Age + Sex + Interaction - EHI -interaction_domain <- lm(ehi_global_mean ~ age_centered + sex_dummy + age_centered:sex_dummy, data = data) -par(mfrow = c(2, 2)) -plot(interaction_domain) -print(shapiro.test(residuals(interaction_domain))) -vif(interaction_domain) -print(summary(interaction_domain)) -print(AIC(interaction_domain)) \ No newline at end of file diff --git a/.history/eohi1/regression e1 - ehi x sex x age_20251021104421.r b/.history/eohi1/regression e1 - ehi x sex x age_20251021104421.r deleted file mode 100644 index 29c9873..0000000 --- a/.history/eohi1/regression e1 - ehi x sex x age_20251021104421.r +++ /dev/null @@ -1,101 +0,0 @@ -options(scipen = 999) - -library(dplyr) -library(car) -library(lmtest) -library(stargazer) -library(sandwich) -library(lmtest) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -df <- read.csv("ehi1.csv") - -data <- df %>% - select(eohiDGEN_mean, ehi_global_mean, demo_sex, demo_age_1) %>% - filter(demo_sex != "Prefer not to say") - -str(data) -colSums(is.na(data)) -sapply(data, class) - -# Create dummy variable for sex (0 = Male, 1 = Female) -data$sex_dummy <- ifelse(data$demo_sex == "Female", 1, 0) - -# Verify the dummy coding -print(table(data$demo_sex, data$sex_dummy)) - -#descriptives - -# Descriptives for age -print(summary(data$demo_age_1)) -print(sd(data$demo_age_1, na.rm = TRUE)) - -# Center demo_age_1 (subtract the mean) 
-data$age_centered <- data$demo_age_1 - mean(data$demo_age_1, na.rm = TRUE) - -# Verify the centering -print(summary(data$age_centered)) - -# Descriptives for sex (frequency table) -print(table(data$demo_sex)) -print(prop.table(table(data$demo_sex))) - -# Descriptives for sex dummy variable -print(table(data$sex_dummy)) - -#### REGRESSION MODELS #### -# MODEL 1: Age only - EOHI -age_DGEN <- lm(eohiDGEN_mean ~ age_centered, data = data) -par(mfrow = c(2, 2)) -plot(age_DGEN) -print(shapiro.test(residuals(age_DGEN))) -print(summary(age_DGEN)) -print(AIC(age_DGEN)) - -# MODEL 1: Age only - EHI -age_domain <- lm(ehi_global_mean ~ age_centered, data = data) -par(mfrow = c(2, 2)) -plot(age_domain) -print(shapiro.test(residuals(age_domain))) -print(summary(age_domain)) -print(AIC(age_domain)) - -# MODEL 2: Sex only - EOHI -sex_DGEN <- lm(eohiDGEN_mean ~ sex_dummy, data = data) -par(mfrow = c(2, 2)) -plot(sex_DGEN) -print(shapiro.test(residuals(sex_DGEN))) -print(summary(sex_DGEN)) -print(AIC(sex_DGEN)) -# P1 (res vs fitted) + P3 (scale location): test for homoscedasticity. relatively flat red line = homoscedasticity. relatively scattered points = homoscedasticity. this assumption is met. -# P2 (qq plot): test for normality. points scattered around a relatively straight line = normality. this assumption is violated but large sample is robust. -# P4 (residuals vs leverage): test for outliers. high leverage points = outliers. leverage > 2p/n. - # p = parameters; for this model p = 2 (intercept + sex_dummy). n = 1061 (removed prefer not to say). threshold = 2*2/1061 = 0.00377. maximum leverage in plot is ~ 0.002 therefore no points have concerning leverage. -# across the plots, there are 3 outliers: 258, 670, 872. this represents 0.28% of the data (much less than the acceptable threshold of 5%). therefore, analysis can proceed. 
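# The 2p/n leverage rule in the note above can be verified numerically rather
# than read off the residuals-vs-leverage panel. A minimal sketch, assuming the
# sex_DGEN model object fitted above (base R only; the printed values are
# illustrative, not results from the actual data):
h <- hatvalues(sex_DGEN)            # leverage of each observation
p <- length(coef(sex_DGEN))         # parameters: intercept + sex_dummy = 2
n <- nobs(sex_DGEN)                 # observations actually used by the model
lev_threshold <- 2 * p / n          # e.g. 2 * 2 / 1061 ~ 0.00377
high_lev <- which(h > lev_threshold)
cat("Leverage threshold:", round(lev_threshold, 5), "\n")
cat("Observations above threshold:", length(high_lev), "\n")
# With a single binary predictor, all observations in the same sex group share
# one leverage value (1 / group size), so this check mainly confirms that
# neither group is small enough to push leverage past the cutoff.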
- -# MODEL 2: Sex only - EHI -sex_domain <- lm(ehi_global_mean ~ sex_dummy, data = data) -par(mfrow = c(2, 2)) -plot(sex_domain) -print(shapiro.test(residuals(sex_domain))) -print(summary(sex_domain)) -print(AIC(sex_domain)) - -# MODEL 3: Age + Sex + Interaction - EOHI -interaction_DGEN <- lm(eohiDGEN_mean ~ age_centered + sex_dummy + age_centered:sex_dummy, data = data) -par(mfrow = c(2, 2)) -plot(interaction_DGEN) -print(shapiro.test(residuals(interaction_DGEN))) -vif(interaction_DGEN) -print(summary(interaction_DGEN)) -print(AIC(interaction_DGEN)) - -# MODEL 3: Age + Sex + Interaction - EHI -interaction_domain <- lm(ehi_global_mean ~ age_centered + sex_dummy + age_centered:sex_dummy, data = data) -par(mfrow = c(2, 2)) -plot(interaction_domain) -print(shapiro.test(residuals(interaction_domain))) -vif(interaction_domain) -print(summary(interaction_domain)) -print(AIC(interaction_domain)) \ No newline at end of file diff --git a/.history/eohi1/regression e1 - ehi x sex x age_20251021111526.r b/.history/eohi1/regression e1 - ehi x sex x age_20251021111526.r deleted file mode 100644 index 52fd677..0000000 --- a/.history/eohi1/regression e1 - ehi x sex x age_20251021111526.r +++ /dev/null @@ -1,105 +0,0 @@ -options(scipen = 999) - -library(dplyr) -library(car) -library(lmtest) -library(stargazer) -library(sandwich) -library(lmtest) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -df <- read.csv("ehi1.csv") - -data <- df %>% - select(eohiDGEN_mean, ehi_global_mean, demo_sex, demo_age_1) %>% - filter(demo_sex != "Prefer not to say") - -str(data) -colSums(is.na(data)) -sapply(data, class) - -# Create dummy variable for sex (0 = Male, 1 = Female) -data$sex_dummy <- ifelse(data$demo_sex == "Female", 1, 0) - -# Verify the dummy coding -print(table(data$demo_sex, data$sex_dummy)) - -#descriptives - -# Descriptives for age -print(summary(data$demo_age_1)) -print(sd(data$demo_age_1, na.rm = TRUE)) - -# Center demo_age_1 (subtract the mean) -data$age_centered <- data$demo_age_1 - mean(data$demo_age_1, na.rm = TRUE) - -# Verify the centering -print(summary(data$age_centered)) - -# Descriptives for sex (frequency table) -print(table(data$demo_sex)) -print(prop.table(table(data$demo_sex))) - -# Descriptives for sex dummy variable -print(table(data$sex_dummy)) - -#### REGRESSION MODELS #### -# MODEL 1: Age only - EOHI -age_DGEN <- lm(eohiDGEN_mean ~ age_centered, data = data) -par(mfrow = c(2, 2)) -plot(age_DGEN) -print(shapiro.test(residuals(age_DGEN))) -print(summary(age_DGEN)) -print(AIC(age_DGEN)) - -# MODEL 1: Age only - EHI -age_domain <- lm(ehi_global_mean ~ age_centered, data = data) -par(mfrow = c(2, 2)) -plot(age_domain) -print(shapiro.test(residuals(age_domain))) -print(summary(age_domain)) -print(AIC(age_domain)) - -# MODEL 2: Sex only - EOHI -sex_DGEN <- lm(eohiDGEN_mean ~ sex_dummy, data = data) -par(mfrow = c(2, 2)) -plot(sex_DGEN) -print(shapiro.test(residuals(sex_DGEN))) -print(summary(sex_DGEN)) -print(AIC(sex_DGEN)) -# P1 (res vs fitted) + P3 (scale location): test for homoscedasticity. relatively flat red line = homoscedasticity. relatively scattered points = homoscedasticity. this assumption is met. -# P2 (qq plot): test for normality. points scattered around a relatively straight line = normality. this assumption is violated but large sample is robust. -# P4 (residuals vs leverage): test for outliers. high leverage points = outliers. leverage > 2p/n. - # p = parameters; for this model p = 2 (intercept + sex_dummy). n = 1061 (removed prefer not to say). 
threshold = 2*2/1061 = 0.00377. maximum leverage in plot is ~ 0.002 therefore no points have concerning leverage. -# across the plots, there are 3 outliers: 258, 670, 872. this represents 0.28% of the data (much less than the acceptable threshold of 5%). therefore, analysis can proceed. - -# MODEL 2: Sex only - EHI -sex_domain <- lm(ehi_global_mean ~ sex_dummy, data = data) -par(mfrow = c(2, 2)) -plot(sex_domain) -print(shapiro.test(residuals(sex_domain))) -print(summary(sex_domain)) -print(AIC(sex_domain)) - -# MODEL 3: Age + Sex + Interaction - EOHI -interaction_DGEN <- lm(eohiDGEN_mean ~ age_centered + sex_dummy + age_centered:sex_dummy, data = data) -par(mfrow = c(2, 2)) -plot(interaction_DGEN) -print(shapiro.test(residuals(interaction_DGEN))) -print(vif(interaction_DGEN)) -print(summary(interaction_DGEN)) -print(AIC(interaction_DGEN)) - -# MODEL 3: Age + Sex + Interaction - EHI -interaction_domain <- lm(ehi_global_mean ~ age_centered + sex_dummy + age_centered:sex_dummy, data = data) -par(mfrow = c(2, 2)) -plot(interaction_domain) -print(shapiro.test(residuals(interaction_domain))) -vif(interaction_domain) -print(summary(interaction_domain)) -print(AIC(interaction_domain)) - -#### troubleshooting #### -# Clear any existing plots -dev.off() \ No newline at end of file diff --git a/.history/eohi1/regression e1 - ehi x sex x age_20251021120315.r b/.history/eohi1/regression e1 - ehi x sex x age_20251021120315.r deleted file mode 100644 index e93fea0..0000000 --- a/.history/eohi1/regression e1 - ehi x sex x age_20251021120315.r +++ /dev/null @@ -1,151 +0,0 @@ -options(scipen = 999) - -library(dplyr) -library(car) -library(lmtest) -library(stargazer) -library(sandwich) -library(lmtest) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -df <- read.csv("ehi1.csv") - -data <- df %>% - select(eohiDGEN_mean, ehi_global_mean, demo_sex, demo_age_1) %>% - filter(demo_sex != "Prefer not to say") - -str(data) -colSums(is.na(data)) -sapply(data, class) - -# Create dummy variable for sex (0 = Male, 1 = Female) -data$sex_dummy <- ifelse(data$demo_sex == "Female", 1, 0) - -# Verify the dummy coding -print(table(data$demo_sex, data$sex_dummy)) - -#descriptives - -# Descriptives for age -print(summary(data$demo_age_1)) -print(sd(data$demo_age_1, na.rm = TRUE)) - -# Center demo_age_1 (subtract the mean) -data$age_centered <- data$demo_age_1 - mean(data$demo_age_1, na.rm = TRUE) - -# Verify the centering -print(summary(data$age_centered)) - -# Descriptives for sex (frequency table) -print(table(data$demo_sex)) -print(prop.table(table(data$demo_sex))) - -# Descriptives for sex dummy variable -print(table(data$sex_dummy)) - -#### REGRESSION MODELS #### -# MODEL 1: Age only - EOHI -age_DGEN <- lm(eohiDGEN_mean ~ age_centered, data = data) -par(mfrow = c(2, 2)) -plot(age_DGEN) -print(shapiro.test(residuals(age_DGEN))) -print(summary(age_DGEN)) -print(AIC(age_DGEN)) - -# MODEL 1: Age only - EHI -age_domain <- lm(ehi_global_mean ~ age_centered, data = data) -par(mfrow = c(2, 2)) -plot(age_domain) -print(shapiro.test(residuals(age_domain))) -print(summary(age_domain)) -print(AIC(age_domain)) - -# MODEL 2: Sex only - EOHI -sex_DGEN <- lm(eohiDGEN_mean ~ sex_dummy, data = data) -par(mfrow = c(2, 2)) -plot(sex_DGEN) -print(shapiro.test(residuals(sex_DGEN))) -print(summary(sex_DGEN)) -print(AIC(sex_DGEN)) -# P1 (res vs fitted) + P3 (scale location): test for homoscedasticity. relatively flat red line = homoscedasticity. relatively scattered points = homoscedasticity. this assumption is met. 
-# P2 (qq plot): test for normality. points scattered around a relatively straight line = normality. this assumption is violated but large sample is robust. -# P4 (residuals vs leverage): test for outliers. high leverage points = outliers. leverage > 2p/n. - # p = parameters; for this model p = 2 (intercept + sex_dummy). n = 1061 (removed prefer not to say). threshold = 2*2/1061 = 0.00377. maximum leverage in plot is ~ 0.002 therefore no points have concerning leverage. -# across the plots, there are 3 outliers: 258, 670, 872. this represents 0.28% of the data (much less than the acceptable threshold of 5%). therefore, analysis can proceed. - -# MODEL 2: Sex only - EHI -sex_domain <- lm(ehi_global_mean ~ sex_dummy, data = data) -par(mfrow = c(2, 2)) -plot(sex_domain) -print(shapiro.test(residuals(sex_domain))) -print(summary(sex_domain)) -print(AIC(sex_domain)) - -# MODEL 3: Age + Sex + Interaction - EOHI -interaction_DGEN <- lm(eohiDGEN_mean ~ age_centered + sex_dummy + age_centered:sex_dummy, data = data) -par(mfrow = c(2, 2)) -plot(interaction_DGEN) -print(shapiro.test(residuals(interaction_DGEN))) -vif_DGEN <- vif(interaction_DGEN) -print(vif_DGEN) -print(summary(interaction_DGEN)) -print(AIC(interaction_DGEN)) - -# MODEL 3: Age + Sex + Interaction - EHI -interaction_domain <- lm(ehi_global_mean ~ age_centered + sex_dummy + age_centered:sex_dummy, data = data) -par(mfrow = c(2, 2)) -plot(interaction_domain) -print(shapiro.test(residuals(interaction_domain))) -vif_domain <- vif(interaction_domain) -print(vif_domain) -print(summary(interaction_domain)) -print(AIC(interaction_domain)) - -#### troubleshooting #### -# Clear any existing plots -# dev.off() - -#### printing to HTML #### - -# Create an HTML table for your model results -max_vif_DGEN <- round(max(vif_DGEN), 2) - -stargazer( - age_DGEN, sex_DGEN, interaction_DGEN, - type = "html", - out = "regression_DGEN_models.html", - title = "Regression Models: EOHI-DGEN", - dep.var.labels = "EOHI-DGEN Mean", - covariate.labels = c("Age (centered)", "Sex (dummy)", "Age x Sex"), - report = "vcsp*", - add.lines = list( - c("AIC", - round(AIC(age_DGEN), 2), - round(AIC(sex_DGEN), 2), - round(AIC(interaction_DGEN), 2)), - c("Max VIF", "N/A", "N/A", max_vif_DGEN) - ) -) - -max_vif_domain <- round(max(vif_domain), 2) - -stargazer( - age_domain, sex_domain, interaction_domain, - type = "html", - out = "regression_EHI_domain_models.html", - title = "Regression Models: EHI Domain", - dep.var.labels = "EHI Domain Mean", - covariate.labels = c("Age (centered)", "Sex (dummy)", "Age x Sex"), - report = "vcsp*", - add.lines = list( - c("AIC", - round(AIC(age_domain), 2), - round(AIC(sex_domain), 2), - round(AIC(interaction_domain), 2)), - c("Max VIF", - "NA", - "NA", - max_vif_domain) - ) -) diff --git a/.history/eohi1/regression e1 - ehi x sex x age_20251023105759.r b/.history/eohi1/regression e1 - ehi x sex x age_20251023105759.r deleted file mode 100644 index e7db6aa..0000000 --- a/.history/eohi1/regression e1 - ehi x sex x age_20251023105759.r +++ /dev/null @@ -1,320 +0,0 @@ -options(scipen = 999) - -library(dplyr) -library(car) -library(lmtest) -library(stargazer) -library(sandwich) -library(lmtest) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -df <- read.csv("ehi1.csv") - -data <- df %>% - select(eohiDGEN_mean, ehi_global_mean, demo_sex, demo_age_1) %>% - filter(demo_sex != "Prefer not to say") - -str(data) -colSums(is.na(data)) -sapply(data, class) - -# Create dummy variable for sex (0 = Male, 1 = Female) -data$sex_dummy <- 
ifelse(data$demo_sex == "Female", 1, 0) - -# Verify the dummy coding -print(table(data$demo_sex, data$sex_dummy)) - -#descriptives - -# Descriptives for age -print(summary(data$demo_age_1)) -print(sd(data$demo_age_1, na.rm = TRUE)) - -# Center demo_age_1 (subtract the mean) -data$age_centered <- data$demo_age_1 - mean(data$demo_age_1, na.rm = TRUE) - -# Verify the centering -print(summary(data$age_centered)) - -# Descriptives for sex (frequency table) -print(table(data$demo_sex)) -print(prop.table(table(data$demo_sex))) - -# Descriptives for sex dummy variable -print(table(data$sex_dummy)) - -#### REGRESSION MODELS #### -# MODEL 1: Age only - EOHI -age_DGEN <- lm(eohiDGEN_mean ~ age_centered, data = data) -par(mfrow = c(2, 2)) -plot(age_DGEN) -print(shapiro.test(residuals(age_DGEN))) -print(summary(age_DGEN)) -print(AIC(age_DGEN)) - -# MODEL 1: Age only - EHI -age_domain <- lm(ehi_global_mean ~ age_centered, data = data) -par(mfrow = c(2, 2)) -plot(age_domain) -print(shapiro.test(residuals(age_domain))) -print(summary(age_domain)) -print(AIC(age_domain)) - -# MODEL 2: Sex only - EOHI -sex_DGEN <- lm(eohiDGEN_mean ~ sex_dummy, data = data) -par(mfrow = c(2, 2)) -plot(sex_DGEN) -print(shapiro.test(residuals(sex_DGEN))) -print(summary(sex_DGEN)) -print(AIC(sex_DGEN)) -# P1 (res vs fitted) + P3 (scale location): test for homoscedasticity. relatively flat red line = homoscedasticity. relatively scattered points = homoscedasticity. this assumption is met. -# P2 (qq plot): test for normality. points scattered around a relatively straight line = normality. this assumption is violated but large sample is robust. -# P4 (residuals vs leverage): test for outliers. high leverage points = outliers. leverage > 2p/n. - # p = parameters; for this model p = 2 (intercept + sex_dummy). n = 1061 (removed prefer not to say). threshold = 2*2/1061 = 0.00377. maximum leverage in plot is ~ 0.002 therefore no points have concerning leverage. -# across the plots, there are 3 outliers: 258, 670, 872. this represents 0.28% of the data (much less than the acceptable threshold of 5%). therefore, analysis can proceed. 
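# The outlier-share argument in the note above (3 of 1061 = 0.28%, under the 5%
# rule of thumb) can likewise be computed rather than counted by hand. A sketch
# assuming the sex_DGEN model and the car package loaded at the top of this
# script; the |t| > 3 cutoff is one common convention, not the only defensible
# choice:
rs <- rstudent(sex_DGEN)                      # studentized residuals
flagged <- which(abs(rs) > 3)                 # candidate outliers
share <- length(flagged) / nobs(sex_DGEN)     # proportion of the sample
cat("Flagged outliers:", length(flagged),
    sprintf("(%.2f%% of n)", 100 * share), "\n")
print(outlierTest(sex_DGEN))                  # Bonferroni test for the most extreme residual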
- -# MODEL 2: Sex only - EHI -sex_domain <- lm(ehi_global_mean ~ sex_dummy, data = data) -par(mfrow = c(2, 2)) -plot(sex_domain) -print(shapiro.test(residuals(sex_domain))) -print(summary(sex_domain)) -print(AIC(sex_domain)) - -# MODEL 3: Age + Sex + Interaction - EOHI -interaction_DGEN <- lm(eohiDGEN_mean ~ age_centered + sex_dummy + age_centered:sex_dummy, data = data) -par(mfrow = c(2, 2)) -plot(interaction_DGEN) -print(shapiro.test(residuals(interaction_DGEN))) -vif_DGEN <- vif(interaction_DGEN) -print(vif_DGEN) -print(summary(interaction_DGEN)) -print(AIC(interaction_DGEN)) - -# MODEL 3: Age + Sex + Interaction - EHI -interaction_domain <- lm(ehi_global_mean ~ age_centered + sex_dummy + age_centered:sex_dummy, data = data) -par(mfrow = c(2, 2)) -plot(interaction_domain) -print(shapiro.test(residuals(interaction_domain))) -vif_domain <- vif(interaction_domain) -print(vif_domain) -print(summary(interaction_domain)) -print(AIC(interaction_domain)) - -#### troubleshooting #### -# Clear any existing plots -# dev.off() - - -#### PLOTS #### - -# Create visual figures for age models -library(ggplot2) - -# Figure 1: age_DGEN model -p1 <- ggplot(data, aes(x = age_centered, y = eohiDGEN_mean)) + - geom_point(alpha = 0.6, color = "steelblue") + - geom_smooth(method = "lm", se = TRUE, color = "red", linewidth = 1) + - labs( - title = "Age and EOHI-DGEN Relationship", - x = "Age (centered)", - y = "EOHI-DGEN Mean", - subtitle = paste("R² =", round(summary(age_DGEN)$r.squared, 3), - ", p < 0.001") - ) + - theme_minimal() + - theme( - plot.title = element_text(size = 14, face = "bold"), - axis.title = element_text(size = 12), - plot.subtitle = element_text(size = 10, color = "gray60") - ) - -# Figure 2: age_domain model -p2 <- ggplot(data, aes(x = age_centered, y = ehi_global_mean)) + - geom_point(alpha = 0.6, color = "darkgreen") + - geom_smooth(method = "lm", se = TRUE, color = "red", linewidth = 1) + - labs( - title = "Age and EHI Domain Relationship", - x = "Age (centered)", - y = "EHI Domain Mean", - subtitle = paste("R² =", round(summary(age_domain)$r.squared, 3), - ", p < 0.001") - ) + - theme_minimal() + - theme( - plot.title = element_text(size = 14, face = "bold"), - axis.title = element_text(size = 12), - plot.subtitle = element_text(size = 10, color = "gray60") - ) - -# Save the plots -ggsave("age_DGEN_plot.png", p1, width = 8, height = 6, dpi = 300) -ggsave("age_domain_plot.png", p2, width = 8, height = 6, dpi = 300) - -# Display the plots -print(p1) -print(p2) - -#### HTML file #### - -# Create comprehensive HTML report grouped by model -library(htmltools) - -# Start HTML document -html_content <- htmltools::div( - htmltools::h1("Regression Analysis: Age and Sex Effects on EOHI-DGEN and EHI Domain"), - - # EOHI-DGEN Models Section - htmltools::h2("EOHI-DGEN Models"), - - # Age Model - htmltools::h3("1. 
Age Model (age_DGEN)"), - htmltools::div( - style = "margin-bottom: 30px;", - htmltools::h4("Model Results, AIC, and VIF"), - htmltools::HTML( - stargazer( - age_DGEN, - type = "html", - title = "Age Model: EOHI-DGEN", - dep.var.labels = "EOHI-DGEN Mean", - covariate.labels = c("Age (centered)"), - report = "vcsp*", - add.lines = list( - c("AIC", round(AIC(age_DGEN), 2)), - c("Max VIF", "N/A") - ) - ) - ), - htmltools::h4("Assumption Diagnostic Plots"), - htmltools::img(src = "age_DGEN_assumptions.png", style = "width: 100%; max-width: 800px;"), - htmltools::h4("Model Visualization"), - htmltools::img(src = "age_DGEN_plot.png", style = "width: 100%; max-width: 800px;") - ), - - # Sex Model - htmltools::h3("2. Sex Model (sex_DGEN)"), - htmltools::div( - style = "margin-bottom: 30px;", - htmltools::h4("Model Results, AIC, and VIF"), - htmltools::HTML( - stargazer( - sex_DGEN, - type = "html", - title = "Sex Model: EOHI-DGEN", - dep.var.labels = "EOHI-DGEN Mean", - covariate.labels = c("Sex (dummy)"), - report = "vcsp*", - add.lines = list( - c("AIC", round(AIC(sex_DGEN), 2)), - c("Max VIF", "N/A") - ) - ) - ), - htmltools::h4("Assumption Diagnostic Plots"), - htmltools::img(src = "sex_DGEN_assumptions.png", style = "width: 100%; max-width: 800px;") - ), - - # Interaction Model - htmltools::h3("3. Interaction Model (interaction_DGEN)"), - htmltools::div( - style = "margin-bottom: 30px;", - htmltools::h4("Model Results, AIC, and VIF"), - htmltools::HTML( - stargazer( - interaction_DGEN, - type = "html", - title = "Interaction Model: EOHI-DGEN", - dep.var.labels = "EOHI-DGEN Mean", - covariate.labels = c("Age (centered)", "Sex (dummy)", "Age x Sex"), - report = "vcsp*", - add.lines = list( - c("AIC", round(AIC(interaction_DGEN), 2)), - c("Max VIF", max_vif_DGEN) - ) - ) - ), - htmltools::h4("Assumption Diagnostic Plots"), - htmltools::img(src = "interaction_DGEN_assumptions.png", style = "width: 100%; max-width: 800px;") - ), - - # EHI Domain Models Section - htmltools::h2("EHI Domain Models"), - - # Age Model - htmltools::h3("1. Age Model (age_domain)"), - htmltools::div( - style = "margin-bottom: 30px;", - htmltools::h4("Model Results, AIC, and VIF"), - htmltools::HTML( - stargazer( - age_domain, - type = "html", - title = "Age Model: EHI Domain", - dep.var.labels = "EHI Domain Mean", - covariate.labels = c("Age (centered)"), - report = "vcsp*", - add.lines = list( - c("AIC", round(AIC(age_domain), 2)), - c("Max VIF", "N/A") - ) - ) - ), - htmltools::h4("Assumption Diagnostic Plots"), - htmltools::img(src = "age_domain_assumptions.png", style = "width: 100%; max-width: 800px;"), - htmltools::h4("Model Visualization"), - htmltools::img(src = "age_domain_plot.png", style = "width: 100%; max-width: 800px;") - ), - - # Sex Model - htmltools::h3("2. Sex Model (sex_domain)"), - htmltools::div( - style = "margin-bottom: 30px;", - htmltools::h4("Model Results, AIC, and VIF"), - htmltools::HTML( - stargazer( - sex_domain, - type = "html", - title = "Sex Model: EHI Domain", - dep.var.labels = "EHI Domain Mean", - covariate.labels = c("Sex (dummy)"), - report = "vcsp*", - add.lines = list( - c("AIC", round(AIC(sex_domain), 2)), - c("Max VIF", "N/A") - ) - ) - ), - htmltools::h4("Assumption Diagnostic Plots"), - htmltools::img(src = "sex_domain_assumptions.png", style = "width: 100%; max-width: 800px;") - ), - - # Interaction Model - htmltools::h3("3. 
Interaction Model (interaction_domain)"), - htmltools::div( - style = "margin-bottom: 30px;", - htmltools::h4("Model Results, AIC, and VIF"), - htmltools::HTML( - stargazer( - interaction_domain, - type = "html", - title = "Interaction Model: EHI Domain", - dep.var.labels = "EHI Domain Mean", - covariate.labels = c("Age (centered)", "Sex (dummy)", "Age x Sex"), - report = "vcsp*", - add.lines = list( - c("AIC", round(AIC(interaction_domain), 2)), - c("Max VIF", max_vif_domain) - ) - ) - ), - htmltools::h4("Assumption Diagnostic Plots"), - htmltools::img(src = "interaction_domain_assumptions.png", style = "width: 100%; max-width: 800px;") - ) -) - -# Save HTML content -htmltools::save_html(html_content, "regression_analysis_report.html") - -print("HTML report created: regression_analysis_report.html") \ No newline at end of file diff --git a/.history/eohi1/regressions e1 - assumptions_20251015154931.r b/.history/eohi1/regressions e1 - assumptions_20251015154931.r deleted file mode 100644 index 036e881..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251015154931.r +++ /dev/null @@ -1,3 +0,0 @@ -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016134509.r b/.history/eohi1/regressions e1 - assumptions_20251016134509.r deleted file mode 100644 index a72cede..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016134509.r +++ /dev/null @@ -1,4 +0,0 @@ -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016142437.r b/.history/eohi1/regressions e1 - assumptions_20251016142437.r deleted file mode 100644 index ecc6191..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016142437.r +++ /dev/null @@ -1,55 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean -# Total: 6 regression models - -options(scipen = 999) - -# Load required libraries -library(car) -library(performance) -library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check data structure -cat("Data dimensions:", dim(data), "\n") -cat("Variables of interest:\n") -cat("IVs: demo_sex, demo_age, demo_edu\n") -cat("DVs: eohiDGEN_mean, ehi_global_mean\n\n") - -# Check for missing values -cat("Missing values check:\n") -missing_summary <- data %>% - select(demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -cat("\nClean data dimensions:", dim(data_clean), "\n") - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -cat("\nEducation levels:\n") -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Verify recoding -cat("\nSex recoding (0=Female, 1=Male):\n") -print(table(data_clean$demo_sex_numeric)) 
-cat("\nEducation recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):\n") -print(table(data_clean$demo_edu_numeric)) diff --git a/.history/eohi1/regressions e1 - assumptions_20251016142502.r b/.history/eohi1/regressions e1 - assumptions_20251016142502.r deleted file mode 100644 index 1030c19..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016142502.r +++ /dev/null @@ -1,235 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean -# Total: 6 regression models - -options(scipen = 999) - -# Load required libraries -library(car) -library(performance) -library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check data structure -cat("Data dimensions:", dim(data), "\n") -cat("Variables of interest:\n") -cat("IVs: demo_sex, demo_age, demo_edu\n") -cat("DVs: eohiDGEN_mean, ehi_global_mean\n\n") - -# Check for missing values -cat("Missing values check:\n") -missing_summary <- data %>% - select(demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -cat("\nClean data dimensions:", dim(data_clean), "\n") - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -cat("\nEducation levels:\n") -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Verify recoding -cat("\nSex recoding (0=Female, 1=Male):\n") -print(table(data_clean$demo_sex_numeric)) -cat("\nEducation recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):\n") -print(table(data_clean$demo_edu_numeric)) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age → eohiDGEN_mean -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age → ehi_global_mean -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - cat("\n=== LINEARITY CHECK:", model_name, "===\n") - - # Residuals vs Fitted plot - residuals_vs_fitted <- 
plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - cr_plot <- crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - # Return plots for later use - return(list(residuals_vs_fitted = residuals_vs_fitted, cr_plot = cr_plot)) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - cat("\n=== NORMALITY CHECK:", model_name, "===\n") - - # Q-Q plot - qq_plot <- plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - cat("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5), "\n") - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - cat("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5), "\n") - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - - return(list(qq_plot = qq_plot, hist_plot = hist_plot, - shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - cat("\n=== HOMOSCEDASTICITY CHECK:", model_name, "===\n") - - # Scale-Location plot - scale_location_plot <- plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - cat("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5), "\n") - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - cat("White test p-value:", format(white_test$p.value, digits = 5), "\n") - }, error = function(e) { - cat("White test not available for this model\n") - }) - - return(list(scale_location_plot = scale_location_plot, bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - cat("\n=== INDEPENDENCE CHECK:", model_name, "===\n") - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - cat("Durbin-Watson statistic:", format(dw_test$dw, digits = 5), "\n") - cat("Durbin-Watson p-value:", format(dw_test$p, digits = 5), "\n") - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = 1:length(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - - return(list(residuals_vs_order = residuals_vs_order, dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - cat("\n=== INFLUENCE CHECK:", model_name, "===\n") - - # Cook's Distance plot - cooks_plot <- plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - influence_measures <- influence.measures(model) - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold 
<- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - cat("Cook's Distance threshold:", format(cooks_threshold, digits = 5), "\n") - cat("Influential observations (Cook's D):", length(influential_cooks), "\n") - cat("Leverage threshold:", format(leverage_threshold, digits = 5), "\n") - cat("High leverage observations:", length(influential_leverage), "\n") - cat("DFFITS threshold:", format(dffits_threshold, digits = 5), "\n") - cat("Influential observations (DFFITS):", length(influential_dffits), "\n") - - if (length(influential_cooks) > 0) { - cat("Cook's D influential cases:", influential_cooks, "\n") - } - if (length(influential_leverage) > 0) { - cat("High leverage cases:", influential_leverage, "\n") - } - if (length(influential_dffits) > 0) { - cat("DFFITS influential cases:", influential_dffits, "\n") - } - - return(list(cooks_plot = cooks_plot, - influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - cat("\n=== MODEL SUMMARY:", model_name, "===\n") - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - cat("\nR-squared:", format(summary_model$r.squared, digits = 5), "\n") - cat("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5), "\n") - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - cat("AIC:", format(aic_val, digits = 5), "\n") - cat("BIC:", format(bic_val, digits = 5), "\n") - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} diff --git a/.history/eohi1/regressions e1 - assumptions_20251016142529.r b/.history/eohi1/regressions e1 - assumptions_20251016142529.r deleted file mode 100644 index 77efdb7..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016142529.r +++ /dev/null @@ -1,401 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean -# Total: 6 regression models - -options(scipen = 999) - -# Load required libraries -library(car) -library(performance) -library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check data structure -cat("Data dimensions:", dim(data), "\n") -cat("Variables of interest:\n") -cat("IVs: demo_sex, demo_age, demo_edu\n") -cat("DVs: eohiDGEN_mean, ehi_global_mean\n\n") - -# Check for missing values -cat("Missing values check:\n") -missing_summary <- data %>% - select(demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -cat("\nClean data dimensions:", dim(data_clean), "\n") - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -cat("\nEducation levels:\n") -print(table(data_clean$demo_edu)) - -# Recode education as 
ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Verify recoding -cat("\nSex recoding (0=Female, 1=Male):\n") -print(table(data_clean$demo_sex_numeric)) -cat("\nEducation recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):\n") -print(table(data_clean$demo_edu_numeric)) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age → eohiDGEN_mean -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age → ehi_global_mean -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - cat("\n=== LINEARITY CHECK:", model_name, "===\n") - - # Residuals vs Fitted plot - residuals_vs_fitted <- plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - cr_plot <- crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - # Return plots for later use - return(list(residuals_vs_fitted = residuals_vs_fitted, cr_plot = cr_plot)) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - cat("\n=== NORMALITY CHECK:", model_name, "===\n") - - # Q-Q plot - qq_plot <- plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - cat("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5), "\n") - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - cat("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5), "\n") - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - - return(list(qq_plot = qq_plot, hist_plot = hist_plot, - shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - cat("\n=== HOMOSCEDASTICITY CHECK:", model_name, "===\n") - - # Scale-Location plot - scale_location_plot <- plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - cat("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5), 
"\n") - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - cat("White test p-value:", format(white_test$p.value, digits = 5), "\n") - }, error = function(e) { - cat("White test not available for this model\n") - }) - - return(list(scale_location_plot = scale_location_plot, bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - cat("\n=== INDEPENDENCE CHECK:", model_name, "===\n") - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - cat("Durbin-Watson statistic:", format(dw_test$dw, digits = 5), "\n") - cat("Durbin-Watson p-value:", format(dw_test$p, digits = 5), "\n") - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = 1:length(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - - return(list(residuals_vs_order = residuals_vs_order, dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - cat("\n=== INFLUENCE CHECK:", model_name, "===\n") - - # Cook's Distance plot - cooks_plot <- plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - influence_measures <- influence.measures(model) - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - cat("Cook's Distance threshold:", format(cooks_threshold, digits = 5), "\n") - cat("Influential observations (Cook's D):", length(influential_cooks), "\n") - cat("Leverage threshold:", format(leverage_threshold, digits = 5), "\n") - cat("High leverage observations:", length(influential_leverage), "\n") - cat("DFFITS threshold:", format(dffits_threshold, digits = 5), "\n") - cat("Influential observations (DFFITS):", length(influential_dffits), "\n") - - if (length(influential_cooks) > 0) { - cat("Cook's D influential cases:", influential_cooks, "\n") - } - if (length(influential_leverage) > 0) { - cat("High leverage cases:", influential_leverage, "\n") - } - if (length(influential_dffits) > 0) { - cat("DFFITS influential cases:", influential_dffits, "\n") - } - - return(list(cooks_plot = cooks_plot, - influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - cat("\n=== MODEL SUMMARY:", model_name, "===\n") - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - cat("\nR-squared:", format(summary_model$r.squared, digits = 5), "\n") - cat("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5), "\n") - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - cat("AIC:", 
format(aic_val, digits = 5), "\n") - cat("BIC:", format(bic_val, digits = 5), "\n") - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS FOR ALL MODELS -# ============================================================================= - -# Create results storage -assumption_results <- list() -model_summaries <- list() - -# Model names for reference -model_names <- c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global") - -cat("\n", rep("=", 80), "\n") -cat("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS\n") -cat(rep("=", 80), "\n") - -# Run assumption checks for each model -for (i in 1:length(models)) { - model_name <- model_names[i] - model <- models[[i]] - - cat("\n", rep("-", 60), "\n") - cat("ANALYZING MODEL", i, ":", model_name, "\n") - cat(rep("-", 60), "\n") - - # Store results - assumption_results[[i]] <- list() - assumption_results[[i]]$model_name <- model_name - - # 1. Model Summary - model_summaries[[i]] <- get_model_summary(model, model_name) - assumption_results[[i]]$summary <- model_summaries[[i]] - - # 2. Linearity Check - assumption_results[[i]]$linearity <- check_linearity(model, model_name) - - # 3. Normality Check - assumption_results[[i]]$normality <- check_normality(model, model_name) - - # 4. Homoscedasticity Check - assumption_results[[i]]$homoscedasticity <- check_homoscedasticity(model, model_name) - - # 5. Independence Check - assumption_results[[i]]$independence <- check_independence(model, model_name) - - # 6. Influence Check - assumption_results[[i]]$influence <- check_influence(model, model_name) -} - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - -cat("\n", rep("=", 80), "\n") -cat("ASSUMPTION VIOLATION SUMMARY\n") -cat(rep("=", 80), "\n") - -# Create summary table -violation_summary <- data.frame( - Model = character(), - Linearity = character(), - Normality = character(), - Homoscedasticity = character(), - Independence = character(), - Influential_Obs = character(), - stringsAsFactors = FALSE -) - -# Populate summary table -for (i in 1:length(models)) { - model_name <- model_names[i] - - # Check normality (Shapiro-Wilk p < 0.05 indicates violation) - normality_violation <- ifelse(assumption_results[[i]]$normality$shapiro_p < 0.05, "VIOLATED", "OK") - - # Check homoscedasticity (Breusch-Pagan p < 0.05 indicates violation) - homosced_violation <- ifelse(assumption_results[[i]]$homoscedasticity$bp_p < 0.05, "VIOLATED", "OK") - - # Check independence (Durbin-Watson p < 0.05 indicates violation) - independence_violation <- ifelse(assumption_results[[i]]$independence$dw_p < 0.05, "VIOLATED", "OK") - - # Check for influential observations - influential_count <- length(assumption_results[[i]]$influence$influential_cooks) - influential_status <- ifelse(influential_count > 0, paste("YES (", influential_count, ")", sep = ""), "NO") - - # Linearity is assessed visually, so we'll mark as "CHECK VISUALLY" - linearity_status <- "CHECK VISUALLY" - - violation_summary <- rbind(violation_summary, data.frame( - Model = model_name, - Linearity = linearity_status, - Normality = normality_violation, - Homoscedasticity = homosced_violation, 
- Independence = independence_violation, - Influential_Obs = influential_status - )) -} - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - -cat("\n", rep("=", 80), "\n") -cat("MODEL COMPARISON SUMMARY\n") -cat(rep("=", 80), "\n") - -# Create model comparison table -comparison_table <- data.frame( - Model = model_names, - R_Squared = numeric(length(models)), - Adj_R_Squared = numeric(length(models)), - AIC = numeric(length(models)), - BIC = numeric(length(models)), - Significant = character(length(models)), - stringsAsFactors = FALSE -) - -for (i in 1:length(models)) { - summary_model <- model_summaries[[i]]$summary - comparison_table$R_Squared[i] <- summary_model$r.squared - comparison_table$Adj_R_Squared[i] <- summary_model$adj.r.squared - comparison_table$AIC[i] <- model_summaries[[i]]$aic - comparison_table$BIC[i] <- model_summaries[[i]]$bic - - # Check if predictor is significant (p < 0.05) - p_value <- summary_model$coefficients[2, 4] - comparison_table$Significant[i] <- ifelse(p_value < 0.05, "YES", "NO") -} - -print(comparison_table) - -# ============================================================================= -# RECOMMENDATIONS -# ============================================================================= - -cat("\n", rep("=", 80), "\n") -cat("RECOMMENDATIONS FOR ASSUMPTION VIOLATIONS\n") -cat(rep("=", 80), "\n") - -cat("\n1. NORMALITY VIOLATIONS:\n") -cat(" - If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox)\n") -cat(" - Alternative: Use robust regression methods or bootstrapping\n") - -cat("\n2. HOMOSCEDASTICITY VIOLATIONS:\n") -cat(" - If violated: Use weighted least squares or robust standard errors\n") -cat(" - Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors\n") - -cat("\n3. INDEPENDENCE VIOLATIONS:\n") -cat(" - If violated: Check for clustering or repeated measures structure\n") -cat(" - Alternative: Use mixed-effects models or clustered standard errors\n") - -cat("\n4. INFLUENTIAL OBSERVATIONS:\n") -cat(" - If present: Examine these cases for data entry errors\n") -cat(" - Consider: Running analysis with and without influential cases\n") -cat(" - Alternative: Use robust regression methods\n") - -cat("\n5. 
LINEARITY VIOLATIONS:\n") -cat(" - If violated: Add polynomial terms or use splines\n") -cat(" - Alternative: Transform predictors or use non-parametric methods\n") - -cat("\n", rep("=", 80), "\n") -cat("ANALYSIS COMPLETE\n") -cat(rep("=", 80), "\n") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016142552.r b/.history/eohi1/regressions e1 - assumptions_20251016142552.r deleted file mode 100644 index 759d18b..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016142552.r +++ /dev/null @@ -1,401 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean -# Total: 6 regression models - -options(scipen = 999) - -# Load required libraries -library(car) -library(performance) -library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check data structure -cat("Data dimensions:", dim(data), "\n") -cat("Variables of interest:\n") -cat("IVs: demo_sex, demo_age, demo_edu\n") -cat("DVs: eohiDGEN_mean, ehi_global_mean\n\n") - -# Check for missing values -cat("Missing values check:\n") -missing_summary <- data %>% - select(demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -cat("\nClean data dimensions:", dim(data_clean), "\n") - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -cat("\nEducation levels:\n") -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Verify recoding -cat("\nSex recoding (0=Female, 1=Male):\n") -print(table(data_clean$demo_sex_numeric)) -cat("\nEducation recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):\n") -print(table(data_clean$demo_edu_numeric)) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age → eohiDGEN_mean -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age → ehi_global_mean -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to 
check linearity assumption -check_linearity <- function(model, model_name) { - cat("\n=== LINEARITY CHECK:", model_name, "===\n") - - # Residuals vs Fitted plot - residuals_vs_fitted <- plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - cr_plot <- crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - # Return plots for later use - return(list(residuals_vs_fitted = residuals_vs_fitted, cr_plot = cr_plot)) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - cat("\n=== NORMALITY CHECK:", model_name, "===\n") - - # Q-Q plot - qq_plot <- plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - cat("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5), "\n") - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - cat("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5), "\n") - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - - return(list(qq_plot = qq_plot, hist_plot = hist_plot, - shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - cat("\n=== HOMOSCEDASTICITY CHECK:", model_name, "===\n") - - # Scale-Location plot - scale_location_plot <- plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - cat("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5), "\n") - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - cat("White test p-value:", format(white_test$p.value, digits = 5), "\n") - }, error = function(e) { - cat("White test not available for this model\n") - }) - - return(list(scale_location_plot = scale_location_plot, bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - cat("\n=== INDEPENDENCE CHECK:", model_name, "===\n") - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - cat("Durbin-Watson statistic:", format(dw_test$dw, digits = 5), "\n") - cat("Durbin-Watson p-value:", format(dw_test$p, digits = 5), "\n") - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - - return(list(residuals_vs_order = residuals_vs_order, dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - cat("\n=== INFLUENCE CHECK:", model_name, "===\n") - - # Cook's Distance plot - cooks_plot <- plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 
4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - cat("Cook's Distance threshold:", format(cooks_threshold, digits = 5), "\n") - cat("Influential observations (Cook's D):", length(influential_cooks), "\n") - cat("Leverage threshold:", format(leverage_threshold, digits = 5), "\n") - cat("High leverage observations:", length(influential_leverage), "\n") - cat("DFFITS threshold:", format(dffits_threshold, digits = 5), "\n") - cat("Influential observations (DFFITS):", length(influential_dffits), "\n") - - if (length(influential_cooks) > 0) { - cat("Cook's D influential cases:", influential_cooks, "\n") - } - if (length(influential_leverage) > 0) { - cat("High leverage cases:", influential_leverage, "\n") - } - if (length(influential_dffits) > 0) { - cat("DFFITS influential cases:", influential_dffits, "\n") - } - - return(list(cooks_plot = cooks_plot, - influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - cat("\n=== MODEL SUMMARY:", model_name, "===\n") - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - cat("\nR-squared:", format(summary_model$r.squared, digits = 5), "\n") - cat("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5), "\n") - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - cat("AIC:", format(aic_val, digits = 5), "\n") - cat("BIC:", format(bic_val, digits = 5), "\n") - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS FOR ALL MODELS -# ============================================================================= - -# Create results storage -assumption_results <- list() -model_summaries <- list() - -# Model names for reference -model_names <- c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global") - -cat("\n", rep("=", 80), "\n") -cat("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS\n") -cat(rep("=", 80), "\n") - -# Run assumption checks for each model -for (i in seq_along(models)) { - model_name <- model_names[i] - model <- models[[i]] - - cat("\n", rep("-", 60), "\n") - cat("ANALYZING MODEL", i, ":", model_name, "\n") - cat(rep("-", 60), "\n") - - # Store results - assumption_results[[i]] <- list() - assumption_results[[i]]$model_name <- model_name - - # 1. Model Summary - model_summaries[[i]] <- get_model_summary(model, model_name) - assumption_results[[i]]$summary <- model_summaries[[i]] - - # 2. Linearity Check - assumption_results[[i]]$linearity <- check_linearity(model, model_name) - - # 3. Normality Check - assumption_results[[i]]$normality <- check_normality(model, model_name) - - # 4. Homoscedasticity Check - assumption_results[[i]]$homoscedasticity <- check_homoscedasticity(model, model_name) - - # 5. 
Independence Check - assumption_results[[i]]$independence <- check_independence(model, model_name) - - # 6. Influence Check - assumption_results[[i]]$influence <- check_influence(model, model_name) -} - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - -cat("\n", rep("=", 80), "\n") -cat("ASSUMPTION VIOLATION SUMMARY\n") -cat(rep("=", 80), "\n") - -# Create summary table -violation_summary <- data.frame( - Model = character(), - Linearity = character(), - Normality = character(), - Homoscedasticity = character(), - Independence = character(), - Influential_Obs = character(), - stringsAsFactors = FALSE -) - -# Populate summary table -for (i in seq_along(models)) { - model_name <- model_names[i] - - # Check normality (Shapiro-Wilk p < 0.05 indicates violation) - normality_violation <- ifelse(assumption_results[[i]]$normality$shapiro_p < 0.05, "VIOLATED", "OK") - - # Check homoscedasticity (Breusch-Pagan p < 0.05 indicates violation) - homosced_violation <- ifelse(assumption_results[[i]]$homoscedasticity$bp_p < 0.05, "VIOLATED", "OK") - - # Check independence (Durbin-Watson p < 0.05 indicates violation) - independence_violation <- ifelse(assumption_results[[i]]$independence$dw_p < 0.05, "VIOLATED", "OK") - - # Check for influential observations - influential_count <- length(assumption_results[[i]]$influence$influential_cooks) - influential_status <- ifelse(influential_count > 0, paste("YES (", influential_count, ")", sep = ""), "NO") - - # Linearity is assessed visually, so we'll mark as "CHECK VISUALLY" - linearity_status <- "CHECK VISUALLY" - - violation_summary <- rbind(violation_summary, data.frame( - Model = model_name, - Linearity = linearity_status, - Normality = normality_violation, - Homoscedasticity = homosced_violation, - Independence = independence_violation, - Influential_Obs = influential_status - )) -} - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - -cat("\n", rep("=", 80), "\n") -cat("MODEL COMPARISON SUMMARY\n") -cat(rep("=", 80), "\n") - -# Create model comparison table -comparison_table <- data.frame( - Model = model_names, - R_Squared = numeric(length(models)), - Adj_R_Squared = numeric(length(models)), - AIC = numeric(length(models)), - BIC = numeric(length(models)), - Significant = character(length(models)), - stringsAsFactors = FALSE -) - -for (i in seq_along(models)) { - summary_model <- model_summaries[[i]]$summary - comparison_table$R_Squared[i] <- summary_model$r.squared - comparison_table$Adj_R_Squared[i] <- summary_model$adj.r.squared - comparison_table$AIC[i] <- model_summaries[[i]]$aic - comparison_table$BIC[i] <- model_summaries[[i]]$bic - - # Check if predictor is significant (p < 0.05) - p_value <- summary_model$coefficients[2, 4] - comparison_table$Significant[i] <- ifelse(p_value < 0.05, "YES", "NO") -} - -print(comparison_table) - -# ============================================================================= -# RECOMMENDATIONS -# ============================================================================= - -cat("\n", rep("=", 80), "\n") -cat("RECOMMENDATIONS FOR ASSUMPTION VIOLATIONS\n") -cat(rep("=", 80), "\n") - -cat("\n1. 
NORMALITY VIOLATIONS:\n")
-cat(" - If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox)\n")
-cat(" - Alternative: Use robust regression methods or bootstrapping\n")
-
-cat("\n2. HOMOSCEDASTICITY VIOLATIONS:\n")
-cat(" - If violated: Use weighted least squares or robust standard errors\n")
-cat(" - Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors\n")
-
-cat("\n3. INDEPENDENCE VIOLATIONS:\n")
-cat(" - If violated: Check for clustering or repeated measures structure\n")
-cat(" - Alternative: Use mixed-effects models or clustered standard errors\n")
-
-cat("\n4. INFLUENTIAL OBSERVATIONS:\n")
-cat(" - If present: Examine these cases for data entry errors\n")
-cat(" - Consider: Running analysis with and without influential cases\n")
-cat(" - Alternative: Use robust regression methods\n")
-
-cat("\n5. LINEARITY VIOLATIONS:\n")
-cat(" - If violated: Add polynomial terms or use splines\n")
-cat(" - Alternative: Transform predictors or use non-parametric methods\n")
-
-cat("\n", rep("=", 80), "\n")
-cat("ANALYSIS COMPLETE\n")
-cat(rep("=", 80), "\n")
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016142605.r b/.history/eohi1/regressions e1 - assumptions_20251016142605.r
deleted file mode 100644
index 759d18b..0000000
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016142612.r b/.history/eohi1/regressions e1 - assumptions_20251016142612.r
deleted file mode 100644
index 759d18b..0000000
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016142849.r b/.history/eohi1/regressions e1 - assumptions_20251016142849.r
deleted file mode 100644
index 563f458..0000000
--- a/.history/eohi1/regressions e1 - assumptions_20251016142849.r
+++ /dev/null
@@ -1,393 +0,0 @@
-# Regression Analysis - Assumption Checking
-# IVs: demo_sex, demo_age, demo_edu
-# DVs: eohiDGEN_mean, ehi_global_mean
-
-options(scipen = 999)
-
-library(car)
-library(performance)
-library(see)
-library(ggplot2)
-library(gridExtra)
-library(dplyr)
-library(lmtest) # For bptest and durbinWatsonTest
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-data <- read.csv("ehi1.csv")
-
-# Check for missing values
-missing_summary <- data %>%
-  select(demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  summarise_all(~sum(is.na(.)))
-print("Missing values check:")
-print(missing_summary)
-
-# Remove rows with missing values
-data_clean <- data %>%
-  select(pID, demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  filter(complete.cases(.))
-
-print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x ")))
-
-# Recode demo_sex as numeric for regression (0 = Female, 1 = Male)
-data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0)
-
-# Check demo_edu levels and recode if needed
-print("Education levels:")
-print(table(data_clean$demo_edu))
-
-# Recode education as ordinal (assuming higher values = more education)
-edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate",
-                "University - Undergraduate", "University - Graduate")
-data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels)
-
-# Verify recoding
-print("Sex recoding (0=Female, 1=Male):")
-print(table(data_clean$demo_sex_numeric))
-print("Education recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):")
-print(table(data_clean$demo_edu_numeric))
-
-# =============================================================================
-# REGRESSION MODELS
-# =============================================================================
-
-# Define the 6 regression models
-models <- list()
-
-# Model 1: demo_sex → eohiDGEN_mean
-models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 2: demo_age → eohiDGEN_mean
-models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean)
-
-# Model 3: demo_edu → eohiDGEN_mean
-models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean)
-
-# Model 4: demo_sex → ehi_global_mean
-models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 5: demo_age → ehi_global_mean
-models$age_ehi_global <- lm(ehi_global_mean ~ demo_age, data = data_clean)
-
-# Model 6: demo_edu → ehi_global_mean
-models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean)
-
-# =============================================================================
-# ASSUMPTION CHECKING FUNCTIONS
-# =============================================================================
-
-# Function to check linearity assumption
-check_linearity <- function(model, model_name) {
-  cat("\n=== LINEARITY CHECK:", model_name, "===\n")
-
-  # Residuals vs Fitted plot
-  residuals_vs_fitted <- plot(model, which = 1, main = paste("Linearity:", model_name))
-
-  # Component + residual plot (partial residual plot)
-  cr_plot <- crPlots(model, main = paste("Component+Residual Plot:", model_name))
-
-  # Return plots for later use
-  return(list(residuals_vs_fitted = residuals_vs_fitted, cr_plot = cr_plot))
-}
-
-# Function to check normality of residuals
-check_normality <- function(model, model_name) {
-  cat("\n=== NORMALITY CHECK:", model_name, "===\n")
-
-  # Q-Q plot
-  qq_plot <- plot(model, which = 2, main = paste("Q-Q Plot:", model_name))
-
-  # Shapiro-Wilk test
-  residuals <- residuals(model)
-  shapiro_test <- shapiro.test(residuals)
-  cat("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5), "\n")
-
-  # Kolmogorov-Smirnov test
-  ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals))
-  cat("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5), "\n")
-
-  # Histogram of residuals
-  hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) +
-    geom_histogram(bins = 30, fill = "lightblue", color = "black") +
-    ggtitle(paste("Residuals Histogram:", model_name)) +
-    theme_minimal()
-
-  return(list(qq_plot = qq_plot, hist_plot = hist_plot,
-              shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value))
-}
-
-# Function to check homoscedasticity (constant variance)
-check_homoscedasticity <- function(model, model_name) {
-  cat("\n=== HOMOSCEDASTICITY CHECK:", model_name, "===\n")
-
-  # Scale-Location plot
-  scale_location_plot <- plot(model, which = 3, main = paste("Scale-Location Plot:", model_name))
-
-  # Breusch-Pagan test
-  bp_test <- bptest(model)
-  cat("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5), "\n")
-
-  # White test (if available)
-  tryCatch({
-    white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2))
-    cat("White test p-value:", format(white_test$p.value, digits = 5), "\n")
-  }, error = function(e) {
-    cat("White test not available for this model\n")
-  })
-
-  return(list(scale_location_plot = scale_location_plot, bp_p = bp_test$p.value))
-}
-
-# Function to check independence (no autocorrelation)
-check_independence <- function(model, model_name) {
-  cat("\n=== INDEPENDENCE CHECK:", model_name, "===\n")
-
-  # Durbin-Watson test
-  dw_test <- durbinWatsonTest(model)
-  cat("Durbin-Watson statistic:", format(dw_test$dw, digits = 5), "\n")
-  cat("Durbin-Watson p-value:", format(dw_test$p, digits = 5), "\n")
-
-  # Residuals vs Order plot
-  residuals_vs_order <- ggplot(data.frame(
-    residuals = residuals(model),
-    order = seq_along(residuals(model))
-  ), aes(x = order, y = residuals)) +
-    geom_point(color = "black") +
-    geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
-    ggtitle(paste("Residuals vs Order:", model_name)) +
-    theme_minimal()
-
-  return(list(residuals_vs_order = residuals_vs_order, dw_stat = dw_test$dw, dw_p = dw_test$p))
-}
-
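# Aside: a minimal sketch of the heteroscedasticity-consistent (robust)
# standard errors that the recommendations above point to when bptest() is
# significant. Assumes the sandwich package is installed (the script itself
# only loads lmtest and friends); the model picked here is illustrative.
library(sandwich)
coeftest(models$age_eohiDGEN, vcov = vcovHC(models$age_eohiDGEN, type = "HC3")) # OLS estimates, HC3 robust SEs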
paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - cr_plot <- crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - # Return plots for later use - return(list(residuals_vs_fitted = residuals_vs_fitted, cr_plot = cr_plot)) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - cat("\n=== NORMALITY CHECK:", model_name, "===\n") - - # Q-Q plot - qq_plot <- plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - cat("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5), "\n") - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - cat("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5), "\n") - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - - return(list(qq_plot = qq_plot, hist_plot = hist_plot, - shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - cat("\n=== HOMOSCEDASTICITY CHECK:", model_name, "===\n") - - # Scale-Location plot - scale_location_plot <- plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - cat("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5), "\n") - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - cat("White test p-value:", format(white_test$p.value, digits = 5), "\n") - }, error = function(e) { - cat("White test not available for this model\n") - }) - - return(list(scale_location_plot = scale_location_plot, bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - cat("\n=== INDEPENDENCE CHECK:", model_name, "===\n") - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - cat("Durbin-Watson statistic:", format(dw_test$dw, digits = 5), "\n") - cat("Durbin-Watson p-value:", format(dw_test$p, digits = 5), "\n") - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - - return(list(residuals_vs_order = residuals_vs_order, dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - cat("\n=== INFLUENCE CHECK:", model_name, "===\n") - - # Cook's Distance plot - cooks_plot <- plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - 
influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - cat("Cook's Distance threshold:", format(cooks_threshold, digits = 5), "\n") - cat("Influential observations (Cook's D):", length(influential_cooks), "\n") - cat("Leverage threshold:", format(leverage_threshold, digits = 5), "\n") - cat("High leverage observations:", length(influential_leverage), "\n") - cat("DFFITS threshold:", format(dffits_threshold, digits = 5), "\n") - cat("Influential observations (DFFITS):", length(influential_dffits), "\n") - - if (length(influential_cooks) > 0) { - cat("Cook's D influential cases:", influential_cooks, "\n") - } - if (length(influential_leverage) > 0) { - cat("High leverage cases:", influential_leverage, "\n") - } - if (length(influential_dffits) > 0) { - cat("DFFITS influential cases:", influential_dffits, "\n") - } - - return(list(cooks_plot = cooks_plot, - influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - cat("\n=== MODEL SUMMARY:", model_name, "===\n") - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - cat("\nR-squared:", format(summary_model$r.squared, digits = 5), "\n") - cat("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5), "\n") - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - cat("AIC:", format(aic_val, digits = 5), "\n") - cat("BIC:", format(bic_val, digits = 5), "\n") - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS FOR ALL MODELS -# ============================================================================= - -# Create results storage -assumption_results <- list() -model_summaries <- list() - -# Model names for reference -model_names <- c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global") - -cat("\n", rep("=", 80), "\n") -cat("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS\n") -cat(rep("=", 80), "\n") - -# Run assumption checks for each model -for (i in seq_along(models)) { - model_name <- model_names[i] - model <- models[[i]] - - cat("\n", rep("-", 60), "\n") - cat("ANALYZING MODEL", i, ":", model_name, "\n") - cat(rep("-", 60), "\n") - - # Store results - assumption_results[[i]] <- list() - assumption_results[[i]]$model_name <- model_name - - # 1. Model Summary - model_summaries[[i]] <- get_model_summary(model, model_name) - assumption_results[[i]]$summary <- model_summaries[[i]] - - # 2. Linearity Check - assumption_results[[i]]$linearity <- check_linearity(model, model_name) - - # 3. Normality Check - assumption_results[[i]]$normality <- check_normality(model, model_name) - - # 4. Homoscedasticity Check - assumption_results[[i]]$homoscedasticity <- check_homoscedasticity(model, model_name) - - # 5. Independence Check - assumption_results[[i]]$independence <- check_independence(model, model_name) - - # 6. 
Influence Check - assumption_results[[i]]$influence <- check_influence(model, model_name) -} - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - -cat("\n", rep("=", 80), "\n") -cat("ASSUMPTION VIOLATION SUMMARY\n") -cat(rep("=", 80), "\n") - -# Create summary table -violation_summary <- data.frame( - Model = character(), - Linearity = character(), - Normality = character(), - Homoscedasticity = character(), - Independence = character(), - Influential_Obs = character(), - stringsAsFactors = FALSE -) - -# Populate summary table -for (i in seq_along(models)) { - model_name <- model_names[i] - - # Check normality (Shapiro-Wilk p < 0.05 indicates violation) - normality_violation <- ifelse(assumption_results[[i]]$normality$shapiro_p < 0.05, "VIOLATED", "OK") - - # Check homoscedasticity (Breusch-Pagan p < 0.05 indicates violation) - homosced_violation <- ifelse(assumption_results[[i]]$homoscedasticity$bp_p < 0.05, "VIOLATED", "OK") - - # Check independence (Durbin-Watson p < 0.05 indicates violation) - independence_violation <- ifelse(assumption_results[[i]]$independence$dw_p < 0.05, "VIOLATED", "OK") - - # Check for influential observations - influential_count <- length(assumption_results[[i]]$influence$influential_cooks) - influential_status <- ifelse(influential_count > 0, paste("YES (", influential_count, ")", sep = ""), "NO") - - # Linearity is assessed visually, so we'll mark as "CHECK VISUALLY" - linearity_status <- "CHECK VISUALLY" - - violation_summary <- rbind(violation_summary, data.frame( - Model = model_name, - Linearity = linearity_status, - Normality = normality_violation, - Homoscedasticity = homosced_violation, - Independence = independence_violation, - Influential_Obs = influential_status - )) -} - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - -cat("\n", rep("=", 80), "\n") -cat("MODEL COMPARISON SUMMARY\n") -cat(rep("=", 80), "\n") - -# Create model comparison table -comparison_table <- data.frame( - Model = model_names, - R_Squared = numeric(length(models)), - Adj_R_Squared = numeric(length(models)), - AIC = numeric(length(models)), - BIC = numeric(length(models)), - Significant = character(length(models)), - stringsAsFactors = FALSE -) - -for (i in seq_along(models)) { - summary_model <- model_summaries[[i]]$summary - comparison_table$R_Squared[i] <- summary_model$r.squared - comparison_table$Adj_R_Squared[i] <- summary_model$adj.r.squared - comparison_table$AIC[i] <- model_summaries[[i]]$aic - comparison_table$BIC[i] <- model_summaries[[i]]$bic - - # Check if predictor is significant (p < 0.05) - p_value <- summary_model$coefficients[2, 4] - comparison_table$Significant[i] <- ifelse(p_value < 0.05, "YES", "NO") -} - -print(comparison_table) - -# ============================================================================= -# RECOMMENDATIONS -# ============================================================================= - -cat("\n", rep("=", 80), "\n") -cat("RECOMMENDATIONS FOR ASSUMPTION VIOLATIONS\n") -cat(rep("=", 80), "\n") - -cat("\n1. NORMALITY VIOLATIONS:\n") -cat(" - If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox)\n") -cat(" - Alternative: Use robust regression methods or bootstrapping\n") - -cat("\n2. 
HOMOSCEDASTICITY VIOLATIONS:\n") -cat(" - If violated: Use weighted least squares or robust standard errors\n") -cat(" - Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors\n") - -cat("\n3. INDEPENDENCE VIOLATIONS:\n") -cat(" - If violated: Check for clustering or repeated measures structure\n") -cat(" - Alternative: Use mixed-effects models or clustered standard errors\n") - -cat("\n4. INFLUENTIAL OBSERVATIONS:\n") -cat(" - If present: Examine these cases for data entry errors\n") -cat(" - Consider: Running analysis with and without influential cases\n") -cat(" - Alternative: Use robust regression methods\n") - -cat("\n5. LINEARITY VIOLATIONS:\n") -cat(" - If violated: Add polynomial terms or use splines\n") -cat(" - Alternative: Transform predictors or use non-parametric methods\n") - -cat("\n", rep("=", 80), "\n") -cat("ANALYSIS COMPLETE\n") -cat(rep("=", 80), "\n") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016142956.r b/.history/eohi1/regressions e1 - assumptions_20251016142956.r deleted file mode 100644 index 3c99586..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016142956.r +++ /dev/null @@ -1,392 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Education levels:") -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):") -print(table(data_clean$demo_edu_numeric)) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age → eohiDGEN_mean -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 
5: demo_age → ehi_global_mean -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = 
paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS FOR ALL MODELS -# ============================================================================= - -# Create results storage -assumption_results <- list() -model_summaries <- list() - -# Model names for reference -model_names <- c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global") - -cat("\n", rep("=", 80), "\n") -cat("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS\n") -cat(rep("=", 80), "\n") - -# Run assumption checks for each model -for (i in seq_along(models)) { - model_name <- model_names[i] - model <- models[[i]] - - cat("\n", rep("-", 60), "\n") - cat("ANALYZING MODEL", i, ":", model_name, "\n") - cat(rep("-", 60), "\n") - - # Store results - assumption_results[[i]] <- list() - assumption_results[[i]]$model_name <- model_name - - # 1. Model Summary - model_summaries[[i]] <- get_model_summary(model, model_name) - assumption_results[[i]]$summary <- model_summaries[[i]] - - # 2. 
Linearity Check - assumption_results[[i]]$linearity <- check_linearity(model, model_name) - - # 3. Normality Check - assumption_results[[i]]$normality <- check_normality(model, model_name) - - # 4. Homoscedasticity Check - assumption_results[[i]]$homoscedasticity <- check_homoscedasticity(model, model_name) - - # 5. Independence Check - assumption_results[[i]]$independence <- check_independence(model, model_name) - - # 6. Influence Check - assumption_results[[i]]$influence <- check_influence(model, model_name) -} - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - -cat("\n", rep("=", 80), "\n") -cat("ASSUMPTION VIOLATION SUMMARY\n") -cat(rep("=", 80), "\n") - -# Create summary table -violation_summary <- data.frame( - Model = character(), - Linearity = character(), - Normality = character(), - Homoscedasticity = character(), - Independence = character(), - Influential_Obs = character(), - stringsAsFactors = FALSE -) - -# Populate summary table -for (i in seq_along(models)) { - model_name <- model_names[i] - - # Check normality (Shapiro-Wilk p < 0.05 indicates violation) - normality_violation <- ifelse(assumption_results[[i]]$normality$shapiro_p < 0.05, "VIOLATED", "OK") - - # Check homoscedasticity (Breusch-Pagan p < 0.05 indicates violation) - homosced_violation <- ifelse(assumption_results[[i]]$homoscedasticity$bp_p < 0.05, "VIOLATED", "OK") - - # Check independence (Durbin-Watson p < 0.05 indicates violation) - independence_violation <- ifelse(assumption_results[[i]]$independence$dw_p < 0.05, "VIOLATED", "OK") - - # Check for influential observations - influential_count <- length(assumption_results[[i]]$influence$influential_cooks) - influential_status <- ifelse(influential_count > 0, paste("YES (", influential_count, ")", sep = ""), "NO") - - # Linearity is assessed visually, so we'll mark as "CHECK VISUALLY" - linearity_status <- "CHECK VISUALLY" - - violation_summary <- rbind(violation_summary, data.frame( - Model = model_name, - Linearity = linearity_status, - Normality = normality_violation, - Homoscedasticity = homosced_violation, - Independence = independence_violation, - Influential_Obs = influential_status - )) -} - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - -cat("\n", rep("=", 80), "\n") -cat("MODEL COMPARISON SUMMARY\n") -cat(rep("=", 80), "\n") - -# Create model comparison table -comparison_table <- data.frame( - Model = model_names, - R_Squared = numeric(length(models)), - Adj_R_Squared = numeric(length(models)), - AIC = numeric(length(models)), - BIC = numeric(length(models)), - Significant = character(length(models)), - stringsAsFactors = FALSE -) - -for (i in seq_along(models)) { - summary_model <- model_summaries[[i]]$summary - comparison_table$R_Squared[i] <- summary_model$r.squared - comparison_table$Adj_R_Squared[i] <- summary_model$adj.r.squared - comparison_table$AIC[i] <- model_summaries[[i]]$aic - comparison_table$BIC[i] <- model_summaries[[i]]$bic - - # Check if predictor is significant (p < 0.05) - p_value <- summary_model$coefficients[2, 4] - comparison_table$Significant[i] <- ifelse(p_value < 0.05, "YES", "NO") -} - -print(comparison_table) - -# ============================================================================= -# 
RECOMMENDATIONS -# ============================================================================= - -cat("\n", rep("=", 80), "\n") -cat("RECOMMENDATIONS FOR ASSUMPTION VIOLATIONS\n") -cat(rep("=", 80), "\n") - -cat("\n1. NORMALITY VIOLATIONS:\n") -cat(" - If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox)\n") -cat(" - Alternative: Use robust regression methods or bootstrapping\n") - -cat("\n2. HOMOSCEDASTICITY VIOLATIONS:\n") -cat(" - If violated: Use weighted least squares or robust standard errors\n") -cat(" - Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors\n") - -cat("\n3. INDEPENDENCE VIOLATIONS:\n") -cat(" - If violated: Check for clustering or repeated measures structure\n") -cat(" - Alternative: Use mixed-effects models or clustered standard errors\n") - -cat("\n4. INFLUENTIAL OBSERVATIONS:\n") -cat(" - If present: Examine these cases for data entry errors\n") -cat(" - Consider: Running analysis with and without influential cases\n") -cat(" - Alternative: Use robust regression methods\n") - -cat("\n5. LINEARITY VIOLATIONS:\n") -cat(" - If violated: Add polynomial terms or use splines\n") -cat(" - Alternative: Transform predictors or use non-parametric methods\n") - -cat("\n", rep("=", 80), "\n") -cat("ANALYSIS COMPLETE\n") -cat(rep("=", 80), "\n") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016143059.r b/.history/eohi1/regressions e1 - assumptions_20251016143059.r deleted file mode 100644 index 63e1b1c..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016143059.r +++ /dev/null @@ -1,397 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Education levels:") -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):") -print(table(data_clean$demo_edu_numeric)) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN 
<- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age → eohiDGEN_mean -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age → ehi_global_mean -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept 
= 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - -print("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS") -print("=============================================") - -# Model 1: Sex → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 1: Sex → EOHI-DGEN Mean") -print("=============================================") - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- 
check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 2: Age → EOHI-DGEN Mean") -print("=============================================") - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 3: Education → EOHI-DGEN Mean") -print("=============================================") - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean -print("\n=============================================") -print("MODEL 4: Sex → EHI-Global Mean") -print("=============================================") - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age → EHI-Global Mean -print("\n=============================================") -print("MODEL 5: Age → EHI-Global Mean") -print("=============================================") - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") - -# Model 6: Education → EHI-Global Mean -print("\n=============================================") -print("MODEL 6: Education → EHI-Global Mean") -print("=============================================") - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- 
check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - -print("\n=============================================") -print("ASSUMPTION VIOLATION SUMMARY") -print("=============================================") - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - -print("\n=============================================") -print("MODEL COMPARISON SUMMARY") -print("=============================================") - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - 
model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared),
-  Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared,
-                    model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared),
-  AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic,
-          model4_summary$aic, model5_summary$aic, model6_summary$aic),
-  BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic,
-          model4_summary$bic, model5_summary$bic, model6_summary$bic),
-  Significant = c(
-    ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO")
-  ),
-  stringsAsFactors = FALSE
-)
-
-print(comparison_table)
-
-print("\n=============================================")
-print("ANALYSIS COMPLETE")
-print("=============================================")
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016143109.r b/.history/eohi1/regressions e1 - assumptions_20251016143109.r
deleted file mode 100644
index 63e1b1c..0000000
--- a/.history/eohi1/regressions e1 - assumptions_20251016143109.r
+++ /dev/null
@@ -1,397 +0,0 @@
-# Regression Analysis - Assumption Checking
-# IVs: demo_sex, demo_age, demo_edu
-# DVs: eohiDGEN_mean, ehi_global_mean
-
-options(scipen = 999)
-
-library(car)
-library(performance)
-library(see)
-library(ggplot2)
-library(gridExtra)
-library(dplyr)
-library(lmtest) # For bptest(); note that durbinWatsonTest() used below comes from car, not lmtest
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-data <- read.csv("ehi1.csv")
-
-# Check for missing values
-missing_summary <- data %>%
-  select(demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  summarise_all(~sum(is.na(.)))
-print("Missing values check:")
-print(missing_summary)
-
-# Remove rows with missing values
-data_clean <- data %>%
-  select(pID, demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  filter(complete.cases(.))
-
-print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x ")))
-
-# Recode demo_sex as numeric for regression (0 = Female, 1 = Male)
-data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0)
-
-# Check demo_edu levels and recode if needed
-print("Education levels:")
-print(table(data_clean$demo_edu))
-
-# Recode education as ordinal (assuming higher values = more education)
-edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate",
-                "University - Undergraduate", "University - Graduate")
-data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels)
-
-# Verify recoding
-print("Sex recoding (0=Female, 1=Male):")
-print(table(data_clean$demo_sex_numeric))
-print("Education recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):")
-print(table(data_clean$demo_edu_numeric))
-
-# =============================================================================
-# REGRESSION MODELS
-# =============================================================================
-
-# Define the 6 regression models
-models <- list()
-
-# Model 1: demo_sex → eohiDGEN_mean
-models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 2: demo_age → eohiDGEN_mean
-models$age_eohiDGEN <- lm(eohiDGEN_mean ~
demo_age, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age → ehi_global_mean -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - 
print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - -print("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS") -print("=============================================") - -# Model 1: Sex → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 1: Sex → EOHI-DGEN Mean") -print("=============================================") - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") 
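-
-# Aside (not part of the original snapshot): if the Breusch-Pagan test above
-# flags heteroscedasticity, the remedy named in the companion snapshots'
-# recommendations is heteroscedasticity-consistent (robust) standard errors.
-# A minimal sketch, assuming the sandwich package is installed; coeftest()
-# is from lmtest, which is already loaded:
-library(sandwich)
-coeftest(models$sex_eohiDGEN, vcov. = vcovHC(models$sex_eohiDGEN, type = "HC3"))
-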
-model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 2: Age → EOHI-DGEN Mean") -print("=============================================") - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 3: Education → EOHI-DGEN Mean") -print("=============================================") - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean -print("\n=============================================") -print("MODEL 4: Sex → EHI-Global Mean") -print("=============================================") - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age → EHI-Global Mean -print("\n=============================================") -print("MODEL 5: Age → EHI-Global Mean") -print("=============================================") - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") - -# Model 6: Education → EHI-Global Mean -print("\n=============================================") -print("MODEL 6: Education → EHI-Global Mean") -print("=============================================") - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") 
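-
-# Aside (not part of the original snapshot): the influence check that follows
-# flags cases above the 4/n Cook's distance cutoff used throughout these
-# snapshots. A common follow-up is to refit without the flagged cases and
-# compare coefficients; a minimal sketch under that convention ("flagged"
-# and "refit" are illustrative names):
-flagged <- which(cooks.distance(models$edu_ehi_global) > 4 / nobs(models$edu_ehi_global))
-if (length(flagged) > 0) {
-  refit <- update(models$edu_ehi_global, data = data_clean[-flagged, ])
-  print(cbind(full_fit = coef(models$edu_ehi_global), trimmed_fit = coef(refit)))
-}
-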
-model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - -print("\n=============================================") -print("ASSUMPTION VIOLATION SUMMARY") -print("=============================================") - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - -print("\n=============================================") -print("MODEL COMPARISON SUMMARY") -print("=============================================") - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, 
model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - -print("\n=============================================") -print("ANALYSIS COMPLETE") -print("=============================================") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016143341.r b/.history/eohi1/regressions e1 - assumptions_20251016143341.r deleted file mode 100644 index 63e1b1c..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016143341.r +++ /dev/null @@ -1,397 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Education levels:") -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):") -print(table(data_clean$demo_edu_numeric)) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age → eohiDGEN_mean -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: 
demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age → ehi_global_mean -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) 
{ - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - -print("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS") -print("=============================================") - -# Model 1: Sex → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 1: Sex → EOHI-DGEN Mean") -print("=============================================") - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age → EOHI-DGEN Mean 
-print("\n=============================================") -print("MODEL 2: Age → EOHI-DGEN Mean") -print("=============================================") - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 3: Education → EOHI-DGEN Mean") -print("=============================================") - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean -print("\n=============================================") -print("MODEL 4: Sex → EHI-Global Mean") -print("=============================================") - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age → EHI-Global Mean -print("\n=============================================") -print("MODEL 5: Age → EHI-Global Mean") -print("=============================================") - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") - -# Model 6: Education → EHI-Global Mean -print("\n=============================================") -print("MODEL 6: Education → EHI-Global Mean") -print("=============================================") - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF 
ASSUMPTION VIOLATIONS -# ============================================================================= - -print("\n=============================================") -print("ASSUMPTION VIOLATION SUMMARY") -print("=============================================") - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - -print("\n=============================================") -print("MODEL COMPARISON SUMMARY") -print("=============================================") - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - 
model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - -print("\n=============================================") -print("ANALYSIS COMPLETE") -print("=============================================") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016143415.r b/.history/eohi1/regressions e1 - assumptions_20251016143415.r deleted file mode 100644 index 63e1b1c..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016143415.r +++ /dev/null @@ -1,397 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Education levels:") -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):") -print(table(data_clean$demo_edu_numeric)) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age → eohiDGEN_mean -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age → ehi_global_mean -models$age_ehi_global <- 
lm(ehi_global_mean ~ demo_age, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate 
influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - -print("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS") -print("=============================================") - -# Model 1: Sex → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 1: Sex → EOHI-DGEN Mean") -print("=============================================") - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 2: Age → EOHI-DGEN Mean") -print("=============================================") - -model2_summary <- 
get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 3: Education → EOHI-DGEN Mean") -print("=============================================") - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean -print("\n=============================================") -print("MODEL 4: Sex → EHI-Global Mean") -print("=============================================") - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age → EHI-Global Mean -print("\n=============================================") -print("MODEL 5: Age → EHI-Global Mean") -print("=============================================") - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") - -# Model 6: Education → EHI-Global Mean -print("\n=============================================") -print("MODEL 6: Education → EHI-Global Mean") -print("=============================================") - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - -print("\n=============================================") 
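The violation table that follows repeats the same `ifelse()` six times per column. Assuming a `results` list like the one sketched earlier, the table can be assembled programmatically instead; illustrative only, with `flag()` and the field names being hypothetical:

```r
# Map a p-value to the script's VIOLATED/OK labels
flag <- function(p) ifelse(p < 0.05, "VIOLATED", "OK")

# `results` is the hypothetical list-of-lists from the earlier sketch
violation_summary <- data.frame(
  Model            = vapply(results, `[[`, character(1), "model"),
  Normality        = flag(vapply(results, `[[`, numeric(1), "shapiro_p")),
  Homoscedasticity = flag(vapply(results, `[[`, numeric(1), "bp_p")),
  Independence     = flag(vapply(results, `[[`, numeric(1), "dw_p")),
  stringsAsFactors = FALSE
)
print(violation_summary)
```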
-print("ASSUMPTION VIOLATION SUMMARY") -print("=============================================") - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - -print("\n=============================================") -print("MODEL COMPARISON SUMMARY") -print("=============================================") - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, 
model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - -print("\n=============================================") -print("ANALYSIS COMPLETE") -print("=============================================") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016143441.r b/.history/eohi1/regressions e1 - assumptions_20251016143441.r deleted file mode 100644 index 63e1b1c..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016143441.r +++ /dev/null @@ -1,397 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Education levels:") -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):") -print(table(data_clean$demo_edu_numeric)) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age → eohiDGEN_mean -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age → ehi_global_mean -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = 
data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - 
cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - -print("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS") -print("=============================================") - -# Model 1: Sex → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 1: Sex → EOHI-DGEN Mean") -print("=============================================") - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 2: Age → EOHI-DGEN Mean") -print("=============================================") - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- 
check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 3: Education → EOHI-DGEN Mean") -print("=============================================") - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean -print("\n=============================================") -print("MODEL 4: Sex → EHI-Global Mean") -print("=============================================") - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age → EHI-Global Mean -print("\n=============================================") -print("MODEL 5: Age → EHI-Global Mean") -print("=============================================") - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") - -# Model 6: Education → EHI-Global Mean -print("\n=============================================") -print("MODEL 6: Education → EHI-Global Mean") -print("=============================================") - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - -print("\n=============================================") -print("ASSUMPTION VIOLATION SUMMARY") -print("=============================================") - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex 
→ EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - -print("\n=============================================") -print("MODEL COMPARISON SUMMARY") -print("=============================================") - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 
4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - -print("\n=============================================") -print("ANALYSIS COMPLETE") -print("=============================================") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016144540.qmd b/.history/eohi1/regressions e1 - assumptions_20251016144540.qmd deleted file mode 100644 index 2b9244f..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016144540.qmd +++ /dev/null @@ -1,463 +0,0 @@ ---- -title: "Regression Analysis - Assumption Checking" -subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean" -author: "Irina" -date: today -format: - html: - theme: cosmo - toc: true - toc-depth: 3 - code-fold: false - code-tools: true -execute: - echo: true - warning: false - message: false ---- - -## Setup and Data Preparation - -```{r setup} -#| label: setup -#| echo: true - -# Load required libraries -library(car) -library(performance) -library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -# Set options -options(scipen = 999) - -# Set working directory and load data -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") -``` - -```{r data-prep} -#| label: data-prep -#| echo: true - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) - -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Education levels:") -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):") -print(table(data_clean$demo_edu_numeric)) -``` - -## Regression Models - -```{r models} -#| label: models -#| echo: true - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age → eohiDGEN_mean -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age → ehi_global_mean -models$age_ehi_global <- 
lm(ehi_global_mean ~ demo_age, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) -``` - -## Assumption Checking Functions - -```{r functions} -#| label: functions -#| echo: true - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- 
-
-## Assumption Checking Functions
-
-```{r functions}
-#| label: functions
-#| echo: true
-
-# Function to check linearity assumption
-check_linearity <- function(model, model_name) {
-  print(paste("=== LINEARITY CHECK:", model_name, "==="))
-
-  # Residuals vs Fitted plot
-  plot(model, which = 1, main = paste("Linearity:", model_name))
-
-  # Component + residual plot (partial residual plot)
-  crPlots(model, main = paste("Component+Residual Plot:", model_name))
-
-  return(NULL)
-}
-
-# Function to check normality of residuals
-check_normality <- function(model, model_name) {
-  print(paste("=== NORMALITY CHECK:", model_name, "==="))
-
-  # Q-Q plot
-  plot(model, which = 2, main = paste("Q-Q Plot:", model_name))
-
-  # Shapiro-Wilk test
-  residuals <- residuals(model)
-  shapiro_test <- shapiro.test(residuals)
-  print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5)))
-
-  # Kolmogorov-Smirnov test
-  ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals))
-  print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5)))
-
-  # Histogram of residuals
-  hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) +
-    geom_histogram(bins = 30, fill = "lightblue", color = "black") +
-    ggtitle(paste("Residuals Histogram:", model_name)) +
-    theme_minimal()
-  print(hist_plot)
-
-  return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value))
-}
-
-# Function to check homoscedasticity (constant variance)
-check_homoscedasticity <- function(model, model_name) {
-  print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "==="))
-
-  # Scale-Location plot
-  plot(model, which = 3, main = paste("Scale-Location Plot:", model_name))
-
-  # Breusch-Pagan test
-  bp_test <- bptest(model)
-  print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5)))
-
-  # White test (if available)
-  tryCatch({
-    white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2))
-    print(paste("White test p-value:", format(white_test$p.value, digits = 5)))
-  }, error = function(e) {
-    print("White test not available for this model")
-  })
-
-  return(list(bp_p = bp_test$p.value))
-}
-
-# Function to check independence (no autocorrelation)
-check_independence <- function(model, model_name) {
-  print(paste("=== INDEPENDENCE CHECK:", model_name, "==="))
-
-  # Durbin-Watson test
-  dw_test <- durbinWatsonTest(model)
-  print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5)))
-  print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5)))
-
-  # Residuals vs Order plot
-  residuals_vs_order <- ggplot(data.frame(
-    residuals = residuals(model),
-    order = seq_along(residuals(model))
-  ), aes(x = order, y = residuals)) +
-    geom_point(color = "black") +
-    geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
-    ggtitle(paste("Residuals vs Order:", model_name)) +
-    theme_minimal()
-  print(residuals_vs_order)
-
-  return(list(dw_stat = dw_test$dw, dw_p = dw_test$p))
-}
-
-# Function to check for influential observations
-check_influence <- function(model, model_name) {
-  print(paste("=== INFLUENCE CHECK:", model_name, "==="))
-
-  # Cook's Distance plot
-  plot(model, which = 4, main = paste("Cook's Distance:", model_name))
-
-  # Calculate influence measures
-  cooks_d <- cooks.distance(model)
-  leverage <- hatvalues(model)
-  dffits_val <- dffits(model)
-
-  # Identify influential observations
-  cooks_threshold <- 4/length(cooks_d) # Cook's D threshold
-  leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold
-  dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold
-
-  influential_cooks <- which(cooks_d > cooks_threshold)
-  influential_leverage <- which(leverage > leverage_threshold)
-  influential_dffits <- which(abs(dffits_val) > dffits_threshold)
-
-  print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5)))
-  print(paste("Influential observations (Cook's D):", length(influential_cooks)))
-  print(paste("Leverage threshold:", format(leverage_threshold, digits = 5)))
-  print(paste("High leverage observations:", length(influential_leverage)))
-  print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5)))
-  print(paste("Influential observations (DFFITS):", length(influential_dffits)))
-
-  if (length(influential_cooks) > 0) {
-    print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", ")))
-  }
-  if (length(influential_leverage) > 0) {
-    print(paste("High leverage cases:", paste(influential_leverage, collapse = ", ")))
-  }
-  if (length(influential_dffits) > 0) {
-    print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", ")))
-  }
-
-  return(list(influential_cooks = influential_cooks,
-              influential_leverage = influential_leverage,
-              influential_dffits = influential_dffits))
-}
-
-# Function to get comprehensive model summary
-get_model_summary <- function(model, model_name) {
-  print(paste("=== MODEL SUMMARY:", model_name, "==="))
-
-  # Basic model summary
-  summary_model <- summary(model)
-  print(summary_model)
-
-  # R-squared and adjusted R-squared
-  print(paste("R-squared:", format(summary_model$r.squared, digits = 5)))
-  print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5)))
-
-  # AIC and BIC
-  aic_val <- AIC(model)
-  bic_val <- BIC(model)
-  print(paste("AIC:", format(aic_val, digits = 5)))
-  print(paste("BIC:", format(bic_val, digits = 5)))
-
-  return(list(summary = summary_model, r_squared = summary_model$r.squared,
-              adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val))
-}
-```
"Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean - -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") -``` - -## Model 6: Education → EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## Summary Tables - -### Assumption Violation Summary - -```{r violation-summary} -#| label: violation-summary -#| echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", 
"OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model Comparison Summary - -```{r comparison-summary} -#| label: comparison-summary -#| echo: true - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) -``` - -## Recommendations - -### For Assumption Violations: - -**1. Normality Violations:** -- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox) -- Alternative: Use robust regression methods or bootstrapping - -**2. Homoscedasticity Violations:** -- If violated: Use weighted least squares or robust standard errors -- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors - -**3. Independence Violations:** -- If violated: Check for clustering or repeated measures structure -- Alternative: Use mixed-effects models or clustered standard errors - -**4. Influential Observations:** -- If present: Examine these cases for data entry errors -- Consider: Running analysis with and without influential cases -- Alternative: Use robust regression methods - -**5. 
-
-## Recommendations
-
-### For Assumption Violations:
-
-**1. Normality Violations:**
-- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox)
-- Alternative: Use robust regression methods or bootstrapping
-
-**2. Homoscedasticity Violations:**
-- If violated: Use weighted least squares or robust standard errors
-- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors
-
-**3. Independence Violations:**
-- If violated: Check for clustering or repeated measures structure
-- Alternative: Use mixed-effects models or clustered standard errors
-
-**4. Influential Observations:**
-- If present: Examine these cases for data entry errors
-- Consider: Running analysis with and without influential cases
-- Alternative: Use robust regression methods
-
-**5. Linearity Violations:**
-- If violated: Add polynomial terms or use splines
-- Alternative: Transform predictors or use non-parametric methods
-
----
-
-*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.*
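The notebook's Recommendations name heteroscedasticity-consistent (robust) standard errors as the remedy when the Breusch-Pagan test flags a violation. A hedged sketch of that remedy using the `sandwich` package (an extra dependency; the notebook itself loads only `lmtest`), with `mtcars` as a stand-in dataset:

```r
library(lmtest)    # coeftest(), coefci()
library(sandwich)  # vcovHC()

# Stand-in model for illustration; the notebook's lm() fits would
# drop into the same calls unchanged
fit <- lm(mpg ~ wt, data = mtcars)

# HC3 robust standard errors replace the OLS covariance matrix
coeftest(fit, vcov. = vcovHC(fit, type = "HC3"))

# The same robust covariance also feeds confidence intervals
coefci(fit, vcov. = vcovHC(fit, type = "HC3"))
```

HC3 is a common choice for small samples; `vcovHC()` accepts other estimators ("HC0" through "HC4") through the same `type` argument.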
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016144554.qmd b/.history/eohi1/regressions e1 - assumptions_20251016144554.qmd
deleted file mode 100644
index 2b9244f..0000000
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016144652.qmd b/.history/eohi1/regressions e1 - assumptions_20251016144652.qmd
deleted file mode 100644
index 2b9244f..0000000
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016145531.qmd b/.history/eohi1/regressions e1 - assumptions_20251016145531.qmd
deleted file mode 100644
index f5e7a0b..0000000
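The snapshot below (and its identical twin at 145531 above, blob f5e7a0b) differs from the earlier 2b9244f revisions only in moving `library(dplyr)` to the top of the setup chunk. Attach order determines which package's functions mask which, so an alternative that works regardless of order is to qualify the calls with `dplyr::`. A small sketch under the assumption of a generic `select()` masking conflict:

```r
library(dplyr)

df <- data.frame(a = c(1, NA, 3), b = c(NA, NA, 6))

# dplyr:: prefixes keep the pipeline intact even if a package
# attached later masks select() or summarise()
df %>%
  dplyr::select(a, b) %>%
  dplyr::summarise(dplyr::across(dplyr::everything(), ~ sum(is.na(.))))
```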
Linearity Violations:** -- If violated: Add polynomial terms or use splines -- Alternative: Transform predictors or use non-parametric methods - ---- - -*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.* diff --git a/.history/eohi1/regressions e1 - assumptions_20251016145539.qmd b/.history/eohi1/regressions e1 - assumptions_20251016145539.qmd deleted file mode 100644 index f5e7a0b..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016145539.qmd +++ /dev/null @@ -1,463 +0,0 @@ ---- -title: "Regression Analysis - Assumption Checking" -subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean" -author: "Irina" -date: today -format: - html: - theme: cosmo - toc: true - toc-depth: 3 - code-fold: false - code-tools: true -execute: - echo: true - warning: false - message: false ---- - -## Setup and Data Preparation - -```{r setup} -#| label: setup -#| echo: true - -# Load required libraries -library(dplyr) # Must load first for %>% operator -library(car) -library(performance) -library(see) -library(ggplot2) -library(gridExtra) -library(lmtest) # For bptest and durbinWatsonTest - -# Set options -options(scipen = 999) - -# Set working directory and load data -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") -``` - -```{r data-prep} -#| label: data-prep -#| echo: true - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) - -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Education levels:") -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):") -print(table(data_clean$demo_edu_numeric)) -``` - -## Regression Models - -```{r models} -#| label: models -#| echo: true - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age → eohiDGEN_mean -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age → ehi_global_mean -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) -``` - -## Assumption Checking 
Functions - -```{r functions} -#| label: functions -#| echo: true - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - 
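# Editor's note: 4/n (Cook's D), 2p/n (leverage), and 2*sqrt(p/n) (DFFITS) are - # conventional rule-of-thumb cutoffs from the regression-diagnostics literature; - # flagged cases are candidates for inspection, not automatic removal. -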
dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` - -## Model 1: Sex → EOHI-DGEN Mean - -```{r model1} -#| label: model1 -#| echo: true - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") 
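-# Editor's sketch (hypothetical wrapper, not in the original file): the six -# near-identical per-model blocks in this document could be collapsed into one call, e.g. -# run_checks <- function(m, name) list(summary = get_model_summary(m, name), -#   normality = check_normality(m, name), homosced = check_homoscedasticity(m, name), -#   independence = check_independence(m, name), influence = check_influence(m, name), -#   linearity = check_linearity(m, name)) -# results <- Map(run_checks, models, names(models))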
-model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean - -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") -``` - -## Model 6: Education → EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## Summary Tables - -### Assumption Violation Summary - -```{r violation-summary} -#| label: violation-summary -#| echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - 
Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model Comparison Summary - -```{r comparison-summary} -#| label: comparison-summary -#| echo: true - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) -``` - -## Recommendations - -### For Assumption Violations: - -**1. Normality Violations:** -- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox) -- Alternative: Use robust regression methods or bootstrapping - -**2. Homoscedasticity Violations:** -- If violated: Use weighted least squares or robust standard errors -- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors - -**3. Independence Violations:** -- If violated: Check for clustering or repeated measures structure -- Alternative: Use mixed-effects models or clustered standard errors - -**4. Influential Observations:** -- If present: Examine these cases for data entry errors -- Consider: Running analysis with and without influential cases -- Alternative: Use robust regression methods - -**5. 
Linearity Violations:** -- If violated: Add polynomial terms or use splines -- Alternative: Transform predictors or use non-parametric methods - ---- - -*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.* diff --git a/.history/eohi1/regressions e1 - assumptions_20251016145845.qmd b/.history/eohi1/regressions e1 - assumptions_20251016145845.qmd deleted file mode 100644 index f5e7a0b..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016145845.qmd +++ /dev/null @@ -1,463 +0,0 @@ ---- -title: "Regression Analysis - Assumption Checking" -subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean" -author: "Irina" -date: today -format: - html: - theme: cosmo - toc: true - toc-depth: 3 - code-fold: false - code-tools: true -execute: - echo: true - warning: false - message: false ---- - -## Setup and Data Preparation - -```{r setup} -#| label: setup -#| echo: true - -# Load required libraries -library(dplyr) # Must load first for %>% operator -library(car) -library(performance) -library(see) -library(ggplot2) -library(gridExtra) -library(lmtest) # For bptest and durbinWatsonTest - -# Set options -options(scipen = 999) - -# Set working directory and load data -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") -``` - -```{r data-prep} -#| label: data-prep -#| echo: true - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) - -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Education levels:") -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):") -print(table(data_clean$demo_edu_numeric)) -``` - -## Regression Models - -```{r models} -#| label: models -#| echo: true - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age → eohiDGEN_mean -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age → ehi_global_mean -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) -``` - -## Assumption Checking 
Functions - -```{r functions} -#| label: functions -#| echo: true - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - 
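# Editor's note: base R's influence.measures(model) computes DFFITS, Cook's D, hat - # values and DFBETAS in one call and flags unusual rows; it could replace the - # manual thresholds below. -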
dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` - -## Model 1: Sex → EOHI-DGEN Mean - -```{r model1} -#| label: model1 -#| echo: true - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") 
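-# Editor's note: with six models each tested at alpha = .05, consider a multiplicity -# adjustment (e.g. p.adjust(pvals, method = "holm")) when reading the significance -# column in the summary tables further down.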
-model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean - -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") -``` - -## Model 6: Education → EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## Summary Tables - -### Assumption Violation Summary - -```{r violation-summary} -#| label: violation-summary -#| echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - 
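# Editor's note: shapiro.test is sensitive to sample size -- with large n it can - # flag trivial departures from normality, so read this column alongside the Q-Q - # plots produced above. -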
Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model Comparison Summary - -```{r comparison-summary} -#| label: comparison-summary -#| echo: true - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) -``` - -## Recommendations - -### For Assumption Violations: - -**1. Normality Violations:** -- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox) -- Alternative: Use robust regression methods or bootstrapping - -**2. Homoscedasticity Violations:** -- If violated: Use weighted least squares or robust standard errors -- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors - -**3. Independence Violations:** -- If violated: Check for clustering or repeated measures structure -- Alternative: Use mixed-effects models or clustered standard errors - -**4. Influential Observations:** -- If present: Examine these cases for data entry errors -- Consider: Running analysis with and without influential cases -- Alternative: Use robust regression methods - -**5. 
Linearity Violations:** -- If violated: Add polynomial terms or use splines -- Alternative: Transform predictors or use non-parametric methods - ---- - -*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.* diff --git a/.history/eohi1/regressions e1 - assumptions_20251016150135.qmd b/.history/eohi1/regressions e1 - assumptions_20251016150135.qmd deleted file mode 100644 index 3d30af9..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016150135.qmd +++ /dev/null @@ -1,463 +0,0 @@ ---- -title: "Regression Analysis - Assumption Checking" -subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean" -author: "Irina" -date: today -format: - html: - theme: cosmo - toc: true - toc-depth: 3 - code-fold: false - code-tools: true -execute: - echo: true - warning: false - message: false ---- - -## Setup and Data Preparation - -```{r setup} -#| label: setup -#| echo: true - -# Load required libraries -library(dplyr) # Must load first for %>% operator -library(car) -library(performance) -library(see) -library(ggplot2) -library(gridExtra) -library(lmtest) # For bptest and durbinWatsonTest - -# Set options -options(scipen = 999) - -# Set working directory and load data -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") -``` - -```{r data-prep} -#| label: data-prep -#| echo: true - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) - -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Education levels:") -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):") -print(table(data_clean$demo_edu_numeric)) -``` - -## Regression Models - -```{r models} -#| label: models -#| echo: true - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age → eohiDGEN_mean -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age → ehi_global_mean -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) -``` - -## Assumption 
Checking Functions - -```{r functions} -#| label: functions -#| echo: true - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - 
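# Editor's note: if many observations exceed these cutoffs, a robust fit such as - # MASS::rlm(formula, data) is a common cross-check against the OLS estimates. -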
dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` - -## Model 1: Sex → EOHI-DGEN Mean - -```{r model1} -#| label: model1 -#| echo: true - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") 
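-# Editor's note: since the performance and see packages are loaded, -# performance::check_model(models$edu_eohiDGEN) would draw these diagnostics -# as one combined panel.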
-model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean - -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") -``` - -## Model 6: Education → EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## Summary Tables - -### Assumption Violation Summary - -```{r violation-summary} -#| label: violation-summary -#| echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - 
Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model Comparison Summary - -```{r comparison-summary} -#| label: comparison-summary -#| echo: true - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) -``` - -## Recommendations - -### For Assumption Violations: - -**1. Normality Violations:** -- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox) -- Alternative: Use robust regression methods or bootstrapping - -**2. Homoscedasticity Violations:** -- If violated: Use weighted least squares or robust standard errors -- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors - -**3. Independence Violations:** -- If violated: Check for clustering or repeated measures structure -- Alternative: Use mixed-effects models or clustered standard errors - -**4. Influential Observations:** -- If present: Examine these cases for data entry errors -- Consider: Running analysis with and without influential cases -- Alternative: Use robust regression methods - -**5. 
Linearity Violations:** -- If violated: Add polynomial terms or use splines -- Alternative: Transform predictors or use non-parametric methods - ---- - -*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.* diff --git a/.history/eohi1/regressions e1 - assumptions_20251016150355.qmd b/.history/eohi1/regressions e1 - assumptions_20251016150355.qmd deleted file mode 100644 index f098c5b..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016150355.qmd +++ /dev/null @@ -1,467 +0,0 @@ ---- -title: "Regression Analysis - Assumption Checking" -subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean" -author: "Irina" -date: today -format: - html: - theme: cosmo - toc: true - toc-depth: 3 - code-fold: false - code-tools: true -execute: - echo: true - warning: false - message: false ---- - -## Setup and Data Preparation - -```{r setup} -#| label: setup -#| echo: true - -# Load required libraries -library(dplyr) # Must load first for %>% operator -library(car) -library(performance) -library(see) -library(ggplot2) -library(gridExtra) -library(lmtest) # For bptest and durbinWatsonTest - -# Set options -options(scipen = 999) - -# Set working directory and load data -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") -``` - -```{r data-prep} -#| label: data-prep -#| echo: true - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) - -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and create dummy variables -print("Education levels:") -print(table(data_clean$demo_edu)) - -# Create dummy variables for education (6 levels) -data_clean$edu_hs <- ifelse(data_clean$demo_edu == "High School (or equivalent)", 1, 0) -data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0) -data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0) -data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0) - -# Verify dummy coding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education dummy variables:") -print("High School:", sum(data_clean$edu_hs)) -print("College:", sum(data_clean$edu_college)) -print("Undergraduate:", sum(data_clean$edu_undergrad)) -print("Graduate:", sum(data_clean$edu_grad)) -``` - -## Regression Models - -```{r models} -#| label: models -#| echo: true - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age → eohiDGEN_mean -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - 
-# Model 5: demo_age (column demo_age_1) → ehi_global_mean
-models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1, data = data_clean)
-
-# Model 6: demo_edu (dummy variables) → ehi_global_mean
-models$edu_ehi_global <- lm(ehi_global_mean ~ edu_hs + edu_college + edu_undergrad + edu_grad, data = data_clean)
-```
-
-## Assumption Checking Functions
-
-```{r functions}
-#| label: functions
-#| echo: true
-
-# Function to check linearity assumption
-check_linearity <- function(model, model_name) {
-  print(paste("=== LINEARITY CHECK:", model_name, "==="))
-
-  # Residuals vs Fitted plot
-  plot(model, which = 1, main = paste("Linearity:", model_name))
-
-  # Component + residual plot (partial residual plot)
-  crPlots(model, main = paste("Component+Residual Plot:", model_name))
-
-  return(NULL)
-}
-
-# Function to check normality of residuals
-check_normality <- function(model, model_name) {
-  print(paste("=== NORMALITY CHECK:", model_name, "==="))
-
-  # Q-Q plot
-  plot(model, which = 2, main = paste("Q-Q Plot:", model_name))
-
-  # Shapiro-Wilk test
-  residuals <- residuals(model)
-  shapiro_test <- shapiro.test(residuals)
-  print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5)))
-
-  # Kolmogorov-Smirnov test
-  ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals))
-  print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5)))
-
-  # Histogram of residuals
-  hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) +
-    geom_histogram(bins = 30, fill = "lightblue", color = "black") +
-    ggtitle(paste("Residuals Histogram:", model_name)) +
-    theme_minimal()
-  print(hist_plot)
-
-  return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value))
-}
-
-# Function to check homoscedasticity (constant variance)
-check_homoscedasticity <- function(model, model_name) {
-  print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "==="))
-
-  # Scale-Location plot
-  plot(model, which = 3, main = paste("Scale-Location Plot:", model_name))
-
-  # Breusch-Pagan test
-  bp_test <- bptest(model)
-  print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5)))
-
-  # White test (if available)
-  tryCatch({
-    white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2))
-    print(paste("White test p-value:", format(white_test$p.value, digits = 5)))
-  }, error = function(e) {
-    print("White test not available for this model")
-  })
-
-  return(list(bp_p = bp_test$p.value))
-}
-
-# Function to check independence (no autocorrelation)
-check_independence <- function(model, model_name) {
-  print(paste("=== INDEPENDENCE CHECK:", model_name, "==="))
-
-  # Durbin-Watson test
-  dw_test <- durbinWatsonTest(model)
-  print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5)))
-  print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5)))
-
-  # Residuals vs Order plot
-  residuals_vs_order <- ggplot(data.frame(
-    residuals = residuals(model),
-    order = seq_along(residuals(model))
-  ), aes(x = order, y = residuals)) +
-    geom_point(color = "black") +
-    geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
-    ggtitle(paste("Residuals vs Order:", model_name)) +
-    theme_minimal()
-  print(residuals_vs_order)
-
-  return(list(dw_stat = dw_test$dw, dw_p = dw_test$p))
-}
-
-# Function to check for influential observations
-check_influence <- function(model, model_name) {
-  print(paste("=== INFLUENCE CHECK:", model_name, "==="))
-
-  # Cook's Distance plot
-  plot(model, which = 4, main = paste("Cook's Distance:", model_name))
-
-  # Calculate influence measures
-  cooks_d <-
cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` - -## Model 1: Sex → EOHI-DGEN Mean - -```{r model1} -#| label: model1 -#| echo: true - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: 
true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean - -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") -``` - -## Model 6: Education → EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## Summary Tables - -### Assumption Violation Summary - -```{r violation-summary} -#| label: violation-summary -#| echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, 
"VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model Comparison Summary - -```{r comparison-summary} -#| label: comparison-summary -#| echo: true - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) -``` - -## Recommendations - -### For Assumption Violations: - -**1. Normality Violations:** -- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox) -- Alternative: Use robust regression methods or bootstrapping - -**2. Homoscedasticity Violations:** -- If violated: Use weighted least squares or robust standard errors -- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors - -**3. Independence Violations:** -- If violated: Check for clustering or repeated measures structure -- Alternative: Use mixed-effects models or clustered standard errors - -**4. Influential Observations:** -- If present: Examine these cases for data entry errors -- Consider: Running analysis with and without influential cases -- Alternative: Use robust regression methods - -**5. 
Linearity Violations:**
-- If violated: Add polynomial terms or use splines
-- Alternative: Transform predictors or use non-parametric methods
-
----
-
-*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.*
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016150436.qmd b/.history/eohi1/regressions e1 - assumptions_20251016150436.qmd
deleted file mode 100644
index f098c5b..0000000
--- a/.history/eohi1/regressions e1 - assumptions_20251016150436.qmd
+++ /dev/null
@@ -1,467 +0,0 @@
----
-title: "Regression Analysis - Assumption Checking"
-subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean"
-author: "Irina"
-date: today
-format:
-  html:
-    theme: cosmo
-    toc: true
-    toc-depth: 3
-    code-fold: false
-    code-tools: true
-execute:
-  echo: true
-  warning: false
-  message: false
----
-
-## Setup and Data Preparation
-
-```{r setup}
-#| label: setup
-#| echo: true
-
-# Load required libraries
-library(dplyr) # Must load first for %>% operator
-library(car) # For crPlots and durbinWatsonTest
-library(performance)
-library(see)
-library(ggplot2)
-library(gridExtra)
-library(lmtest) # For bptest
-
-# Set options
-options(scipen = 999)
-
-# Set working directory and load data
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-data <- read.csv("ehi1.csv")
-```
-
-```{r data-prep}
-#| label: data-prep
-#| echo: true
-
-# Check for missing values
-missing_summary <- data %>%
-  select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  summarise_all(~sum(is.na(.)))
-
-print("Missing values check:")
-print(missing_summary)
-
-# Remove rows with missing values
-data_clean <- data %>%
-  select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  filter(complete.cases(.))
-
-print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x ")))
-
-# Recode demo_sex as numeric for regression (0 = Female, 1 = Male)
-data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0)
-
-# Check demo_edu levels and create dummy variables
-print("Education levels:")
-print(table(data_clean$demo_edu))
-
-# Create dummy variables for education (4 of the 6 levels; uncoded levels form the reference)
-data_clean$edu_hs <- ifelse(data_clean$demo_edu == "High School (or equivalent)", 1, 0)
-data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0)
-data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0)
-data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0)
-
-# Verify dummy coding
-print("Sex recoding (0=Female, 1=Male):")
-print(table(data_clean$demo_sex_numeric))
-print("Education dummy variables:")
-cat("High School:", sum(data_clean$edu_hs), "\n")
-cat("College:", sum(data_clean$edu_college), "\n")
-cat("Undergraduate:", sum(data_clean$edu_undergrad), "\n")
-cat("Graduate:", sum(data_clean$edu_grad), "\n")
-```
-
-## Regression Models
-
-```{r models}
-#| label: models
-#| echo: true
-
-# Define the 6 regression models
-models <- list()
-
-# Model 1: demo_sex → eohiDGEN_mean
-models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 2: demo_age (column demo_age_1) → eohiDGEN_mean
-models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1, data = data_clean)
-
-# Model 3: demo_edu (dummy variables) → eohiDGEN_mean
-models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ edu_hs + edu_college + edu_undergrad + edu_grad, data = data_clean)
-
-# Model 4: demo_sex → ehi_global_mean
-models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean)
-
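# Aside (an illustrative addition, not part of the original file): an easy guard
# against formula/column mismatches such as demo_age vs demo_age_1 is to assert
# up front that every predictor referenced in these formulas exists in the data:
stopifnot(all(c("demo_sex_numeric", "demo_age_1", "edu_hs", "edu_college",
                "edu_undergrad", "edu_grad") %in% names(data_clean)))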
-# Model 5: demo_age (column demo_age_1) → ehi_global_mean
-models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1, data = data_clean)
-
-# Model 6: demo_edu (dummy variables) → ehi_global_mean
-models$edu_ehi_global <- lm(ehi_global_mean ~ edu_hs + edu_college + edu_undergrad + edu_grad, data = data_clean)
-```
-
-## Assumption Checking Functions
-
-```{r functions}
-#| label: functions
-#| echo: true
-
-# Function to check linearity assumption
-check_linearity <- function(model, model_name) {
-  print(paste("=== LINEARITY CHECK:", model_name, "==="))
-
-  # Residuals vs Fitted plot
-  plot(model, which = 1, main = paste("Linearity:", model_name))
-
-  # Component + residual plot (partial residual plot)
-  crPlots(model, main = paste("Component+Residual Plot:", model_name))
-
-  return(NULL)
-}
-
-# Function to check normality of residuals
-check_normality <- function(model, model_name) {
-  print(paste("=== NORMALITY CHECK:", model_name, "==="))
-
-  # Q-Q plot
-  plot(model, which = 2, main = paste("Q-Q Plot:", model_name))
-
-  # Shapiro-Wilk test
-  residuals <- residuals(model)
-  shapiro_test <- shapiro.test(residuals)
-  print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5)))
-
-  # Kolmogorov-Smirnov test
-  ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals))
-  print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5)))
-
-  # Histogram of residuals
-  hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) +
-    geom_histogram(bins = 30, fill = "lightblue", color = "black") +
-    ggtitle(paste("Residuals Histogram:", model_name)) +
-    theme_minimal()
-  print(hist_plot)
-
-  return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value))
-}
-
-# Function to check homoscedasticity (constant variance)
-check_homoscedasticity <- function(model, model_name) {
-  print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "==="))
-
-  # Scale-Location plot
-  plot(model, which = 3, main = paste("Scale-Location Plot:", model_name))
-
-  # Breusch-Pagan test
-  bp_test <- bptest(model)
-  print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5)))
-
-  # White test (if available)
-  tryCatch({
-    white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2))
-    print(paste("White test p-value:", format(white_test$p.value, digits = 5)))
-  }, error = function(e) {
-    print("White test not available for this model")
-  })
-
-  return(list(bp_p = bp_test$p.value))
-}
-
-# Function to check independence (no autocorrelation)
-check_independence <- function(model, model_name) {
-  print(paste("=== INDEPENDENCE CHECK:", model_name, "==="))
-
-  # Durbin-Watson test
-  dw_test <- durbinWatsonTest(model)
-  print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5)))
-  print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5)))
-
-  # Residuals vs Order plot
-  residuals_vs_order <- ggplot(data.frame(
-    residuals = residuals(model),
-    order = seq_along(residuals(model))
-  ), aes(x = order, y = residuals)) +
-    geom_point(color = "black") +
-    geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
-    ggtitle(paste("Residuals vs Order:", model_name)) +
-    theme_minimal()
-  print(residuals_vs_order)
-
-  return(list(dw_stat = dw_test$dw, dw_p = dw_test$p))
-}
-
-# Function to check for influential observations
-check_influence <- function(model, model_name) {
-  print(paste("=== INFLUENCE CHECK:", model_name, "==="))
-
-  # Cook's Distance plot
-  plot(model, which = 4, main = paste("Cook's Distance:", model_name))
-
-  # Calculate influence measures
-  cooks_d <-
cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` - -## Model 1: Sex → EOHI-DGEN Mean - -```{r model1} -#| label: model1 -#| echo: true - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: 
true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean - -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") -``` - -## Model 6: Education → EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## Summary Tables - -### Assumption Violation Summary - -```{r violation-summary} -#| label: violation-summary -#| echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, 
"VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model Comparison Summary - -```{r comparison-summary} -#| label: comparison-summary -#| echo: true - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) -``` - -## Recommendations - -### For Assumption Violations: - -**1. Normality Violations:** -- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox) -- Alternative: Use robust regression methods or bootstrapping - -**2. Homoscedasticity Violations:** -- If violated: Use weighted least squares or robust standard errors -- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors - -**3. Independence Violations:** -- If violated: Check for clustering or repeated measures structure -- Alternative: Use mixed-effects models or clustered standard errors - -**4. Influential Observations:** -- If present: Examine these cases for data entry errors -- Consider: Running analysis with and without influential cases -- Alternative: Use robust regression methods - -**5. 
Linearity Violations:**
-- If violated: Add polynomial terms or use splines
-- Alternative: Transform predictors or use non-parametric methods
-
----
-
-*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.*
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016150455.qmd b/.history/eohi1/regressions e1 - assumptions_20251016150455.qmd
deleted file mode 100644
index 89e657f..0000000
--- a/.history/eohi1/regressions e1 - assumptions_20251016150455.qmd
+++ /dev/null
@@ -1,479 +0,0 @@
----
-title: "Regression Analysis - Assumption Checking"
-subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean"
-author: "Irina"
-date: today
-format:
-  html:
-    theme: cosmo
-    toc: true
-    toc-depth: 3
-    code-fold: false
-    code-tools: true
-execute:
-  echo: true
-  warning: false
-  message: false
----
-
-## Setup and Data Preparation
-
-```{r setup}
-#| label: setup
-#| echo: true
-
-# Load required libraries
-library(dplyr) # Must load first for %>% operator
-library(car) # For crPlots and durbinWatsonTest
-library(performance)
-library(see)
-library(ggplot2)
-library(gridExtra)
-library(lmtest) # For bptest
-
-# Set options
-options(scipen = 999)
-
-# Set working directory and load data
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-data <- read.csv("ehi1.csv")
-```
-
-```{r data-prep}
-#| label: data-prep
-#| echo: true
-
-# Check for missing values
-missing_summary <- data %>%
-  select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  summarise_all(~sum(is.na(.)))
-
-print("Missing values check:")
-print(missing_summary)
-
-# Remove rows with missing values
-data_clean <- data %>%
-  select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  filter(complete.cases(.))
-
-print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x ")))
-
-# Recode demo_sex as numeric for regression (0 = Female, 1 = Male)
-data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0)
-
-# Check demo_edu levels and create dummy variables
-print("Education levels:")
-edu_table <- table(data_clean$demo_edu)
-print(edu_table)
-print(paste("Number of education levels:", length(unique(data_clean$demo_edu))))
-
-# Create dummy variables for education (k-1 coding: 7 levels would need 6 dummies; 3 are coded so far)
-# Using High School as reference category (excluded dummy)
-data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0)
-data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0)
-data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0)
-
-# Check if there are other education levels
-edu_levels <- unique(data_clean$demo_edu)
-print("All education levels found:")
-for(i in 1:length(edu_levels)) {
-  print(paste(i, ":", edu_levels[i]))
-}
-
-# Create additional dummy variables if needed (adjust based on your actual data)
-# Add more dummy variables for any additional education levels beyond the reference and the three coded above
-
-# Verify dummy coding
-print("Sex recoding (0=Female, 1=Male):")
-print(table(data_clean$demo_sex_numeric))
-print("Education dummy variables (k-1 coding with High School as reference):")
-cat("High School (reference):", sum(data_clean$demo_edu == "High School (or equivalent)"), "\n")
-cat("College:", sum(data_clean$edu_college), "\n")
-cat("Undergraduate:", sum(data_clean$edu_undergrad), "\n")
-cat("Graduate:", sum(data_clean$edu_grad), "\n")
-```
-
-## Regression Models
-
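As a compact alternative to the per-model definitions in the next chunk, the six bivariate fits can also be built in a loop over IV/DV pairs. This is an illustrative sketch rather than the original code; it assumes the cleaned column names created in the data-prep chunk above.

```{r models-loop-sketch}
#| label: models-loop-sketch
#| echo: true

# Right-hand sides for each predictor set and the two outcome variables
ivs <- c(sex = "demo_sex_numeric",
         age = "demo_age_1",
         edu = "edu_college + edu_undergrad + edu_grad")
dvs <- c(eohiDGEN = "eohiDGEN_mean", ehi_global = "ehi_global_mean")

# Build all DV ~ IV combinations into a named list of lm fits
models_alt <- list()
for (dv_name in names(dvs)) {
  for (iv_name in names(ivs)) {
    f <- as.formula(paste(dvs[[dv_name]], "~", ivs[[iv_name]]))
    models_alt[[paste(iv_name, dv_name, sep = "_")]] <- lm(f, data = data_clean)
  }
}
```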
-```{r models}
-#| label: models
-#| echo: true
-
-# Define the 6 regression models
-models <- list()
-
-# Model 1: demo_sex → eohiDGEN_mean
-models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 2: demo_age (column demo_age_1) → eohiDGEN_mean
-models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1, data = data_clean)
-
-# Model 3: education dummies → eohiDGEN_mean (k-1 coding, HS as reference)
-models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-
-# Model 4: demo_sex → ehi_global_mean
-models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 5: demo_age (column demo_age_1) → ehi_global_mean
-models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1, data = data_clean)
-
-# Model 6: education dummies → ehi_global_mean (k-1 coding, HS as reference)
-models$edu_ehi_global <- lm(ehi_global_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-```
-
-## Assumption Checking Functions
-
-```{r functions}
-#| label: functions
-#| echo: true
-
-# Function to check linearity assumption
-check_linearity <- function(model, model_name) {
-  print(paste("=== LINEARITY CHECK:", model_name, "==="))
-
-  # Residuals vs Fitted plot
-  plot(model, which = 1, main = paste("Linearity:", model_name))
-
-  # Component + residual plot (partial residual plot)
-  crPlots(model, main = paste("Component+Residual Plot:", model_name))
-
-  return(NULL)
-}
-
-# Function to check normality of residuals
-check_normality <- function(model, model_name) {
-  print(paste("=== NORMALITY CHECK:", model_name, "==="))
-
-  # Q-Q plot
-  plot(model, which = 2, main = paste("Q-Q Plot:", model_name))
-
-  # Shapiro-Wilk test
-  residuals <- residuals(model)
-  shapiro_test <- shapiro.test(residuals)
-  print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5)))
-
-  # Kolmogorov-Smirnov test
-  ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals))
-  print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5)))
-
-  # Histogram of residuals
-  hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) +
-    geom_histogram(bins = 30, fill = "lightblue", color = "black") +
-    ggtitle(paste("Residuals Histogram:", model_name)) +
-    theme_minimal()
-  print(hist_plot)
-
-  return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value))
-}
-
-# Function to check homoscedasticity (constant variance)
-check_homoscedasticity <- function(model, model_name) {
-  print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "==="))
-
-  # Scale-Location plot
-  plot(model, which = 3, main = paste("Scale-Location Plot:", model_name))
-
-  # Breusch-Pagan test
-  bp_test <- bptest(model)
-  print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5)))
-
-  # White test (if available)
-  tryCatch({
-    white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2))
-    print(paste("White test p-value:", format(white_test$p.value, digits = 5)))
-  }, error = function(e) {
-    print("White test not available for this model")
-  })
-
-  return(list(bp_p = bp_test$p.value))
-}
-
-# Function to check independence (no autocorrelation)
-check_independence <- function(model, model_name) {
-  print(paste("=== INDEPENDENCE CHECK:", model_name, "==="))
-
-  # Durbin-Watson test
-  dw_test <- durbinWatsonTest(model)
-  print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5)))
-  print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5)))
-
-  # Residuals vs Order plot
-  residuals_vs_order <- ggplot(data.frame(
-    residuals = residuals(model),
-    order = seq_along(residuals(model))
-  ), aes(x = order, y =
residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` - -## Model 1: Sex → EOHI-DGEN Mean - -```{r model1} -#| label: model1 -#| echo: true - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| 
echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean - -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") -``` - -## Model 6: Education → EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## Summary Tables - -### Assumption Violation Summary - -```{r violation-summary} -#| label: violation-summary -#| echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, 
"VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model Comparison Summary - -```{r comparison-summary} -#| label: comparison-summary -#| echo: true - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) -``` - -## Recommendations - -### For Assumption Violations: - -**1. Normality Violations:** -- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox) -- Alternative: Use robust regression methods or bootstrapping - -**2. 
Homoscedasticity Violations:**
-- If violated: Use weighted least squares or robust standard errors
-- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors
-
-**3. Independence Violations:**
-- If violated: Check for clustering or repeated measures structure
-- Alternative: Use mixed-effects models or clustered standard errors
-
-**4. Influential Observations:**
-- If present: Examine these cases for data entry errors
-- Consider: Running analysis with and without influential cases
-- Alternative: Use robust regression methods
-
-**5. Linearity Violations:**
-- If violated: Add polynomial terms or use splines
-- Alternative: Transform predictors or use non-parametric methods
-
----
-
-*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.*
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016150502.qmd b/.history/eohi1/regressions e1 - assumptions_20251016150502.qmd
deleted file mode 100644
index 335fb8a..0000000
--- a/.history/eohi1/regressions e1 - assumptions_20251016150502.qmd
+++ /dev/null
@@ -1,479 +0,0 @@
----
-title: "Regression Analysis - Assumption Checking"
-subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean"
-author: "Irina"
-date: today
-format:
-  html:
-    theme: cosmo
-    toc: true
-    toc-depth: 3
-    code-fold: false
-    code-tools: true
-execute:
-  echo: true
-  warning: false
-  message: false
----
-
-## Setup and Data Preparation
-
-```{r setup}
-#| label: setup
-#| echo: true
-
-# Load required libraries
-library(dplyr) # Must load first for %>% operator
-library(car) # For crPlots and durbinWatsonTest
-library(performance)
-library(see)
-library(ggplot2)
-library(gridExtra)
-library(lmtest) # For bptest
-
-# Set options
-options(scipen = 999)
-
-# Set working directory and load data
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-data <- read.csv("ehi1.csv")
-```
-
-```{r data-prep}
-#| label: data-prep
-#| echo: true
-
-# Check for missing values
-missing_summary <- data %>%
-  select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  summarise_all(~sum(is.na(.)))
-
-print("Missing values check:")
-print(missing_summary)
-
-# Remove rows with missing values
-data_clean <- data %>%
-  select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  filter(complete.cases(.))
-
-print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x ")))
-
-# Recode demo_sex as numeric for regression (0 = Female, 1 = Male)
-data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0)
-
-# Check demo_edu levels and create dummy variables
-print("Education levels:")
-edu_table <- table(data_clean$demo_edu)
-print(edu_table)
-print(paste("Number of education levels:", length(unique(data_clean$demo_edu))))
-
-# Create dummy variables for education (k-1 coding: 7 levels would need 6 dummies; 3 are coded so far)
-# Using High School as reference category (excluded dummy)
-data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0)
-data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0)
-data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0)
-
-# Check if there are other education levels
-edu_levels <- unique(data_clean$demo_edu)
-print("All education levels found:")
-for(i in 1:length(edu_levels)) {
-  print(paste(i, ":", edu_levels[i]))
-}
-
-# Create additional dummy variables if needed (adjust based on your actual data)
-# Add more dummy variables for any additional education levels beyond the reference and the three coded above
-
-# Verify dummy coding
-print("Sex recoding (0=Female, 1=Male):")
-print(table(data_clean$demo_sex_numeric))
-print("Education dummy variables (k-1 coding with High School as reference):")
-cat("High School (reference):", sum(data_clean$demo_edu == "High School (or equivalent)"), "\n")
-cat("College:", sum(data_clean$edu_college), "\n")
-cat("Undergraduate:", sum(data_clean$edu_undergrad), "\n")
-cat("Graduate:", sum(data_clean$edu_grad), "\n")
-```
-
-## Regression Models
-
-```{r models}
-#| label: models
-#| echo: true
-
-# Define the 6 regression models
-models <- list()
-
-# Model 1: demo_sex → eohiDGEN_mean
-models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 2: demo_age (column demo_age_1) → eohiDGEN_mean
-models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1, data = data_clean)
-
-# Model 3: education dummies → eohiDGEN_mean (k-1 coding, HS as reference)
-models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-
-# Model 4: demo_sex → ehi_global_mean
-models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 5: demo_age (column demo_age_1) → ehi_global_mean
-models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1, data = data_clean)
-
-# Model 6: education dummies → ehi_global_mean (k-1 coding, HS as reference)
-models$edu_ehi_global <- lm(ehi_global_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-```
-
-## Assumption Checking Functions
-
-```{r functions}
-#| label: functions
-#| echo: true
-
-# Function to check linearity assumption
-check_linearity <- function(model, model_name) {
-  print(paste("=== LINEARITY CHECK:", model_name, "==="))
-
-  # Residuals vs Fitted plot
-  plot(model, which = 1, main = paste("Linearity:", model_name))
-
-  # Component + residual plot (partial residual plot)
-  crPlots(model, main = paste("Component+Residual Plot:", model_name))
-
-  return(NULL)
-}
-
-# Function to check normality of residuals
-check_normality <- function(model, model_name) {
-  print(paste("=== NORMALITY CHECK:", model_name, "==="))
-
-  # Q-Q plot
-  plot(model, which = 2, main = paste("Q-Q Plot:", model_name))
-
-  # Shapiro-Wilk test
-  residuals <- residuals(model)
-  shapiro_test <- shapiro.test(residuals)
-  print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5)))
-
-  # Kolmogorov-Smirnov test
-  ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals))
-  print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5)))
-
-  # Histogram of residuals
-  hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) +
-    geom_histogram(bins = 30, fill = "lightblue", color = "black") +
-    ggtitle(paste("Residuals Histogram:", model_name)) +
-    theme_minimal()
-  print(hist_plot)
-
-  return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value))
-}
-
-# Function to check homoscedasticity (constant variance)
-check_homoscedasticity <- function(model, model_name) {
-  print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "==="))
-
-  # Scale-Location plot
-  plot(model, which = 3, main = paste("Scale-Location Plot:", model_name))
-
-  # Breusch-Pagan test
-  bp_test <- bptest(model)
-  print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5)))
-
-  # White test (if available)
-  tryCatch({
-    white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2))
-    print(paste("White test p-value:", format(white_test$p.value, digits = 5)))
-  }, error = function(e) {
print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` 
- -## Model 1: Sex → EOHI-DGEN Mean - -```{r model1} -#| label: model1 -#| echo: true - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean - -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") -``` - -## Model 6: Education → EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## 
Summary Tables - -### Assumption Violation Summary - -```{r violation-summary} -#| label: violation-summary -#| echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model Comparison Summary - -```{r comparison-summary} -#| label: comparison-summary -#| echo: true - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - 
ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) -``` - -## Recommendations - -### For Assumption Violations: - -**1. Normality Violations:** -- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox) -- Alternative: Use robust regression methods or bootstrapping - -**2. Homoscedasticity Violations:** -- If violated: Use weighted least squares or robust standard errors -- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors - -**3. Independence Violations:** -- If violated: Check for clustering or repeated measures structure -- Alternative: Use mixed-effects models or clustered standard errors - -**4. Influential Observations:** -- If present: Examine these cases for data entry errors -- Consider: Running analysis with and without influential cases -- Alternative: Use robust regression methods - -**5. Linearity Violations:** -- If violated: Add polynomial terms or use splines -- Alternative: Transform predictors or use non-parametric methods - ---- - -*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.* diff --git a/.history/eohi1/regressions e1 - assumptions_20251016150507.qmd b/.history/eohi1/regressions e1 - assumptions_20251016150507.qmd deleted file mode 100644 index 16f8f4c..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016150507.qmd +++ /dev/null @@ -1,479 +0,0 @@ ---- -title: "Regression Analysis - Assumption Checking" -subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean" -author: "Irina" -date: today -format: - html: - theme: cosmo - toc: true - toc-depth: 3 - code-fold: false - code-tools: true -execute: - echo: true - warning: false - message: false ---- - -## Setup and Data Preparation - -```{r setup} -#| label: setup -#| echo: true - -# Load required libraries -library(dplyr) # Must load first for %>% operator -library(car) -library(performance) -library(see) -library(ggplot2) -library(gridExtra) -library(lmtest) # For bptest and durbinWatsonTest - -# Set options -options(scipen = 999) - -# Set working directory and load data -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") -``` - -```{r data-prep} -#| label: data-prep -#| echo: true - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) - -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and create dummy variables -print("Education levels:") -edu_table <- table(data_clean$demo_edu) -print(edu_table) -print(paste("Number of education levels:", length(unique(data_clean$demo_edu)))) - -# Create dummy variables for education (k-1 coding: 7 levels = 6 dummy 
variables)
-# Using High School as reference category (excluded dummy)
-data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0)
-data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0)
-data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0)
-
-# Check if there are other education levels
-edu_levels <- unique(data_clean$demo_edu)
-print("All education levels found:")
-for (i in seq_along(edu_levels)) {
-  print(paste(i, ":", edu_levels[i]))
-}
-
-# Create additional dummy variables if needed (adjust based on your actual data)
-# Add more dummy variables for any education levels beyond the four coded above
-# (three dummies plus the High School reference)
-
-# Verify dummy coding
-print("Sex recoding (0=Female, 1=Male):")
-print(table(data_clean$demo_sex_numeric))
-print("Education dummy variables (k-1 coding with High School as reference):")
-print(paste("High School (reference):", sum(data_clean$demo_edu == "High School (or equivalent)")))
-print(paste("College:", sum(data_clean$edu_college)))
-print(paste("Undergraduate:", sum(data_clean$edu_undergrad)))
-print(paste("Graduate:", sum(data_clean$edu_grad)))
-```
-
-## Regression Models
-
-```{r models}
-#| label: models
-#| echo: true
-
-# Define the 6 regression models
-models <- list()
-
-# Model 1: demo_sex → eohiDGEN_mean
-models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 2: demo_age_1 → eohiDGEN_mean (demo_age_1 is the age column kept in data_clean)
-models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1, data = data_clean)
-
-# Model 3: education dummies → eohiDGEN_mean (k-1 coding, HS as reference)
-models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-
-# Model 4: demo_sex → ehi_global_mean
-models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 5: demo_age_1 → ehi_global_mean
-models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1, data = data_clean)
-
-# Model 6: education dummies → ehi_global_mean (k-1 coding, HS as reference)
-models$edu_ehi_global <- lm(ehi_global_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-```
-
-## Assumption Checking Functions
-
-```{r functions}
-#| label: functions
-#| echo: true
-
-# Function to check linearity assumption
-check_linearity <- function(model, model_name) {
-  print(paste("=== LINEARITY CHECK:", model_name, "==="))
-
-  # Residuals vs Fitted plot
-  plot(model, which = 1, main = paste("Linearity:", model_name))
-
-  # Component + residual plot (partial residual plot)
-  crPlots(model, main = paste("Component+Residual Plot:", model_name))
-
-  return(NULL)
-}
-
-# Function to check normality of residuals
-check_normality <- function(model, model_name) {
-  print(paste("=== NORMALITY CHECK:", model_name, "==="))
-
-  # Q-Q plot
-  plot(model, which = 2, main = paste("Q-Q Plot:", model_name))
-
-  # Shapiro-Wilk test
-  residuals <- residuals(model)
-  shapiro_test <- shapiro.test(residuals)
-  print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5)))
-
-  # Kolmogorov-Smirnov test (approximate: parameters are estimated from the residuals)
-  ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals))
-  print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5)))
-
-  # Histogram of residuals
-  hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) +
-    geom_histogram(bins = 30, fill = "lightblue", color = "black") +
-    ggtitle(paste("Residuals Histogram:", model_name)) +
-    theme_minimal()
-  print(hist_plot)
-
-  return(list(shapiro_p = shapiro_test$p.value, ks_p =
ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - 
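-  # (Reported below: R-squared, adjusted R-squared, and AIC/BIC. As a reminder,
-  # AIC = 2k - 2*logLik and BIC = k*log(n) - 2*logLik for k estimated parameters
-  # and n observations, so both penalize complexity; BIC penalizes it more
-  # heavily whenever log(n) > 2, i.e. n >= 8. Lower values are better.)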
print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` - -## Model 1: Sex → EOHI-DGEN Mean - -```{r model1} -#| label: model1 -#| echo: true - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean - -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- 
check_linearity(models$age_ehi_global, "Age → EHI-Global") -``` - -## Model 6: Education → EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## Summary Tables - -### Assumption Violation Summary - -```{r violation-summary} -#| label: violation-summary -#| echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model Comparison Summary - -```{r comparison-summary} -#| label: comparison-summary -#| echo: true - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - 
model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) -``` - -## Recommendations - -### For Assumption Violations: - -**1. Normality Violations:** -- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox) -- Alternative: Use robust regression methods or bootstrapping - -**2. Homoscedasticity Violations:** -- If violated: Use weighted least squares or robust standard errors -- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors - -**3. Independence Violations:** -- If violated: Check for clustering or repeated measures structure -- Alternative: Use mixed-effects models or clustered standard errors - -**4. Influential Observations:** -- If present: Examine these cases for data entry errors -- Consider: Running analysis with and without influential cases -- Alternative: Use robust regression methods - -**5. 
Linearity Violations:** -- If violated: Add polynomial terms or use splines -- Alternative: Transform predictors or use non-parametric methods - ---- - -*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.* diff --git a/.history/eohi1/regressions e1 - assumptions_20251016150527.qmd b/.history/eohi1/regressions e1 - assumptions_20251016150527.qmd deleted file mode 100644 index 67eccf0..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016150527.qmd +++ /dev/null @@ -1,485 +0,0 @@ ---- -title: "Regression Analysis - Assumption Checking" -subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean" -author: "Irina" -date: today -format: - html: - theme: cosmo - toc: true - toc-depth: 3 - code-fold: false - code-tools: true -execute: - echo: true - warning: false - message: false ---- - -## Setup and Data Preparation - -```{r setup} -#| label: setup -#| echo: true - -# Load required libraries -library(dplyr) # Must load first for %>% operator -library(car) -library(performance) -library(see) -library(ggplot2) -library(gridExtra) -library(lmtest) # For bptest and durbinWatsonTest - -# Set options -options(scipen = 999) - -# Set working directory and load data -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") -``` - -```{r data-prep} -#| label: data-prep -#| echo: true - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) - -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and create dummy variables -print("Education levels:") -edu_table <- table(data_clean$demo_edu) -print(edu_table) -print(paste("Number of education levels:", length(unique(data_clean$demo_edu)))) - -# Create dummy variables for education (k-1 coding: 7 levels = 6 dummy variables) -# Using High School as reference category (excluded dummy) -data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0) -data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0) -data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0) - -# Check what other education levels exist and create additional dummies -edu_levels <- unique(data_clean$demo_edu) -print("All education levels found:") -for(i in 1:length(edu_levels)) { - print(paste(i, ":", edu_levels[i])) -} - -# Create additional dummy variables for other education levels -# (You'll need to adjust these based on your actual data) -# Example for additional levels - adjust names and conditions as needed: -# data_clean$edu_other1 <- ifelse(data_clean$demo_edu == "Other Level 1", 1, 0) -# data_clean$edu_other2 <- ifelse(data_clean$demo_edu == "Other Level 2", 1, 0) -# data_clean$edu_other3 <- ifelse(data_clean$demo_edu == "Other Level 3", 1, 0) - -# Note: Once you identify all 7 levels, create 6 dummy variables total (k-1) - -# Verify dummy coding -print("Sex recoding (0=Female, 1=Male):") 
-print(table(data_clean$demo_sex_numeric))
-print("Education dummy variables (k-1 coding with High School as reference):")
-print(paste("High School (reference):", sum(data_clean$demo_edu == "High School (or equivalent)")))
-print(paste("College:", sum(data_clean$edu_college)))
-print(paste("Undergraduate:", sum(data_clean$edu_undergrad)))
-print(paste("Graduate:", sum(data_clean$edu_grad)))
-```
-
-## Regression Models
-
-```{r models}
-#| label: models
-#| echo: true
-
-# Define the 6 regression models
-models <- list()
-
-# Model 1: demo_sex → eohiDGEN_mean
-models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 2: demo_age_1 → eohiDGEN_mean (demo_age_1 is the age column kept in data_clean)
-models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1, data = data_clean)
-
-# Model 3: education dummies → eohiDGEN_mean (k-1 coding, HS as reference)
-models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-
-# Model 4: demo_sex → ehi_global_mean
-models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 5: demo_age_1 → ehi_global_mean
-models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1, data = data_clean)
-
-# Model 6: education dummies → ehi_global_mean (k-1 coding, HS as reference)
-models$edu_ehi_global <- lm(ehi_global_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-```
-
-## Assumption Checking Functions
-
-```{r functions}
-#| label: functions
-#| echo: true
-
-# Function to check linearity assumption
-check_linearity <- function(model, model_name) {
-  print(paste("=== LINEARITY CHECK:", model_name, "==="))
-
-  # Residuals vs Fitted plot
-  plot(model, which = 1, main = paste("Linearity:", model_name))
-
-  # Component + residual plot (partial residual plot)
-  crPlots(model, main = paste("Component+Residual Plot:", model_name))
-
-  return(NULL)
-}
-
-# Function to check normality of residuals
-check_normality <- function(model, model_name) {
-  print(paste("=== NORMALITY CHECK:", model_name, "==="))
-
-  # Q-Q plot
-  plot(model, which = 2, main = paste("Q-Q Plot:", model_name))
-
-  # Shapiro-Wilk test
-  residuals <- residuals(model)
-  shapiro_test <- shapiro.test(residuals)
-  print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5)))
-
-  # Kolmogorov-Smirnov test (approximate: parameters are estimated from the residuals)
-  ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals))
-  print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5)))
-
-  # Histogram of residuals
-  hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) +
-    geom_histogram(bins = 30, fill = "lightblue", color = "black") +
-    ggtitle(paste("Residuals Histogram:", model_name)) +
-    theme_minimal()
-  print(hist_plot)
-
-  return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value))
-}
-
-# Function to check homoscedasticity (constant variance)
-check_homoscedasticity <- function(model, model_name) {
-  print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "==="))
-
-  # Scale-Location plot
-  plot(model, which = 3, main = paste("Scale-Location Plot:", model_name))
-
-  # Breusch-Pagan test
-  bp_test <- bptest(model)
-  print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5)))
-
-  # White test (if available)
-  tryCatch({
-    white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2))
-    print(paste("White test p-value:", format(white_test$p.value, digits = 5)))
-  }, error = function(e) {
-    print("White test not available for this model")
-  })
-
-  return(list(bp_p = bp_test$p.value))
-}
-
-# Function to check
independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` - -## Model 1: Sex → EOHI-DGEN Mean - -```{r model1} -#| label: model1 -#| echo: true - -model1_summary <- 
get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean - -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") -``` - -## Model 6: Education → EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## Summary Tables - -### Assumption Violation Summary - -```{r violation-summary} -#| label: violation-summary -#| 
echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model Comparison Summary - -```{r comparison-summary} -#| label: comparison-summary -#| echo: true - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - 
ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) -``` - -## Recommendations - -### For Assumption Violations: - -**1. Normality Violations:** -- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox) -- Alternative: Use robust regression methods or bootstrapping - -**2. Homoscedasticity Violations:** -- If violated: Use weighted least squares or robust standard errors -- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors - -**3. Independence Violations:** -- If violated: Check for clustering or repeated measures structure -- Alternative: Use mixed-effects models or clustered standard errors - -**4. Influential Observations:** -- If present: Examine these cases for data entry errors -- Consider: Running analysis with and without influential cases -- Alternative: Use robust regression methods - -**5. Linearity Violations:** -- If violated: Add polynomial terms or use splines -- Alternative: Transform predictors or use non-parametric methods - ---- - -*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.* diff --git a/.history/eohi1/regressions e1 - assumptions_20251016150545.qmd b/.history/eohi1/regressions e1 - assumptions_20251016150545.qmd deleted file mode 100644 index 67eccf0..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016150545.qmd +++ /dev/null @@ -1,485 +0,0 @@ ---- -title: "Regression Analysis - Assumption Checking" -subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean" -author: "Irina" -date: today -format: - html: - theme: cosmo - toc: true - toc-depth: 3 - code-fold: false - code-tools: true -execute: - echo: true - warning: false - message: false ---- - -## Setup and Data Preparation - -```{r setup} -#| label: setup -#| echo: true - -# Load required libraries -library(dplyr) # Must load first for %>% operator -library(car) -library(performance) -library(see) -library(ggplot2) -library(gridExtra) -library(lmtest) # For bptest and durbinWatsonTest - -# Set options -options(scipen = 999) - -# Set working directory and load data -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") -``` - -```{r data-prep} -#| label: data-prep -#| echo: true - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) - -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and create dummy variables -print("Education levels:") -edu_table <- table(data_clean$demo_edu) -print(edu_table) -print(paste("Number of education levels:", length(unique(data_clean$demo_edu)))) - -# Create dummy variables for education (k-1 coding: 7 levels = 6 dummy variables) -# Using High School as reference category (excluded dummy) 
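-# (Alternative sketch: R builds the same k-1 dummies automatically from a
-# releveled factor; demo_edu_f is a hypothetical helper column, commented out
-# so the manual coding below stays the document's approach.)
-# data_clean$demo_edu_f <- relevel(factor(data_clean$demo_edu),
-#                                  ref = "High School (or equivalent)")
-# lm(eohiDGEN_mean ~ demo_edu_f, data = data_clean)  # equivalent fit to Model 3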
-data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0)
-data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0)
-data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0)
-
-# Check what other education levels exist and create additional dummies
-edu_levels <- unique(data_clean$demo_edu)
-print("All education levels found:")
-for (i in seq_along(edu_levels)) {
-  print(paste(i, ":", edu_levels[i]))
-}
-
-# Create additional dummy variables for other education levels
-# (You'll need to adjust these based on your actual data)
-# Example for additional levels - adjust names and conditions as needed:
-# data_clean$edu_other1 <- ifelse(data_clean$demo_edu == "Other Level 1", 1, 0)
-# data_clean$edu_other2 <- ifelse(data_clean$demo_edu == "Other Level 2", 1, 0)
-# data_clean$edu_other3 <- ifelse(data_clean$demo_edu == "Other Level 3", 1, 0)
-
-# Note: Once you identify all 7 levels, create 6 dummy variables total (k-1)
-
-# Verify dummy coding
-print("Sex recoding (0=Female, 1=Male):")
-print(table(data_clean$demo_sex_numeric))
-print("Education dummy variables (k-1 coding with High School as reference):")
-print(paste("High School (reference):", sum(data_clean$demo_edu == "High School (or equivalent)")))
-print(paste("College:", sum(data_clean$edu_college)))
-print(paste("Undergraduate:", sum(data_clean$edu_undergrad)))
-print(paste("Graduate:", sum(data_clean$edu_grad)))
-```
-
-## Regression Models
-
-```{r models}
-#| label: models
-#| echo: true
-
-# Define the 6 regression models
-models <- list()
-
-# Model 1: demo_sex → eohiDGEN_mean
-models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 2: demo_age_1 → eohiDGEN_mean (demo_age_1 is the age column kept in data_clean)
-models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1, data = data_clean)
-
-# Model 3: education dummies → eohiDGEN_mean (k-1 coding, HS as reference)
-models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-
-# Model 4: demo_sex → ehi_global_mean
-models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 5: demo_age_1 → ehi_global_mean
-models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1, data = data_clean)
-
-# Model 6: education dummies → ehi_global_mean (k-1 coding, HS as reference)
-models$edu_ehi_global <- lm(ehi_global_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-```
-
-## Assumption Checking Functions
-
-```{r functions}
-#| label: functions
-#| echo: true
-
-# Function to check linearity assumption
-check_linearity <- function(model, model_name) {
-  print(paste("=== LINEARITY CHECK:", model_name, "==="))
-
-  # Residuals vs Fitted plot
-  plot(model, which = 1, main = paste("Linearity:", model_name))
-
-  # Component + residual plot (partial residual plot)
-  crPlots(model, main = paste("Component+Residual Plot:", model_name))
-
-  return(NULL)
-}
-
-# Function to check normality of residuals
-check_normality <- function(model, model_name) {
-  print(paste("=== NORMALITY CHECK:", model_name, "==="))
-
-  # Q-Q plot
-  plot(model, which = 2, main = paste("Q-Q Plot:", model_name))
-
-  # Shapiro-Wilk test
-  residuals <- residuals(model)
-  shapiro_test <- shapiro.test(residuals)
-  print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5)))
-
-  # Kolmogorov-Smirnov test (approximate: parameters are estimated from the residuals)
-  ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals))
-  print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5)))
-
-  # Histogram of
residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", 
paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` - -## Model 1: Sex → EOHI-DGEN Mean - -```{r model1} -#| label: model1 -#| echo: true - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean - -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") 
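-# (If the preceding checks flag heteroscedasticity, the Recommendations section
-# suggests heteroscedasticity-consistent standard errors; a commented sketch
-# only, since the sandwich package is not loaded in this document's setup:)
-# lmtest::coeftest(models$age_ehi_global,
-#                  vcov = sandwich::vcovHC(models$age_ehi_global, type = "HC3"))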
-model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") -``` - -## Model 6: Education → EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## Summary Tables - -### Assumption Violation Summary - -```{r violation-summary} -#| label: violation-summary -#| echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model Comparison Summary - -```{r comparison-summary} -#| label: comparison-summary -#| echo: true - -# Create model comparison table -comparison_table <- 
data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) -``` - -## Recommendations - -### For Assumption Violations: - -**1. Normality Violations:** -- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox) -- Alternative: Use robust regression methods or bootstrapping - -**2. Homoscedasticity Violations:** -- If violated: Use weighted least squares or robust standard errors -- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors - -**3. Independence Violations:** -- If violated: Check for clustering or repeated measures structure -- Alternative: Use mixed-effects models or clustered standard errors - -**4. Influential Observations:** -- If present: Examine these cases for data entry errors -- Consider: Running analysis with and without influential cases -- Alternative: Use robust regression methods - -**5. 
Linearity Violations:** -- If violated: Add polynomial terms or use splines -- Alternative: Transform predictors or use non-parametric methods - ---- - -*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.* diff --git a/.history/eohi1/regressions e1 - assumptions_20251016150558.qmd b/.history/eohi1/regressions e1 - assumptions_20251016150558.qmd deleted file mode 100644 index 67eccf0..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016150558.qmd +++ /dev/null @@ -1,485 +0,0 @@ ---- -title: "Regression Analysis - Assumption Checking" -subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean" -author: "Irina" -date: today -format: - html: - theme: cosmo - toc: true - toc-depth: 3 - code-fold: false - code-tools: true -execute: - echo: true - warning: false - message: false ---- - -## Setup and Data Preparation - -```{r setup} -#| label: setup -#| echo: true - -# Load required libraries -library(dplyr) # Must load first for %>% operator -library(car) -library(performance) -library(see) -library(ggplot2) -library(gridExtra) -library(lmtest) # For bptest and durbinWatsonTest - -# Set options -options(scipen = 999) - -# Set working directory and load data -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") -``` - -```{r data-prep} -#| label: data-prep -#| echo: true - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) - -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and create dummy variables -print("Education levels:") -edu_table <- table(data_clean$demo_edu) -print(edu_table) -print(paste("Number of education levels:", length(unique(data_clean$demo_edu)))) - -# Create dummy variables for education (k-1 coding: 7 levels = 6 dummy variables) -# Using High School as reference category (excluded dummy) -data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0) -data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0) -data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0) - -# Check what other education levels exist and create additional dummies -edu_levels <- unique(data_clean$demo_edu) -print("All education levels found:") -for(i in 1:length(edu_levels)) { - print(paste(i, ":", edu_levels[i])) -} - -# Create additional dummy variables for other education levels -# (You'll need to adjust these based on your actual data) -# Example for additional levels - adjust names and conditions as needed: -# data_clean$edu_other1 <- ifelse(data_clean$demo_edu == "Other Level 1", 1, 0) -# data_clean$edu_other2 <- ifelse(data_clean$demo_edu == "Other Level 2", 1, 0) -# data_clean$edu_other3 <- ifelse(data_clean$demo_edu == "Other Level 3", 1, 0) - -# Note: Once you identify all 7 levels, create 6 dummy variables total (k-1) - -# Verify dummy coding -print("Sex recoding (0=Female, 1=Male):") 
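-# A less error-prone alternative (sketch only, not used in this analysis): make
-# demo_edu a factor with the reference level set explicitly and let lm() build
-# the k-1 dummies itself, instead of maintaining the ifelse() columns by hand.
-# data_clean$demo_edu_f <- relevel(factor(data_clean$demo_edu),
-#                                  ref = "High School (or equivalent)")
-# lm(eohiDGEN_mean ~ demo_edu_f, data = data_clean)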
-print(table(data_clean$demo_sex_numeric))
-print("Education dummy variables (k-1 coding with High School as reference):")
-# print() takes a single object, so label and count are combined with paste()
-print(paste("High School (reference):", sum(data_clean$demo_edu == "High School (or equivalent)")))
-print(paste("College:", sum(data_clean$edu_college)))
-print(paste("Undergraduate:", sum(data_clean$edu_undergrad)))
-print(paste("Graduate:", sum(data_clean$edu_grad)))
-```
-
-## Regression Models
-
-```{r models}
-#| label: models
-#| echo: true
-
-# Define the 6 regression models
-models <- list()
-
-# Model 1: demo_sex → eohiDGEN_mean
-models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 2: demo_age_1 → eohiDGEN_mean (demo_age_1 is the age column kept in data_clean)
-models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1, data = data_clean)
-
-# Model 3: education dummies → eohiDGEN_mean (k-1 coding, HS as reference)
-models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-
-# Model 4: demo_sex → ehi_global_mean
-models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 5: demo_age_1 → ehi_global_mean
-models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1, data = data_clean)
-
-# Model 6: education dummies → ehi_global_mean (k-1 coding, HS as reference)
-models$edu_ehi_global <- lm(ehi_global_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-```
-
-## Assumption Checking Functions
-
-```{r functions}
-#| label: functions
-#| echo: true
-
-# Function to check linearity assumption
-check_linearity <- function(model, model_name) {
-  print(paste("=== LINEARITY CHECK:", model_name, "==="))
-
-  # Residuals vs Fitted plot
-  plot(model, which = 1, main = paste("Linearity:", model_name))
-
-  # Component + residual plot (partial residual plot)
-  crPlots(model, main = paste("Component+Residual Plot:", model_name))
-
-  return(NULL)
-}
-
-# Function to check normality of residuals
-check_normality <- function(model, model_name) {
-  print(paste("=== NORMALITY CHECK:", model_name, "==="))
-
-  # Q-Q plot
-  plot(model, which = 2, main = paste("Q-Q Plot:", model_name))
-
-  # Shapiro-Wilk test
-  residuals <- residuals(model)
-  shapiro_test <- shapiro.test(residuals)
-  print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5)))
-
-  # Kolmogorov-Smirnov test (parameters estimated from the residuals, so approximate)
-  ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals))
-  print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5)))
-
-  # Histogram of residuals
-  hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) +
-    geom_histogram(bins = 30, fill = "lightblue", color = "black") +
-    ggtitle(paste("Residuals Histogram:", model_name)) +
-    theme_minimal()
-  print(hist_plot)
-
-  return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value))
-}
-
-# Function to check homoscedasticity (constant variance)
-check_homoscedasticity <- function(model, model_name) {
-  print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "==="))
-
-  # Scale-Location plot
-  plot(model, which = 3, main = paste("Scale-Location Plot:", model_name))
-
-  # Breusch-Pagan test
-  bp_test <- bptest(model)
-  print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5)))
-
-  # White test (if available)
-  tryCatch({
-    white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2))
-    print(paste("White test p-value:", format(white_test$p.value, digits = 5)))
-  }, error = function(e) {
-    print("White test not available for this model")
-  })
-
-  return(list(bp_p = bp_test$p.value))
-}
-
-# Function to check
independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` - -## Model 1: Sex → EOHI-DGEN Mean - -```{r model1} -#| label: model1 -#| echo: true - -model1_summary <- 
get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean - -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") -``` - -## Model 6: Education → EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## Summary Tables - -### Assumption Violation Summary - -```{r violation-summary} -#| label: violation-summary -#| 
echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model Comparison Summary - -```{r comparison-summary} -#| label: comparison-summary -#| echo: true - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - 
ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) -``` - -## Recommendations - -### For Assumption Violations: - -**1. Normality Violations:** -- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox) -- Alternative: Use robust regression methods or bootstrapping - -**2. Homoscedasticity Violations:** -- If violated: Use weighted least squares or robust standard errors -- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors - -**3. Independence Violations:** -- If violated: Check for clustering or repeated measures structure -- Alternative: Use mixed-effects models or clustered standard errors - -**4. Influential Observations:** -- If present: Examine these cases for data entry errors -- Consider: Running analysis with and without influential cases -- Alternative: Use robust regression methods - -**5. Linearity Violations:** -- If violated: Add polynomial terms or use splines -- Alternative: Transform predictors or use non-parametric methods - ---- - -*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.* diff --git a/.history/eohi1/regressions e1 - assumptions_20251016154019.qmd b/.history/eohi1/regressions e1 - assumptions_20251016154019.qmd deleted file mode 100644 index ef9b009..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016154019.qmd +++ /dev/null @@ -1,489 +0,0 @@ ---- -title: "Regression Analysis - Assumption Checking" -subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean" -author: "Irina" -date: today -format: - html: - theme: cosmo - toc: true - toc-depth: 3 - code-fold: false - code-tools: true - fig-width: 8 - fig-height: 6 -execute: - echo: true - warning: false - message: false - eval: true - output: true ---- - -## Setup and Data Preparation - -```{r setup} -#| label: setup -#| echo: true - -# Load required libraries -library(dplyr) # Must load first for %>% operator -library(car) -library(performance) -library(see) -library(ggplot2) -library(gridExtra) -library(lmtest) # For bptest and durbinWatsonTest - -# Set options -options(scipen = 999) - -# Set working directory and load data -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") -``` - -```{r data-prep} -#| label: data-prep -#| echo: true - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) - -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and create dummy variables -print("Education levels:") -edu_table <- table(data_clean$demo_edu) -print(edu_table) -print(paste("Number of education levels:", length(unique(data_clean$demo_edu)))) - -# Create dummy variables for education (k-1 coding: 7 levels = 6 dummy variables) -# Using 
High School as reference category (excluded dummy)
-data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0)
-data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0)
-data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0)
-
-# Check what other education levels exist and create additional dummies
-edu_levels <- unique(data_clean$demo_edu)
-print("All education levels found:")
-for(i in 1:length(edu_levels)) {
-  print(paste(i, ":", edu_levels[i]))
-}
-
-# Create additional dummy variables for other education levels
-# (You'll need to adjust these based on your actual data)
-# Example for additional levels - adjust names and conditions as needed:
-# data_clean$edu_other1 <- ifelse(data_clean$demo_edu == "Other Level 1", 1, 0)
-# data_clean$edu_other2 <- ifelse(data_clean$demo_edu == "Other Level 2", 1, 0)
-# data_clean$edu_other3 <- ifelse(data_clean$demo_edu == "Other Level 3", 1, 0)
-
-# Note: Once you identify all 7 levels, create 6 dummy variables total (k-1)
-
-# Verify dummy coding
-print("Sex recoding (0=Female, 1=Male):")
-print(table(data_clean$demo_sex_numeric))
-print("Education dummy variables (k-1 coding with High School as reference):")
-# print() takes a single object, so label and count are combined with paste()
-print(paste("High School (reference):", sum(data_clean$demo_edu == "High School (or equivalent)")))
-print(paste("College:", sum(data_clean$edu_college)))
-print(paste("Undergraduate:", sum(data_clean$edu_undergrad)))
-print(paste("Graduate:", sum(data_clean$edu_grad)))
-```
-
-## Regression Models
-
-```{r models}
-#| label: models
-#| echo: true
-
-# Define the 6 regression models
-models <- list()
-
-# Model 1: demo_sex → eohiDGEN_mean
-models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 2: demo_age_1 → eohiDGEN_mean (demo_age_1 is the age column kept in data_clean)
-models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1, data = data_clean)
-
-# Model 3: education dummies → eohiDGEN_mean (k-1 coding, HS as reference)
-models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-
-# Model 4: demo_sex → ehi_global_mean
-models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 5: demo_age_1 → ehi_global_mean
-models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1, data = data_clean)
-
-# Model 6: education dummies → ehi_global_mean (k-1 coding, HS as reference)
-models$edu_ehi_global <- lm(ehi_global_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-```
-
-## Assumption Checking Functions
-
-```{r functions}
-#| label: functions
-#| echo: true
-
-# Function to check linearity assumption
-check_linearity <- function(model, model_name) {
-  print(paste("=== LINEARITY CHECK:", model_name, "==="))
-
-  # Residuals vs Fitted plot
-  plot(model, which = 1, main = paste("Linearity:", model_name))
-
-  # Component + residual plot (partial residual plot)
-  crPlots(model, main = paste("Component+Residual Plot:", model_name))
-
-  return(NULL)
-}
-
-# Function to check normality of residuals
-check_normality <- function(model, model_name) {
-  print(paste("=== NORMALITY CHECK:", model_name, "==="))
-
-  # Q-Q plot
-  plot(model, which = 2, main = paste("Q-Q Plot:", model_name))
-
-  # Shapiro-Wilk test
-  residuals <- residuals(model)
-  shapiro_test <- shapiro.test(residuals)
-  print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5)))
-
-  # Kolmogorov-Smirnov test (parameters estimated from the residuals, so approximate)
-  ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals))
-  print(paste("Kolmogorov-Smirnov test p-value:",
format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) 
> 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` - -## Model 1: Sex → EOHI-DGEN Mean - -```{r model1} -#| label: model1 -#| echo: true - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean - -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- 
check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") -``` - -## Model 6: Education → EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## Summary Tables - -### Assumption Violation Summary - -```{r violation-summary} -#| label: violation-summary -#| echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model Comparison Summary - -```{r comparison-summary} -#| label: comparison-summary -#| echo: 
true - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) -``` - -## Recommendations - -### For Assumption Violations: - -**1. Normality Violations:** -- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox) -- Alternative: Use robust regression methods or bootstrapping - -**2. Homoscedasticity Violations:** -- If violated: Use weighted least squares or robust standard errors -- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors - -**3. Independence Violations:** -- If violated: Check for clustering or repeated measures structure -- Alternative: Use mixed-effects models or clustered standard errors - -**4. Influential Observations:** -- If present: Examine these cases for data entry errors -- Consider: Running analysis with and without influential cases -- Alternative: Use robust regression methods - -**5. 
Linearity Violations:** -- If violated: Add polynomial terms or use splines -- Alternative: Transform predictors or use non-parametric methods - ---- - -*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.* diff --git a/.history/eohi1/regressions e1 - assumptions_20251016154024.qmd b/.history/eohi1/regressions e1 - assumptions_20251016154024.qmd deleted file mode 100644 index b8362ec..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016154024.qmd +++ /dev/null @@ -1,490 +0,0 @@ ---- -title: "Regression Analysis - Assumption Checking" -subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean" -author: "Irina" -date: today -format: - html: - theme: cosmo - toc: true - toc-depth: 3 - code-fold: false - code-tools: true - fig-width: 8 - fig-height: 6 -execute: - echo: true - warning: false - message: false - eval: true - output: true ---- - -## Setup and Data Preparation - -```{r setup} -#| label: setup -#| echo: true -#| output: true - -# Load required libraries -library(dplyr) # Must load first for %>% operator -library(car) -library(performance) -library(see) -library(ggplot2) -library(gridExtra) -library(lmtest) # For bptest and durbinWatsonTest - -# Set options -options(scipen = 999) - -# Set working directory and load data -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") -``` - -```{r data-prep} -#| label: data-prep -#| echo: true - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) - -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and create dummy variables -print("Education levels:") -edu_table <- table(data_clean$demo_edu) -print(edu_table) -print(paste("Number of education levels:", length(unique(data_clean$demo_edu)))) - -# Create dummy variables for education (k-1 coding: 7 levels = 6 dummy variables) -# Using High School as reference category (excluded dummy) -data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0) -data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0) -data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0) - -# Check what other education levels exist and create additional dummies -edu_levels <- unique(data_clean$demo_edu) -print("All education levels found:") -for(i in 1:length(edu_levels)) { - print(paste(i, ":", edu_levels[i])) -} - -# Create additional dummy variables for other education levels -# (You'll need to adjust these based on your actual data) -# Example for additional levels - adjust names and conditions as needed: -# data_clean$edu_other1 <- ifelse(data_clean$demo_edu == "Other Level 1", 1, 0) -# data_clean$edu_other2 <- ifelse(data_clean$demo_edu == "Other Level 2", 1, 0) -# data_clean$edu_other3 <- ifelse(data_clean$demo_edu == "Other Level 3", 1, 0) - -# Note: Once you identify all 7 levels, create 6 dummy variables total (k-1) - -# Verify dummy 
coding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education dummy variables (k-1 coding with High School as reference):") -print("High School (reference):", sum(data_clean$demo_edu == "High School (or equivalent)")) -print("College:", sum(data_clean$edu_college)) -print("Undergraduate:", sum(data_clean$edu_undergrad)) -print("Graduate:", sum(data_clean$edu_grad)) -``` - -## Regression Models - -```{r models} -#| label: models -#| echo: true - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age → eohiDGEN_mean -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean) - -# Model 3: education dummies → eohiDGEN_mean (k-1 coding, HS as reference) -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age → ehi_global_mean -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age, data = data_clean) - -# Model 6: education dummies → ehi_global_mean (k-1 coding, HS as reference) -models$edu_ehi_global <- lm(ehi_global_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean) -``` - -## Assumption Checking Functions - -```{r functions} -#| label: functions -#| echo: true - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = 
bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` - -## Model 1: Sex → EOHI-DGEN Mean - -```{r model1} -#| label: model1 -#| 
echo: true - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean - -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") -``` - -## Model 6: Education → EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## Summary Tables - -### Assumption Violation Summary - -```{r violation-summary} 
-#| label: violation-summary -#| echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model Comparison Summary - -```{r comparison-summary} -#| label: comparison-summary -#| echo: true - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - 
ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) -``` - -## Recommendations - -### For Assumption Violations: - -**1. Normality Violations:** -- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox) -- Alternative: Use robust regression methods or bootstrapping - -**2. Homoscedasticity Violations:** -- If violated: Use weighted least squares or robust standard errors -- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors - -**3. Independence Violations:** -- If violated: Check for clustering or repeated measures structure -- Alternative: Use mixed-effects models or clustered standard errors - -**4. Influential Observations:** -- If present: Examine these cases for data entry errors -- Consider: Running analysis with and without influential cases -- Alternative: Use robust regression methods - -**5. Linearity Violations:** -- If violated: Add polynomial terms or use splines -- Alternative: Transform predictors or use non-parametric methods - ---- - -*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.* diff --git a/.history/eohi1/regressions e1 - assumptions_20251016154032.qmd b/.history/eohi1/regressions e1 - assumptions_20251016154032.qmd deleted file mode 100644 index b19fd37..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016154032.qmd +++ /dev/null @@ -1,491 +0,0 @@ ---- -title: "Regression Analysis - Assumption Checking" -subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean" -author: "Irina" -date: today -format: - html: - theme: cosmo - toc: true - toc-depth: 3 - code-fold: false - code-tools: true - fig-width: 8 - fig-height: 6 -execute: - echo: true - warning: false - message: false - eval: true - output: true ---- - -## Setup and Data Preparation - -```{r setup} -#| label: setup -#| echo: true -#| output: true - -# Load required libraries -library(dplyr) # Must load first for %>% operator -library(car) -library(performance) -library(see) -library(ggplot2) -library(gridExtra) -library(lmtest) # For bptest; durbinWatsonTest comes from car, not lmtest - -# Set options -options(scipen = 999) - -# Set working directory and load data -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") -``` - -```{r data-prep} -#| label: data-prep -#| echo: true -#| output: true - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) - -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and create dummy variables -print("Education levels:") -edu_table <- table(data_clean$demo_edu) -print(edu_table) -print(paste("Number of education levels:", length(unique(data_clean$demo_edu)))) - -# Create dummy variables for education (k-1 coding: 7
levels = 6 dummy variables) -# Using High School as reference category (excluded dummy) -data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0) -data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0) -data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0) - -# Check what other education levels exist and create additional dummies -edu_levels <- unique(data_clean$demo_edu) -print("All education levels found:") -for(i in 1:length(edu_levels)) { - print(paste(i, ":", edu_levels[i])) -} - -# Create additional dummy variables for other education levels -# (You'll need to adjust these based on your actual data) -# Example for additional levels - adjust names and conditions as needed: -# data_clean$edu_other1 <- ifelse(data_clean$demo_edu == "Other Level 1", 1, 0) -# data_clean$edu_other2 <- ifelse(data_clean$demo_edu == "Other Level 2", 1, 0) -# data_clean$edu_other3 <- ifelse(data_clean$demo_edu == "Other Level 3", 1, 0) - -# Note: Once you identify all 7 levels, create 6 dummy variables total (k-1) - -# Verify dummy coding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education dummy variables (k-1 coding with High School as reference):") -print(paste("High School (reference):", sum(data_clean$demo_edu == "High School (or equivalent)"))) -print(paste("College:", sum(data_clean$edu_college))) -print(paste("Undergraduate:", sum(data_clean$edu_undergrad))) -print(paste("Graduate:", sum(data_clean$edu_grad))) -``` - -## Regression Models - -```{r models} -#| label: models -#| echo: true - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1, data = data_clean) - -# Model 3: education dummies → eohiDGEN_mean (k-1 coding, HS as reference) -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1, data = data_clean) - -# Model 6: education dummies → ehi_global_mean (k-1 coding, HS as reference) -models$edu_ehi_global <- lm(ehi_global_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean) -``` - -## Assumption Checking Functions - -```{r functions} -#| label: functions -#| echo: true - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) -
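-# Note: estimating mean() and sd() from the residuals themselves makes this
-# KS p-value anti-conservative (the Lilliefors problem). If the nortest
-# package is available, nortest::lillie.test(residuals) applies the corrected
-# critical values.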
print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse 
= ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` - -## Model 1: Sex → EOHI-DGEN Mean - -```{r model1} -#| label: model1 -#| echo: true - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean - -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → 
EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") -``` - -## Model 6: Education → EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## Summary Tables - -### Assumption Violation Summary - -```{r violation-summary} -#| label: violation-summary -#| echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model Comparison Summary - -```{r comparison-summary} -#| 
label: comparison-summary -#| echo: true - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) -``` - -## Recommendations - -### For Assumption Violations: - -**1. Normality Violations:** -- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox) -- Alternative: Use robust regression methods or bootstrapping - -**2. Homoscedasticity Violations:** -- If violated: Use weighted least squares or robust standard errors -- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors - -**3. Independence Violations:** -- If violated: Check for clustering or repeated measures structure -- Alternative: Use mixed-effects models or clustered standard errors - -**4. Influential Observations:** -- If present: Examine these cases for data entry errors -- Consider: Running analysis with and without influential cases -- Alternative: Use robust regression methods - -**5. 
Linearity Violations:** -- If violated: Add polynomial terms or use splines -- Alternative: Transform predictors or use non-parametric methods - ---- - -*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.* diff --git a/.history/eohi1/regressions e1 - assumptions_20251016154037.qmd b/.history/eohi1/regressions e1 - assumptions_20251016154037.qmd deleted file mode 100644 index ff233ba..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016154037.qmd +++ /dev/null @@ -1,492 +0,0 @@ ---- -title: "Regression Analysis - Assumption Checking" -subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean" -author: "Irina" -date: today -format: - html: - theme: cosmo - toc: true - toc-depth: 3 - code-fold: false - code-tools: true - fig-width: 8 - fig-height: 6 -execute: - echo: true - warning: false - message: false - eval: true - output: true ---- - -## Setup and Data Preparation - -```{r setup} -#| label: setup -#| echo: true -#| output: true - -# Load required libraries -library(dplyr) # Must load first for %>% operator -library(car) -library(performance) -library(see) -library(ggplot2) -library(gridExtra) -library(lmtest) # For bptest; durbinWatsonTest comes from car, not lmtest - -# Set options -options(scipen = 999) - -# Set working directory and load data -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") -``` - -```{r data-prep} -#| label: data-prep -#| echo: true -#| output: true - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) - -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and create dummy variables -print("Education levels:") -edu_table <- table(data_clean$demo_edu) -print(edu_table) -print(paste("Number of education levels:", length(unique(data_clean$demo_edu)))) - -# Create dummy variables for education (k-1 coding: 7 levels = 6 dummy variables) -# Using High School as reference category (excluded dummy) -data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0) -data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0) -data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0) - -# Check what other education levels exist and create additional dummies -edu_levels <- unique(data_clean$demo_edu) -print("All education levels found:") -for(i in 1:length(edu_levels)) { - print(paste(i, ":", edu_levels[i])) -} - -# Create additional dummy variables for other education levels -# (You'll need to adjust these based on your actual data) -# Example for additional levels - adjust names and conditions as needed: -# data_clean$edu_other1 <- ifelse(data_clean$demo_edu == "Other Level 1", 1, 0) -# data_clean$edu_other2 <- ifelse(data_clean$demo_edu == "Other Level 2", 1, 0) -# data_clean$edu_other3 <- ifelse(data_clean$demo_edu == "Other Level 3", 1, 0) - -# Note: Once you identify all 7 levels, create 6 dummy variables total (k-1)
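-# Alternative sketch (not the approach used above): relevel a factor and let
-# lm() generate the k-1 dummies itself, so unlisted education levels are never
-# silently dropped. The column name demo_edu_factor is illustrative, and the
-# ref string must match the CSV spelling exactly.
-data_clean$demo_edu_factor <- relevel(factor(data_clean$demo_edu),
-                                      ref = "High School (or equivalent)")
-# e.g. lm(eohiDGEN_mean ~ demo_edu_factor, data = data_clean) then fits the
-# same model with High School as the baseline.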
- -# Verify dummy coding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education dummy variables (k-1 coding with High School as reference):") -print(paste("High School (reference):", sum(data_clean$demo_edu == "High School (or equivalent)"))) -print(paste("College:", sum(data_clean$edu_college))) -print(paste("Undergraduate:", sum(data_clean$edu_undergrad))) -print(paste("Graduate:", sum(data_clean$edu_grad))) -``` - -## Regression Models - -```{r models} -#| label: models -#| echo: true -#| output: true - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1, data = data_clean) - -# Model 3: education dummies → eohiDGEN_mean (k-1 coding, HS as reference) -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1, data = data_clean) - -# Model 6: education dummies → ehi_global_mean (k-1 coding, HS as reference) -models$edu_ehi_global <- lm(ehi_global_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean) -``` - -## Assumption Checking Functions - -```{r functions} -#| label: functions -#| echo: true - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) -crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", 
model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` - -## Model 1: Sex → EOHI-DGEN Mean - 
-```{r model1} -#| label: model1 -#| echo: true - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean - -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") -``` - -## Model 6: Education → EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## Summary Tables - -### Assumption Violation 
Summary - -```{r violation-summary} -#| label: violation-summary -#| echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model Comparison Summary - -```{r comparison-summary} -#| label: comparison-summary -#| echo: true - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 
4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) -``` - -## Recommendations - -### For Assumption Violations: - -**1. Normality Violations:** -- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox) -- Alternative: Use robust regression methods or bootstrapping - -**2. Homoscedasticity Violations:** -- If violated: Use weighted least squares or robust standard errors -- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors - -**3. Independence Violations:** -- If violated: Check for clustering or repeated measures structure -- Alternative: Use mixed-effects models or clustered standard errors - -**4. Influential Observations:** -- If present: Examine these cases for data entry errors -- Consider: Running analysis with and without influential cases -- Alternative: Use robust regression methods - -**5. Linearity Violations:** -- If violated: Add polynomial terms or use splines -- Alternative: Transform predictors or use non-parametric methods - ---- - -*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.* diff --git a/.history/eohi1/regressions e1 - assumptions_20251016154047.qmd b/.history/eohi1/regressions e1 - assumptions_20251016154047.qmd deleted file mode 100644 index ff233ba..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016154047.qmd +++ /dev/null @@ -1,492 +0,0 @@ ---- -title: "Regression Analysis - Assumption Checking" -subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean" -author: "Irina" -date: today -format: - html: - theme: cosmo - toc: true - toc-depth: 3 - code-fold: false - code-tools: true - fig-width: 8 - fig-height: 6 -execute: - echo: true - warning: false - message: false - eval: true - output: true ---- - -## Setup and Data Preparation - -```{r setup} -#| label: setup -#| echo: true -#| output: true - -# Load required libraries -library(dplyr) # Must load first for %>% operator -library(car) -library(performance) -library(see) -library(ggplot2) -library(gridExtra) -library(lmtest) # For bptest; durbinWatsonTest comes from car, not lmtest - -# Set options -options(scipen = 999) - -# Set working directory and load data -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") -``` - -```{r data-prep} -#| label: data-prep -#| echo: true -#| output: true - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) - -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and create dummy variables -print("Education levels:") -edu_table <- table(data_clean$demo_edu) -print(edu_table) -print(paste("Number of education levels:", length(unique(data_clean$demo_edu)))) - -# Create dummy variables for
education (k-1 coding: 7 levels = 6 dummy variables) -# Using High School as reference category (excluded dummy) -data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0) -data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0) -data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0) - -# Check what other education levels exist and create additional dummies -edu_levels <- unique(data_clean$demo_edu) -print("All education levels found:") -for(i in 1:length(edu_levels)) { - print(paste(i, ":", edu_levels[i])) -} - -# Create additional dummy variables for other education levels -# (You'll need to adjust these based on your actual data) -# Example for additional levels - adjust names and conditions as needed: -# data_clean$edu_other1 <- ifelse(data_clean$demo_edu == "Other Level 1", 1, 0) -# data_clean$edu_other2 <- ifelse(data_clean$demo_edu == "Other Level 2", 1, 0) -# data_clean$edu_other3 <- ifelse(data_clean$demo_edu == "Other Level 3", 1, 0) - -# Note: Once you identify all 7 levels, create 6 dummy variables total (k-1) - -# Verify dummy coding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education dummy variables (k-1 coding with High School as reference):") -print(paste("High School (reference):", sum(data_clean$demo_edu == "High School (or equivalent)"))) -print(paste("College:", sum(data_clean$edu_college))) -print(paste("Undergraduate:", sum(data_clean$edu_undergrad))) -print(paste("Graduate:", sum(data_clean$edu_grad))) -``` - -## Regression Models - -```{r models} -#| label: models -#| echo: true -#| output: true - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1, data = data_clean) - -# Model 3: education dummies → eohiDGEN_mean (k-1 coding, HS as reference) -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1, data = data_clean) - -# Model 6: education dummies → ehi_global_mean (k-1 coding, HS as reference) -models$edu_ehi_global <- lm(ehi_global_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean) -``` - -## Assumption Checking Functions - -```{r functions} -#| label: functions -#| echo: true - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", 
mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", 
paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` - -## Model 1: Sex → EOHI-DGEN Mean - -```{r model1} -#| label: model1 -#| echo: true - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean - -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- 
get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") -``` - -## Model 6: Education → EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## Summary Tables - -### Assumption Violation Summary - -```{r violation-summary} -#| label: violation-summary -#| echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model 
Comparison Summary - -```{r comparison-summary} -#| label: comparison-summary -#| echo: true - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) -``` - -## Recommendations - -### For Assumption Violations: - -**1. Normality Violations:** -- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox) -- Alternative: Use robust regression methods or bootstrapping - -**2. Homoscedasticity Violations:** -- If violated: Use weighted least squares or robust standard errors -- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors - -**3. Independence Violations:** -- If violated: Check for clustering or repeated measures structure -- Alternative: Use mixed-effects models or clustered standard errors - -**4. Influential Observations:** -- If present: Examine these cases for data entry errors -- Consider: Running analysis with and without influential cases -- Alternative: Use robust regression methods - -**5. 
Linearity Violations:** -- If violated: Add polynomial terms or use splines -- Alternative: Transform predictors or use non-parametric methods - ---- - -*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.* diff --git a/.history/eohi1/regressions e1 - assumptions_20251016154106.qmd b/.history/eohi1/regressions e1 - assumptions_20251016154106.qmd deleted file mode 100644 index ff233ba..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016154106.qmd +++ /dev/null @@ -1,492 +0,0 @@ ---- -title: "Regression Analysis - Assumption Checking" -subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean" -author: "Irina" -date: today -format: - html: - theme: cosmo - toc: true - toc-depth: 3 - code-fold: false - code-tools: true - fig-width: 8 - fig-height: 6 -execute: - echo: true - warning: false - message: false - eval: true - output: true ---- - -## Setup and Data Preparation - -```{r setup} -#| label: setup -#| echo: true -#| output: true - -# Load required libraries -library(dplyr) # Must load first for %>% operator -library(car) -library(performance) -library(see) -library(ggplot2) -library(gridExtra) -library(lmtest) # For bptest and durbinWatsonTest - -# Set options -options(scipen = 999) - -# Set working directory and load data -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") -``` - -```{r data-prep} -#| label: data-prep -#| echo: true -#| output: true - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) - -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and create dummy variables -print("Education levels:") -edu_table <- table(data_clean$demo_edu) -print(edu_table) -print(paste("Number of education levels:", length(unique(data_clean$demo_edu)))) - -# Create dummy variables for education (k-1 coding: 7 levels = 6 dummy variables) -# Using High School as reference category (excluded dummy) -data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0) -data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0) -data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0) - -# Check what other education levels exist and create additional dummies -edu_levels <- unique(data_clean$demo_edu) -print("All education levels found:") -for(i in 1:length(edu_levels)) { - print(paste(i, ":", edu_levels[i])) -} - -# Create additional dummy variables for other education levels -# (You'll need to adjust these based on your actual data) -# Example for additional levels - adjust names and conditions as needed: -# data_clean$edu_other1 <- ifelse(data_clean$demo_edu == "Other Level 1", 1, 0) -# data_clean$edu_other2 <- ifelse(data_clean$demo_edu == "Other Level 2", 1, 0) -# data_clean$edu_other3 <- ifelse(data_clean$demo_edu == "Other Level 3", 1, 0) - -# Note: Once you identify all 7 levels, create 6 dummy variables total (k-1) 
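-
-# Hedged sketch (not part of the original analysis): rather than hand-coding
-# each indicator, R can build all k-1 dummies from the factor itself. The
-# reference level name below is an assumption; it must match demo_edu exactly.
-# edu_factor <- relevel(factor(data_clean$demo_edu), ref = "High School (or equivalent)")
-# edu_dummies <- model.matrix(~ edu_factor)[, -1]  # drop the intercept column
-# data_clean <- cbind(data_clean, edu_dummies)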
-
-# Verify dummy coding
-print("Sex recoding (0=Female, 1=Male):")
-print(table(data_clean$demo_sex_numeric))
-print("Education dummy variables (k-1 coding with High School as reference):")
-# print() silently drops extra arguments, so wrap label and count in paste()
-print(paste("High School (reference):", sum(data_clean$demo_edu == "High School (or equivalent)")))
-print(paste("College:", sum(data_clean$edu_college)))
-print(paste("Undergraduate:", sum(data_clean$edu_undergrad)))
-print(paste("Graduate:", sum(data_clean$edu_grad)))
-```
-
-## Regression Models
-
-```{r models}
-#| label: models
-#| echo: true
-#| output: true
-
-# Define the 6 regression models (the age column in data_clean is demo_age_1)
-models <- list()
-
-# Model 1: demo_sex → eohiDGEN_mean
-models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 2: demo_age_1 → eohiDGEN_mean
-models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1, data = data_clean)
-
-# Model 3: education dummies → eohiDGEN_mean (k-1 coding, HS as reference)
-models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-
-# Model 4: demo_sex → ehi_global_mean
-models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 5: demo_age_1 → ehi_global_mean
-models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1, data = data_clean)
-
-# Model 6: education dummies → ehi_global_mean (k-1 coding, HS as reference)
-models$edu_ehi_global <- lm(ehi_global_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-```
-
-## Assumption Checking Functions
-
-```{r functions}
-#| label: functions
-#| echo: true
-
-# Function to check linearity assumption
-check_linearity <- function(model, model_name) {
-  print(paste("=== LINEARITY CHECK:", model_name, "==="))
-
-  # Residuals vs Fitted plot
-  plot(model, which = 1, main = paste("Linearity:", model_name))
-
-  # Component + residual plot (partial residual plot)
-  crPlots(model, main = paste("Component+Residual Plot:", model_name))
-
-  return(NULL)
-}
-
-# Function to check normality of residuals
-check_normality <- function(model, model_name) {
-  print(paste("=== NORMALITY CHECK:", model_name, "==="))
-
-  # Q-Q plot
-  plot(model, which = 2, main = paste("Q-Q Plot:", model_name))
-
-  # Shapiro-Wilk test
-  residuals <- residuals(model)
-  shapiro_test <- shapiro.test(residuals)
-  print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5)))
-
-  # Kolmogorov-Smirnov test (p-value approximate: mean and sd are estimated)
-  ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals))
-  print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5)))
-
-  # Histogram of residuals
-  hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) +
-    geom_histogram(bins = 30, fill = "lightblue", color = "black") +
-    ggtitle(paste("Residuals Histogram:", model_name)) +
-    theme_minimal()
-  print(hist_plot)
-
-  return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value))
-}
-
-# Function to check homoscedasticity (constant variance)
-check_homoscedasticity <- function(model, model_name) {
-  print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "==="))
-
-  # Scale-Location plot
-  plot(model, which = 3, main = paste("Scale-Location Plot:", model_name))
-
-  # Breusch-Pagan test
-  bp_test <- bptest(model)
-  print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5)))
-
-  # White test (if available)
-  tryCatch({
-    white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2))
-    print(paste("White test p-value:", format(white_test$p.value, digits = 5)))
-  }, error = function(e) {
-    print("White test not available for this 
model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` - -## Model 1: Sex → EOHI-DGEN Mean - 
-```{r model1} -#| label: model1 -#| echo: true - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean - -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") -``` - -## Model 6: Education → EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## Summary Tables - -### Assumption Violation 
Summary - -```{r violation-summary} -#| label: violation-summary -#| echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model Comparison Summary - -```{r comparison-summary} -#| label: comparison-summary -#| echo: true - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 
4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) -``` - -## Recommendations - -### For Assumption Violations: - -**1. Normality Violations:** -- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox) -- Alternative: Use robust regression methods or bootstrapping - -**2. Homoscedasticity Violations:** -- If violated: Use weighted least squares or robust standard errors -- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors - -**3. Independence Violations:** -- If violated: Check for clustering or repeated measures structure -- Alternative: Use mixed-effects models or clustered standard errors - -**4. Influential Observations:** -- If present: Examine these cases for data entry errors -- Consider: Running analysis with and without influential cases -- Alternative: Use robust regression methods - -**5. Linearity Violations:** -- If violated: Add polynomial terms or use splines -- Alternative: Transform predictors or use non-parametric methods - ---- - -*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.* diff --git a/.history/eohi1/regressions e1 - assumptions_20251016154110.qmd b/.history/eohi1/regressions e1 - assumptions_20251016154110.qmd deleted file mode 100644 index ff233ba..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016154110.qmd +++ /dev/null @@ -1,492 +0,0 @@ ---- -title: "Regression Analysis - Assumption Checking" -subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean" -author: "Irina" -date: today -format: - html: - theme: cosmo - toc: true - toc-depth: 3 - code-fold: false - code-tools: true - fig-width: 8 - fig-height: 6 -execute: - echo: true - warning: false - message: false - eval: true - output: true ---- - -## Setup and Data Preparation - -```{r setup} -#| label: setup -#| echo: true -#| output: true - -# Load required libraries -library(dplyr) # Must load first for %>% operator -library(car) -library(performance) -library(see) -library(ggplot2) -library(gridExtra) -library(lmtest) # For bptest and durbinWatsonTest - -# Set options -options(scipen = 999) - -# Set working directory and load data -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") -``` - -```{r data-prep} -#| label: data-prep -#| echo: true -#| output: true - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) - -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and create dummy variables -print("Education levels:") -edu_table <- table(data_clean$demo_edu) -print(edu_table) -print(paste("Number of education levels:", length(unique(data_clean$demo_edu)))) - -# Create dummy variables for 
education (k-1 coding: 7 levels = 6 dummy variables)
-# Using High School as reference category (excluded dummy)
-data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0)
-data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0)
-data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0)
-
-# Check what other education levels exist and create additional dummies
-edu_levels <- unique(data_clean$demo_edu)
-print("All education levels found:")
-for (i in seq_along(edu_levels)) {
-  print(paste(i, ":", edu_levels[i]))
-}
-
-# Create additional dummy variables for other education levels
-# (You'll need to adjust these based on your actual data)
-# Example for additional levels - adjust names and conditions as needed:
-# data_clean$edu_other1 <- ifelse(data_clean$demo_edu == "Other Level 1", 1, 0)
-# data_clean$edu_other2 <- ifelse(data_clean$demo_edu == "Other Level 2", 1, 0)
-# data_clean$edu_other3 <- ifelse(data_clean$demo_edu == "Other Level 3", 1, 0)
-
-# Note: Once you identify all 7 levels, create 6 dummy variables total (k-1)
-
-# Verify dummy coding
-print("Sex recoding (0=Female, 1=Male):")
-print(table(data_clean$demo_sex_numeric))
-print("Education dummy variables (k-1 coding with High School as reference):")
-# print() silently drops extra arguments, so wrap label and count in paste()
-print(paste("High School (reference):", sum(data_clean$demo_edu == "High School (or equivalent)")))
-print(paste("College:", sum(data_clean$edu_college)))
-print(paste("Undergraduate:", sum(data_clean$edu_undergrad)))
-print(paste("Graduate:", sum(data_clean$edu_grad)))
-```
-
-## Regression Models
-
-```{r models}
-#| label: models
-#| echo: true
-#| output: true
-
-# Define the 6 regression models (the age column in data_clean is demo_age_1)
-models <- list()
-
-# Model 1: demo_sex → eohiDGEN_mean
-models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 2: demo_age_1 → eohiDGEN_mean
-models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1, data = data_clean)
-
-# Model 3: education dummies → eohiDGEN_mean (k-1 coding, HS as reference)
-models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-
-# Model 4: demo_sex → ehi_global_mean
-models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 5: demo_age_1 → ehi_global_mean
-models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1, data = data_clean)
-
-# Model 6: education dummies → ehi_global_mean (k-1 coding, HS as reference)
-models$edu_ehi_global <- lm(ehi_global_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-```
-
-## Assumption Checking Functions
-
-```{r functions}
-#| label: functions
-#| echo: true
-
-# Function to check linearity assumption
-check_linearity <- function(model, model_name) {
-  print(paste("=== LINEARITY CHECK:", model_name, "==="))
-
-  # Residuals vs Fitted plot
-  plot(model, which = 1, main = paste("Linearity:", model_name))
-
-  # Component + residual plot (partial residual plot)
-  crPlots(model, main = paste("Component+Residual Plot:", model_name))
-
-  return(NULL)
-}
-
-# Function to check normality of residuals
-check_normality <- function(model, model_name) {
-  print(paste("=== NORMALITY CHECK:", model_name, "==="))
-
-  # Q-Q plot
-  plot(model, which = 2, main = paste("Q-Q Plot:", model_name))
-
-  # Shapiro-Wilk test
-  residuals <- residuals(model)
-  shapiro_test <- shapiro.test(residuals)
-  print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5)))
-
-  # Kolmogorov-Smirnov test (p-value approximate: mean and sd are estimated)
-  ks_test <- ks.test(residuals, "pnorm", 
mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", 
paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` - -## Model 1: Sex → EOHI-DGEN Mean - -```{r model1} -#| label: model1 -#| echo: true - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean - -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- 
get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") -``` - -## Model 6: Education → EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## Summary Tables - -### Assumption Violation Summary - -```{r violation-summary} -#| label: violation-summary -#| echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model 
Comparison Summary - -```{r comparison-summary} -#| label: comparison-summary -#| echo: true - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) -``` - -## Recommendations - -### For Assumption Violations: - -**1. Normality Violations:** -- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox) -- Alternative: Use robust regression methods or bootstrapping - -**2. Homoscedasticity Violations:** -- If violated: Use weighted least squares or robust standard errors -- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors - -**3. Independence Violations:** -- If violated: Check for clustering or repeated measures structure -- Alternative: Use mixed-effects models or clustered standard errors - -**4. Influential Observations:** -- If present: Examine these cases for data entry errors -- Consider: Running analysis with and without influential cases -- Alternative: Use robust regression methods - -**5. 
Linearity Violations:** -- If violated: Add polynomial terms or use splines -- Alternative: Transform predictors or use non-parametric methods - ---- - -*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.* diff --git a/.history/eohi1/regressions e1 - assumptions_20251016154202.qmd b/.history/eohi1/regressions e1 - assumptions_20251016154202.qmd deleted file mode 100644 index 40dfbcd..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016154202.qmd +++ /dev/null @@ -1,494 +0,0 @@ ---- -title: "Regression Analysis - Assumption Checking" -subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean" -author: "Irina" -date: today -format: - html: - theme: cosmo - toc: true - toc-depth: 3 - code-fold: false - code-tools: true - fig-width: 8 - fig-height: 6 -execute: - echo: true - warning: false - message: false - eval: true - output: true - results: 'markup' - cache: false ---- - -## Setup and Data Preparation - -```{r setup} -#| label: setup -#| echo: true -#| output: true - -# Load required libraries -library(dplyr) # Must load first for %>% operator -library(car) -library(performance) -library(see) -library(ggplot2) -library(gridExtra) -library(lmtest) # For bptest and durbinWatsonTest - -# Set options -options(scipen = 999) - -# Set working directory and load data -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") -``` - -```{r data-prep} -#| label: data-prep -#| echo: true -#| output: true - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) - -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and create dummy variables -print("Education levels:") -edu_table <- table(data_clean$demo_edu) -print(edu_table) -print(paste("Number of education levels:", length(unique(data_clean$demo_edu)))) - -# Create dummy variables for education (k-1 coding: 7 levels = 6 dummy variables) -# Using High School as reference category (excluded dummy) -data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0) -data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0) -data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0) - -# Check what other education levels exist and create additional dummies -edu_levels <- unique(data_clean$demo_edu) -print("All education levels found:") -for(i in 1:length(edu_levels)) { - print(paste(i, ":", edu_levels[i])) -} - -# Create additional dummy variables for other education levels -# (You'll need to adjust these based on your actual data) -# Example for additional levels - adjust names and conditions as needed: -# data_clean$edu_other1 <- ifelse(data_clean$demo_edu == "Other Level 1", 1, 0) -# data_clean$edu_other2 <- ifelse(data_clean$demo_edu == "Other Level 2", 1, 0) -# data_clean$edu_other3 <- ifelse(data_clean$demo_edu == "Other Level 3", 1, 0) - -# Note: Once you identify all 7 levels, 
create 6 dummy variables total (k-1)
-
-# Verify dummy coding
-print("Sex recoding (0=Female, 1=Male):")
-print(table(data_clean$demo_sex_numeric))
-print("Education dummy variables (k-1 coding with High School as reference):")
-# print() silently drops extra arguments, so wrap label and count in paste()
-print(paste("High School (reference):", sum(data_clean$demo_edu == "High School (or equivalent)")))
-print(paste("College:", sum(data_clean$edu_college)))
-print(paste("Undergraduate:", sum(data_clean$edu_undergrad)))
-print(paste("Graduate:", sum(data_clean$edu_grad)))
-```
-
-## Regression Models
-
-```{r models}
-#| label: models
-#| echo: true
-#| output: true
-
-# Define the 6 regression models (the age column in data_clean is demo_age_1)
-models <- list()
-
-# Model 1: demo_sex → eohiDGEN_mean
-models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 2: demo_age_1 → eohiDGEN_mean
-models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1, data = data_clean)
-
-# Model 3: education dummies → eohiDGEN_mean (k-1 coding, HS as reference)
-models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-
-# Model 4: demo_sex → ehi_global_mean
-models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 5: demo_age_1 → ehi_global_mean
-models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1, data = data_clean)
-
-# Model 6: education dummies → ehi_global_mean (k-1 coding, HS as reference)
-models$edu_ehi_global <- lm(ehi_global_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-```
-
-## Assumption Checking Functions
-
-```{r functions}
-#| label: functions
-#| echo: true
-
-# Function to check linearity assumption
-check_linearity <- function(model, model_name) {
-  print(paste("=== LINEARITY CHECK:", model_name, "==="))
-
-  # Residuals vs Fitted plot
-  plot(model, which = 1, main = paste("Linearity:", model_name))
-
-  # Component + residual plot (partial residual plot)
-  crPlots(model, main = paste("Component+Residual Plot:", model_name))
-
-  return(NULL)
-}
-
-# Function to check normality of residuals
-check_normality <- function(model, model_name) {
-  print(paste("=== NORMALITY CHECK:", model_name, "==="))
-
-  # Q-Q plot
-  plot(model, which = 2, main = paste("Q-Q Plot:", model_name))
-
-  # Shapiro-Wilk test
-  residuals <- residuals(model)
-  shapiro_test <- shapiro.test(residuals)
-  print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5)))
-
-  # Kolmogorov-Smirnov test (p-value approximate: mean and sd are estimated)
-  ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals))
-  print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5)))
-
-  # Histogram of residuals
-  hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) +
-    geom_histogram(bins = 30, fill = "lightblue", color = "black") +
-    ggtitle(paste("Residuals Histogram:", model_name)) +
-    theme_minimal()
-  print(hist_plot)
-
-  return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value))
-}
-
-# Function to check homoscedasticity (constant variance)
-check_homoscedasticity <- function(model, model_name) {
-  print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "==="))
-
-  # Scale-Location plot
-  plot(model, which = 3, main = paste("Scale-Location Plot:", model_name))
-
-  # Breusch-Pagan test
-  bp_test <- bptest(model)
-  print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5)))
-
-  # White test (if available)
-  tryCatch({
-    white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2))
-    print(paste("White test p-value:", format(white_test$p.value, digits = 5)))
-  }, error = function(e) {
-    
print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` 
- -## Model 1: Sex → EOHI-DGEN Mean - -```{r model1} -#| label: model1 -#| echo: true - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean - -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") -``` - -## Model 6: Education → EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## 
Summary Tables - -### Assumption Violation Summary - -```{r violation-summary} -#| label: violation-summary -#| echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model Comparison Summary - -```{r comparison-summary} -#| label: comparison-summary -#| echo: true - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - 
ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) -``` - -## Recommendations - -### For Assumption Violations: - -**1. Normality Violations:** -- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox) -- Alternative: Use robust regression methods or bootstrapping - -**2. Homoscedasticity Violations:** -- If violated: Use weighted least squares or robust standard errors -- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors - -**3. Independence Violations:** -- If violated: Check for clustering or repeated measures structure -- Alternative: Use mixed-effects models or clustered standard errors - -**4. Influential Observations:** -- If present: Examine these cases for data entry errors -- Consider: Running analysis with and without influential cases -- Alternative: Use robust regression methods - -**5. Linearity Violations:** -- If violated: Add polynomial terms or use splines -- Alternative: Transform predictors or use non-parametric methods - ---- - -*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.* diff --git a/.history/eohi1/regressions e1 - assumptions_20251016154210.qmd b/.history/eohi1/regressions e1 - assumptions_20251016154210.qmd deleted file mode 100644 index a8af071..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016154210.qmd +++ /dev/null @@ -1,495 +0,0 @@ ---- -title: "Regression Analysis - Assumption Checking" -subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean" -author: "Irina" -date: today -format: - html: - theme: cosmo - toc: true - toc-depth: 3 - code-fold: false - code-tools: true - fig-width: 8 - fig-height: 6 -execute: - echo: true - warning: false - message: false - eval: true - output: true - results: 'markup' - cache: false ---- - -## Setup and Data Preparation - -```{r setup} -#| label: setup -#| echo: true -#| output: true - -# Load required libraries -library(dplyr) # Must load first for %>% operator -library(car) -library(performance) -library(see) -library(ggplot2) -library(gridExtra) -library(lmtest) # For bptest and durbinWatsonTest - -# Set options -options(scipen = 999) - -# Set working directory and load data -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") -``` - -```{r data-prep} -#| label: data-prep -#| echo: true -#| output: true - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) - -cat("Missing values check:\n") -print(missing_summary) -cat("\n") - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and create dummy variables -print("Education levels:") -edu_table <- table(data_clean$demo_edu) -print(edu_table) -print(paste("Number of 
education levels:", length(unique(data_clean$demo_edu)))) - -# Create dummy variables for education (k-1 coding: 7 levels = 6 dummy variables) -# Using High School as reference category (excluded dummy) -data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0) -data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0) -data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0) - -# Check what other education levels exist and create additional dummies -edu_levels <- unique(data_clean$demo_edu) -print("All education levels found:") -for(i in 1:length(edu_levels)) { - print(paste(i, ":", edu_levels[i])) -} - -# Create additional dummy variables for other education levels -# (You'll need to adjust these based on your actual data) -# Example for additional levels - adjust names and conditions as needed: -# data_clean$edu_other1 <- ifelse(data_clean$demo_edu == "Other Level 1", 1, 0) -# data_clean$edu_other2 <- ifelse(data_clean$demo_edu == "Other Level 2", 1, 0) -# data_clean$edu_other3 <- ifelse(data_clean$demo_edu == "Other Level 3", 1, 0) - -# Note: Once you identify all 7 levels, create 6 dummy variables total (k-1) - -# Verify dummy coding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education dummy variables (k-1 coding with High School as reference):") -print("High School (reference):", sum(data_clean$demo_edu == "High School (or equivalent)")) -print("College:", sum(data_clean$edu_college)) -print("Undergraduate:", sum(data_clean$edu_undergrad)) -print("Graduate:", sum(data_clean$edu_grad)) -``` - -## Regression Models - -```{r models} -#| label: models -#| echo: true -#| output: true - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age → eohiDGEN_mean -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean) - -# Model 3: education dummies → eohiDGEN_mean (k-1 coding, HS as reference) -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age → ehi_global_mean -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age, data = data_clean) - -# Model 6: education dummies → ehi_global_mean (k-1 coding, HS as reference) -models$edu_ehi_global <- lm(ehi_global_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean) -``` - -## Assumption Checking Functions - -```{r functions} -#| label: functions -#| echo: true - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, 
digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - 
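-    # Note: these case numbers are row positions in data_clean (complete cases
-    # only); map them back to pID before deciding how to handle a flagged row.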
} - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` - -## Model 1: Sex → EOHI-DGEN Mean - -```{r model1} -#| label: model1 -#| echo: true - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean 
- -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") -``` - -## Model 6: Education → EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## Summary Tables - -### Assumption Violation Summary - -```{r violation-summary} -#| label: violation-summary -#| echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - 
stringsAsFactors = FALSE
-)
-
-print(violation_summary)
-```
-
-### Model Comparison Summary
-
-```{r comparison-summary}
-#| label: comparison-summary
-#| echo: true
-
-# Create model comparison table
-comparison_table <- data.frame(
-  Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN",
-            "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"),
-  R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared,
-                model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared),
-  Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared,
-                    model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared),
-  AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic,
-          model4_summary$aic, model5_summary$aic, model6_summary$aic),
-  BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic,
-          model4_summary$bic, model5_summary$bic, model6_summary$bic),
-  # The slope p-value [2, 4] is the overall test for the single-predictor models,
-  # but the education models have three dummies, so use the overall F-test there
-  # rather than the p-value of the first dummy alone.
-  Significant = c(
-    ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(pf(model3_summary$summary$fstatistic[1], model3_summary$summary$fstatistic[2],
-              model3_summary$summary$fstatistic[3], lower.tail = FALSE) < 0.05, "YES", "NO"),
-    ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(pf(model6_summary$summary$fstatistic[1], model6_summary$summary$fstatistic[2],
-              model6_summary$summary$fstatistic[3], lower.tail = FALSE) < 0.05, "YES", "NO")
-  ),
-  stringsAsFactors = FALSE
-)
-
-print(comparison_table)
-```
-
-## Recommendations
-
-### For Assumption Violations:
-
-**1. Normality Violations:**
-- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox)
-- Alternative: Use robust regression methods or bootstrapping
-
-**2. Homoscedasticity Violations:**
-- If violated: Use weighted least squares or robust standard errors
-- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors
-
-**3. Independence Violations:**
-- If violated: Check for clustering or repeated measures structure
-- Alternative: Use mixed-effects models or clustered standard errors
-
-**4. Influential Observations:**
-- If present: Examine these cases for data entry errors
-- Consider: Running analysis with and without influential cases
-- Alternative: Use robust regression methods
-
-**5. 
Linearity Violations:**
-- If violated: Add polynomial terms or use splines
-- Alternative: Transform predictors or use non-parametric methods
-
-----
-
-*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.*
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016154217.qmd b/.history/eohi1/regressions e1 - assumptions_20251016154217.qmd
deleted file mode 100644
index a8af071..0000000
--- a/.history/eohi1/regressions e1 - assumptions_20251016154217.qmd
+++ /dev/null
@@ -1,495 +0,0 @@
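Since each of the six per-model chunks in these snapshots repeats the same six calls, a compact driver along the following lines (a sketch built only from the `models` list and the checking functions defined in the file) would produce the same output without the copy-paste:

```r
# Sketch: run the full battery of checks over every fitted model.
model_labels <- c(
  sex_eohiDGEN   = "Sex → EOHI-DGEN",
  age_eohiDGEN   = "Age → EOHI-DGEN",
  edu_eohiDGEN   = "Education → EOHI-DGEN",
  sex_ehi_global = "Sex → EHI-Global",
  age_ehi_global = "Age → EHI-Global",
  edu_ehi_global = "Education → EHI-Global"
)

results <- lapply(names(models), function(key) {
  label <- model_labels[[key]]
  list(
    summary      = get_model_summary(models[[key]], label),
    normality    = check_normality(models[[key]], label),
    homosced     = check_homoscedasticity(models[[key]], label),
    independence = check_independence(models[[key]], label),
    influence    = check_influence(models[[key]], label),
    linearity    = check_linearity(models[[key]], label)
  )
})
names(results) <- names(models)

# e.g. results$sex_eohiDGEN$normality$shapiro_p, which the summary
# tables could then read instead of six sets of model*_ variables.
```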
ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) -``` - -## Recommendations - -### For Assumption Violations: - -**1. Normality Violations:** -- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox) -- Alternative: Use robust regression methods or bootstrapping - -**2. Homoscedasticity Violations:** -- If violated: Use weighted least squares or robust standard errors -- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors - -**3. Independence Violations:** -- If violated: Check for clustering or repeated measures structure -- Alternative: Use mixed-effects models or clustered standard errors - -**4. Influential Observations:** -- If present: Examine these cases for data entry errors -- Consider: Running analysis with and without influential cases -- Alternative: Use robust regression methods - -**5. Linearity Violations:** -- If violated: Add polynomial terms or use splines -- Alternative: Transform predictors or use non-parametric methods - ---- - -*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.* diff --git a/.history/eohi1/regressions e1 - assumptions_20251016154250.qmd b/.history/eohi1/regressions e1 - assumptions_20251016154250.qmd deleted file mode 100644 index a8af071..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016154250.qmd +++ /dev/null @@ -1,495 +0,0 @@ ---- -title: "Regression Analysis - Assumption Checking" -subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean" -author: "Irina" -date: today -format: - html: - theme: cosmo - toc: true - toc-depth: 3 - code-fold: false - code-tools: true - fig-width: 8 - fig-height: 6 -execute: - echo: true - warning: false - message: false - eval: true - output: true - results: 'markup' - cache: false ---- - -## Setup and Data Preparation - -```{r setup} -#| label: setup -#| echo: true -#| output: true - -# Load required libraries -library(dplyr) # Must load first for %>% operator -library(car) -library(performance) -library(see) -library(ggplot2) -library(gridExtra) -library(lmtest) # For bptest and durbinWatsonTest - -# Set options -options(scipen = 999) - -# Set working directory and load data -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") -``` - -```{r data-prep} -#| label: data-prep -#| echo: true -#| output: true - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) - -cat("Missing values check:\n") -print(missing_summary) -cat("\n") - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and create dummy variables -print("Education levels:") -edu_table <- table(data_clean$demo_edu) -print(edu_table) -print(paste("Number of 
education levels:", length(unique(data_clean$demo_edu)))) - -# Create dummy variables for education (k-1 coding: 7 levels = 6 dummy variables) -# Using High School as reference category (excluded dummy) -data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0) -data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0) -data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0) - -# Check what other education levels exist and create additional dummies -edu_levels <- unique(data_clean$demo_edu) -print("All education levels found:") -for(i in 1:length(edu_levels)) { - print(paste(i, ":", edu_levels[i])) -} - -# Create additional dummy variables for other education levels -# (You'll need to adjust these based on your actual data) -# Example for additional levels - adjust names and conditions as needed: -# data_clean$edu_other1 <- ifelse(data_clean$demo_edu == "Other Level 1", 1, 0) -# data_clean$edu_other2 <- ifelse(data_clean$demo_edu == "Other Level 2", 1, 0) -# data_clean$edu_other3 <- ifelse(data_clean$demo_edu == "Other Level 3", 1, 0) - -# Note: Once you identify all 7 levels, create 6 dummy variables total (k-1) - -# Verify dummy coding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education dummy variables (k-1 coding with High School as reference):") -print("High School (reference):", sum(data_clean$demo_edu == "High School (or equivalent)")) -print("College:", sum(data_clean$edu_college)) -print("Undergraduate:", sum(data_clean$edu_undergrad)) -print("Graduate:", sum(data_clean$edu_grad)) -``` - -## Regression Models - -```{r models} -#| label: models -#| echo: true -#| output: true - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age → eohiDGEN_mean -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean) - -# Model 3: education dummies → eohiDGEN_mean (k-1 coding, HS as reference) -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age → ehi_global_mean -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age, data = data_clean) - -# Model 6: education dummies → ehi_global_mean (k-1 coding, HS as reference) -models$edu_ehi_global <- lm(ehi_global_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean) -``` - -## Assumption Checking Functions - -```{r functions} -#| label: functions -#| echo: true - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, 
digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - 
} - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` - -## Model 1: Sex → EOHI-DGEN Mean - -```{r model1} -#| label: model1 -#| echo: true - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean 
- -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") -``` - -## Model 6: Education → EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## Summary Tables - -### Assumption Violation Summary - -```{r violation-summary} -#| label: violation-summary -#| echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - 
stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model Comparison Summary - -```{r comparison-summary} -#| label: comparison-summary -#| echo: true - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) -``` - -## Recommendations - -### For Assumption Violations: - -**1. Normality Violations:** -- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox) -- Alternative: Use robust regression methods or bootstrapping - -**2. Homoscedasticity Violations:** -- If violated: Use weighted least squares or robust standard errors -- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors - -**3. Independence Violations:** -- If violated: Check for clustering or repeated measures structure -- Alternative: Use mixed-effects models or clustered standard errors - -**4. Influential Observations:** -- If present: Examine these cases for data entry errors -- Consider: Running analysis with and without influential cases -- Alternative: Use robust regression methods - -**5. 
-
-## Recommendations
-
-### For Assumption Violations:
-
-**1. Normality Violations:**
-- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox)
-- Alternative: Use robust regression methods or bootstrapping
-
-**2. Homoscedasticity Violations:**
-- If violated: Use weighted least squares or robust standard errors
-- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors (see the first sketch after this list)
-
-**3. Independence Violations:**
-- If violated: Check for clustering or repeated-measures structure
-- Alternative: Use mixed-effects models or clustered standard errors
-
-**4. Influential Observations:**
-- If present: Examine these cases for data entry errors
-- Consider: Running the analysis with and without the influential cases
-- Alternative: Use robust regression methods
-
-**5. Linearity Violations:**
-- If violated: Add polynomial terms or use splines (see the second sketch after this list)
-- Alternative: Transform predictors or use non-parametric methods
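-
-A minimal sketch of the heteroscedasticity-consistent standard errors mentioned in point 2, assuming the `sandwich` package is installed (`lmtest`, already loaded in setup, provides `coeftest()`):
-
-```{r}
-library(sandwich)
-
-# Same coefficient estimates, but SEs and p-values recomputed with an
-# HC3 heteroscedasticity-consistent covariance matrix
-coeftest(models$sex_eohiDGEN, vcov = vcovHC(models$sex_eohiDGEN, type = "HC3"))
-```
-
-And a sketch of the spline remedy from point 5, assuming the age column kept in `data_clean` is `demo_age_1` (the `splines` package ships with base R):
-
-```{r}
-library(splines)
-
-# Refit age as a natural cubic spline (3 df) alongside a plain linear fit
-age_linear <- lm(eohiDGEN_mean ~ demo_age_1, data = data_clean)
-age_spline <- lm(eohiDGEN_mean ~ ns(demo_age_1, df = 3), data = data_clean)
-
-# A significant F here favors the more flexible spline over the linear term
-anova(age_linear, age_spline)
-```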
-
----
-
-*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.*
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016154501.qmd b/.history/eohi1/regressions e1 - assumptions_20251016154501.qmd
deleted file mode 100644
index 6811562..0000000
--- a/.history/eohi1/regressions e1 - assumptions_20251016154501.qmd
+++ /dev/null
@@ -1,497 +0,0 @@
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016154514.qmd b/.history/eohi1/regressions e1 - assumptions_20251016154514.qmd
deleted file mode 100644
index 6811562..0000000
--- a/.history/eohi1/regressions e1 - assumptions_20251016154514.qmd
+++ /dev/null
@@ -1,497 +0,0 @@
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016154524.qmd b/.history/eohi1/regressions e1 - assumptions_20251016154524.qmd
deleted file mode 100644
index 6811562..0000000
--- a/.history/eohi1/regressions e1 - assumptions_20251016154524.qmd
+++ /dev/null
@@ -1,497 +0,0 @@
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016154558.qmd b/.history/eohi1/regressions e1 - assumptions_20251016154558.qmd
deleted file mode 100644
index 253c964..0000000
--- a/.history/eohi1/regressions e1 - assumptions_20251016154558.qmd
+++ /dev/null
@@ -1,502 +0,0 @@
data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and create dummy variables -print("Education levels:") -edu_table <- table(data_clean$demo_edu) -print(edu_table) -print(paste("Number of education levels:", length(unique(data_clean$demo_edu)))) - -# Create dummy variables for education (k-1 coding: 7 levels = 6 dummy variables) -# Using High School as reference category (excluded dummy) -data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0) -data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0) -data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0) - -# Check what other education levels exist and create additional dummies -edu_levels <- unique(data_clean$demo_edu) -print("All education levels found:") -for(i in 1:length(edu_levels)) { - print(paste(i, ":", edu_levels[i])) -} - -# Create additional dummy variables for other education levels -# (You'll need to adjust these based on your actual data) -# Example for additional levels - adjust names and conditions as needed: -# data_clean$edu_other1 <- ifelse(data_clean$demo_edu == "Other Level 1", 1, 0) -# data_clean$edu_other2 <- ifelse(data_clean$demo_edu == "Other Level 2", 1, 0) -# data_clean$edu_other3 <- ifelse(data_clean$demo_edu == "Other Level 3", 1, 0) - -# Note: Once you identify all 7 levels, create 6 dummy variables total (k-1) - -# Verify dummy coding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education dummy variables (k-1 coding with High School as reference):") -print("High School (reference):", sum(data_clean$demo_edu == "High School (or equivalent)")) -print("College:", sum(data_clean$edu_college)) -print("Undergraduate:", sum(data_clean$edu_undergrad)) -print("Graduate:", sum(data_clean$edu_grad)) -``` - -## Regression Models - -```{r models} -#| label: models -#| echo: true -#| output: true - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age → eohiDGEN_mean -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean) - -# Model 3: education dummies → eohiDGEN_mean (k-1 coding, HS as reference) -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age → ehi_global_mean -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age, data = data_clean) - -# Model 6: education dummies → ehi_global_mean (k-1 coding, HS as reference) -models$edu_ehi_global <- lm(ehi_global_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean) -``` - -## Assumption Checking Functions - -```{r functions} -#| label: functions -#| echo: true - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of 
residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 
5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` - -## Model 1: Sex → EOHI-DGEN Mean - -```{r model1} -#| label: model1 -#| echo: true - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") 
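# --- Editor's sketch (hedged; wrapped in if (FALSE) so nothing extra runs or plots here) ---
# The six per-model sections repeat the same five checks verbatim. One loop over the named
# `models` list could generate all of them; `model_labels` is a hypothetical helper, not
# part of the original analysis.
if (FALSE) {
  model_labels <- c(
    sex_eohiDGEN   = "Sex → EOHI-DGEN",
    age_eohiDGEN   = "Age → EOHI-DGEN",
    edu_eohiDGEN   = "Education → EOHI-DGEN",
    sex_ehi_global = "Sex → EHI-Global",
    age_ehi_global = "Age → EHI-Global",
    edu_ehi_global = "Education → EHI-Global"
  )
  checks <- lapply(names(models), function(nm) {
    list(
      summary      = get_model_summary(models[[nm]], model_labels[[nm]]),
      normality    = check_normality(models[[nm]], model_labels[[nm]]),
      homosced     = check_homoscedasticity(models[[nm]], model_labels[[nm]]),
      independence = check_independence(models[[nm]], model_labels[[nm]]),
      influence    = check_influence(models[[nm]], model_labels[[nm]])
    )
  })
  names(checks) <- names(models)
}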
-model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean - -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") -``` - -## Model 6: Education → EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## Summary Tables - -### Assumption Violation Summary - -```{r violation-summary} -#| label: violation-summary -#| echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - 
paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model Comparison Summary - -```{r comparison-summary} -#| label: comparison-summary -#| echo: true - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) -``` - -## Recommendations - -### For Assumption Violations: - -**1. Normality Violations:** -- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox) -- Alternative: Use robust regression methods or bootstrapping - -**2. Homoscedasticity Violations:** -- If violated: Use weighted least squares or robust standard errors -- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors - -**3. Independence Violations:** -- If violated: Check for clustering or repeated measures structure -- Alternative: Use mixed-effects models or clustered standard errors - -**4. Influential Observations:** -- If present: Examine these cases for data entry errors -- Consider: Running analysis with and without influential cases -- Alternative: Use robust regression methods - -**5. 
Linearity Violations:** -- If violated: Add polynomial terms or use splines -- Alternative: Transform predictors or use non-parametric methods - ---- - -*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.*
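The recommendations above name remedies without showing them. As one hedged illustration of item 2 (heteroscedasticity-consistent inference), the sketch below uses simulated data rather than `ehi1.csv`, and assumes the `sandwich` and `lmtest` packages are installed; it is an editor's sketch, not part of the original analysis.

```r
library(sandwich)  # vcovHC(): heteroscedasticity-consistent covariance estimators
library(lmtest)    # coeftest(): coefficient tests with a user-supplied vcov

set.seed(1)
n <- 200
x <- rnorm(n)
y <- 0.5 * x + rnorm(n, sd = 1 + abs(x))  # residual spread grows with |x|

fit <- lm(y ~ x)

coeftest(fit)                                    # conventional (homoscedastic) SEs
coeftest(fit, vcov = vcovHC(fit, type = "HC3"))  # same estimates, robust HC3 SEs
```

The point estimates are unchanged; only the standard errors (and hence the t and p values) are corrected, which is why robust SEs are often preferred to transforming the outcome.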
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016154622.qmd b/.history/eohi1/regressions e1 - assumptions_20251016154622.qmd deleted file mode 100644 index 597b24d..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016154622.qmd +++ /dev/null @@ -1,502 +0,0 @@ ---- -title: "Regression Analysis - Assumption Checking" -subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean" -author: "Irina" -date: today -format: - html: - theme: cosmo - toc: true - toc-depth: 3 - code-fold: false - code-tools: true - fig-width: 8 - fig-height: 6 -execute: - echo: true - warning: false - message: false - eval: true - output: true - results: 'markup' - cache: false ---- - -## Setup and Data Preparation - -```{r setup} -#| label: setup -#| echo: true -#| output: true - -# Load required libraries -library(dplyr) # Must load first for %>% operator -library(ggplot2) # Load ggplot2 first to avoid version conflicts -library(car) -library(performance) -# library(see) # Temporarily commented out due to version conflict -library(gridExtra) -library(lmtest) # For bptest and durbinWatsonTest - -# Set options -options(scipen = 999) - -# Set working directory and load data -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -print("test message") - -# Test chunk execution -cat("This should appear when you run this chunk\n") -print(paste("Current time:", Sys.time())) -cat("If you see this output below this chunk, Quarto is working correctly!\n") -``` - -```{r data-prep} -#| label: data-prep -#| echo: true -#| output: true - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -
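# Editor's sketch (hedged; wrapped in if (FALSE) so it does not run): the k-1 education
# dummies built by hand below can also be obtained by declaring demo_edu a factor with
# High School as the reference level and letting lm() generate the contrasts.
if (FALSE) {
  data$demo_edu <- relevel(factor(data$demo_edu),
                           ref = "High School (or equivalent)")
  lm(eohiDGEN_mean ~ demo_edu, data = data)  # k-1 dummies, High School as baseline
}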
-cat("Missing values check:\n") -print(missing_summary) -cat("\n") - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and create dummy variables -print("Education levels:") -edu_table <- table(data_clean$demo_edu) -print(edu_table) -print(paste("Number of education levels:", length(unique(data_clean$demo_edu)))) - -# Create dummy variables for education (k-1 coding: 7 levels = 6 dummy variables) -# Using High School as reference category (excluded dummy) -data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0) -data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0) -data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0) - -# Check what other education levels exist and create additional dummies -edu_levels <- unique(data_clean$demo_edu) -print("All education levels found:") -for(i in 1:length(edu_levels)) { - print(paste(i, ":", edu_levels[i])) -} - -# Create additional dummy variables for other education levels -# (You'll need to adjust these based on your actual data) -# Example for additional levels - adjust names and conditions as needed: -# data_clean$edu_other1 <- ifelse(data_clean$demo_edu == "Other Level 1", 1, 0) -# data_clean$edu_other2 <- ifelse(data_clean$demo_edu == "Other Level 2", 1, 0) -# data_clean$edu_other3 <- ifelse(data_clean$demo_edu == "Other Level 3", 1, 0) - -# Note: Once you identify all 7 levels, create 6 dummy variables total (k-1) - -# Verify dummy coding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education dummy variables (k-1 coding with High School as reference):") -print("High School (reference):", sum(data_clean$demo_edu == "High School (or equivalent)")) -print("College:", sum(data_clean$edu_college)) -print("Undergraduate:", sum(data_clean$edu_undergrad)) -print("Graduate:", sum(data_clean$edu_grad)) -``` - -## Regression Models - -```{r models} -#| label: models -#| echo: true -#| output: true - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age → eohiDGEN_mean -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean) - -# Model 3: education dummies → eohiDGEN_mean (k-1 coding, HS as reference) -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age → ehi_global_mean -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age, data = data_clean) - -# Model 6: education dummies → ehi_global_mean (k-1 coding, HS as reference) -models$edu_ehi_global <- lm(ehi_global_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean) -``` - -## Assumption Checking Functions - -```{r functions} -#| label: functions -#| echo: true - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs 
Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- 
which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` - -## Model 1: Sex → EOHI-DGEN Mean - -```{r model1} -#| label: model1 -#| echo: true - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, 
"Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean - -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") -``` - -## Model 6: Education → EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## Summary Tables - -### Assumption Violation Summary - -```{r violation-summary} -#| label: violation-summary -#| echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - 
paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model Comparison Summary - -```{r comparison-summary} -#| label: comparison-summary -#| echo: true - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) -``` - -## Recommendations - -### For Assumption Violations: - -**1. Normality Violations:** -- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox) -- Alternative: Use robust regression methods or bootstrapping - -**2. Homoscedasticity Violations:** -- If violated: Use weighted least squares or robust standard errors -- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors - -**3. Independence Violations:** -- If violated: Check for clustering or repeated measures structure -- Alternative: Use mixed-effects models or clustered standard errors - -**4. Influential Observations:** -- If present: Examine these cases for data entry errors -- Consider: Running analysis with and without influential cases -- Alternative: Use robust regression methods - -**5. 
Linearity Violations:** -- If violated: Add polynomial terms or use splines -- Alternative: Transform predictors or use non-parametric methods - ---- - -*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.*
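One caveat the summary chunks share: the `Significant` column reads `coefficients[2, 4]`, which for the two education models tests only the first dummy (College vs. High School) rather than education as a whole. A model-level F test covers all k-1 dummies at once. A hedged sketch on simulated data (not `ehi1.csv`):

```r
set.seed(2)
edu <- factor(sample(c("HS", "College", "Undergrad", "Grad"), 150, replace = TRUE))
y   <- rnorm(150) + 0.4 * (edu == "Grad")

fit <- lm(y ~ edu)

# Overall F statistic and its p-value, taken from the model summary:
fs <- summary(fit)$fstatistic  # named vector: value, numdf, dendf
pf(fs["value"], fs["numdf"], fs["dendf"], lower.tail = FALSE)

# Equivalent nested-model comparison against the intercept-only model:
anova(lm(y ~ 1), fit)
```

For the single-predictor sex and age models the two approaches coincide, so only the education rows of the comparison tables are affected.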
-
-```{r data-prep}
-#| label: data-prep
-#| echo: true
-#| output: true
-
-# Check for missing values
-missing_summary <- data %>%
-  select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  summarise_all(~sum(is.na(.)))
-
-cat("Missing values check:\n")
-print(missing_summary)
-cat("\n")
-
-# Remove rows with missing values
-data_clean <- data %>%
-  select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  filter(complete.cases(.))
-
-print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x ")))
-
-# Recode demo_sex as numeric for regression (0 = Female, 1 = Male)
-data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0)
-
-# Check demo_edu levels and create dummy variables
-print("Education levels:")
-edu_table <- table(data_clean$demo_edu)
-print(edu_table)
-print(paste("Number of education levels:", length(unique(data_clean$demo_edu))))
-
-# Create dummy variables for education (k-1 coding: 7 levels = 6 dummy variables)
-# Using High School as reference category (excluded dummy)
-data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0)
-data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0)
-data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0)
-
-# Check what other education levels exist and create additional dummies
-edu_levels <- unique(data_clean$demo_edu)
-print("All education levels found:")
-for (i in seq_along(edu_levels)) {
-  print(paste(i, ":", edu_levels[i]))
-}
-
-# Create additional dummy variables for other education levels
-# (You'll need to adjust these based on your actual data)
-# Example for additional levels - adjust names and conditions as needed:
-# data_clean$edu_other1 <- ifelse(data_clean$demo_edu == "Other Level 1", 1, 0)
-# data_clean$edu_other2 <- ifelse(data_clean$demo_edu == "Other Level 2", 1, 0)
-# data_clean$edu_other3 <- ifelse(data_clean$demo_edu == "Other Level 3", 1, 0)
-
-# Note: Once you identify all 7 levels, create 6 dummy variables total (k-1)
-
-# Verify dummy coding (cat() is used below: print() takes a single object)
-print("Sex recoding (0=Female, 1=Male):")
-print(table(data_clean$demo_sex_numeric))
-print("Education dummy variables (k-1 coding with High School as reference):")
-cat("High School (reference):", sum(data_clean$demo_edu == "High School (or equivalent)"), "\n")
-cat("College:", sum(data_clean$edu_college), "\n")
-cat("Undergraduate:", sum(data_clean$edu_undergrad), "\n")
-cat("Graduate:", sum(data_clean$edu_grad), "\n")
-```
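-
-The manual dummies above could equivalently be delegated to `lm()` via a releveled factor, which guarantees k-1 coding for however many levels exist. A minimal sketch (level label taken from the checks above; adjust if it differs):
-
-```{r edu-factor-example}
-#| label: edu-factor-example
-#| echo: true
-#| eval: false
-
-# Let lm() build the k-1 dummy coding, with High School as the reference
-data_clean$demo_edu_f <- relevel(factor(data_clean$demo_edu),
-                                 ref = "High School (or equivalent)")
-lm(eohiDGEN_mean ~ demo_edu_f, data = data_clean)
-```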
-
-## Regression Models
-
-```{r models}
-#| label: models
-#| echo: true
-#| output: true
-
-# Define the 6 regression models
-models <- list()
-
-# Model 1: demo_sex → eohiDGEN_mean
-models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 2: demo_age_1 → eohiDGEN_mean (data_clean keeps the age column as demo_age_1)
-models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1, data = data_clean)
-
-# Model 3: education dummies → eohiDGEN_mean (k-1 coding, HS as reference)
-models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-
-# Model 4: demo_sex → ehi_global_mean
-models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 5: demo_age_1 → ehi_global_mean
-models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1, data = data_clean)
-
-# Model 6: education dummies → ehi_global_mean (k-1 coding, HS as reference)
-models$edu_ehi_global <- lm(ehi_global_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-```
-
-## Assumption Checking Functions
-
-```{r functions}
-#| label: functions
-#| echo: true
-
-# Function to check linearity assumption
-check_linearity <- function(model, model_name) {
-  print(paste("=== LINEARITY CHECK:", model_name, "==="))
-
-  # Residuals vs Fitted plot
-  plot(model, which = 1, main = paste("Linearity:", model_name))
-
-  # Component + residual plot (partial residual plot)
-  crPlots(model, main = paste("Component+Residual Plot:", model_name))
-
-  return(NULL)
-}
-
-# Function to check normality of residuals
-check_normality <- function(model, model_name) {
-  print(paste("=== NORMALITY CHECK:", model_name, "==="))
-
-  # Q-Q plot
-  plot(model, which = 2, main = paste("Q-Q Plot:", model_name))
-
-  # Shapiro-Wilk test
-  residuals <- residuals(model)
-  shapiro_test <- shapiro.test(residuals)
-  print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5)))
-
-  # Kolmogorov-Smirnov test (approximate here: mean and sd are estimated from the residuals)
-  ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals))
-  print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5)))
-
-  # Histogram of residuals
-  hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) +
-    geom_histogram(bins = 30, fill = "lightblue", color = "black") +
-    ggtitle(paste("Residuals Histogram:", model_name)) +
-    theme_minimal()
-  print(hist_plot)
-
-  return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value))
-}
-
-# Function to check homoscedasticity (constant variance)
-check_homoscedasticity <- function(model, model_name) {
-  print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "==="))
-
-  # Scale-Location plot
-  plot(model, which = 3, main = paste("Scale-Location Plot:", model_name))
-
-  # Breusch-Pagan test
-  bp_test <- bptest(model)
-  print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5)))
-
-  # White test (if available)
-  tryCatch({
-    white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2))
-    print(paste("White test p-value:", format(white_test$p.value, digits = 5)))
-  }, error = function(e) {
-    print("White test not available for this model")
-  })
-
-  return(list(bp_p = bp_test$p.value))
-}
-
-# Function to check independence (no autocorrelation)
-check_independence <- function(model, model_name) {
-  print(paste("=== INDEPENDENCE CHECK:", model_name, "==="))
-
-  # Durbin-Watson test
-  dw_test <- durbinWatsonTest(model)
-  print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5)))
-  print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5)))
-
-  # Residuals vs Order plot
-  residuals_vs_order <- ggplot(data.frame(
-    residuals = residuals(model),
-    order = seq_along(residuals(model))
-  ), aes(x = order, y = residuals)) +
-    geom_point(color = "black") +
-    geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
-    ggtitle(paste("Residuals vs Order:", model_name)) +
-    theme_minimal()
-  print(residuals_vs_order)
-
-  return(list(dw_stat = dw_test$dw, dw_p = dw_test$p))
-}
-
-# Function to check for influential observations
-check_influence <- function(model, model_name) {
-  print(paste("=== INFLUENCE CHECK:", model_name, "==="))
-
-  # Cook's Distance plot
-  plot(model, which = 4, main = paste("Cook's Distance:", model_name))
-
-  # Calculate influence measures
-  cooks_d <- cooks.distance(model)
-  leverage <- hatvalues(model)
-  dffits_val <- dffits(model)
-
-  # Identify influential observations
-  cooks_threshold <- 4/length(cooks_d)                           # Cook's D threshold
-  leverage_threshold <- 2 * (length(coef(model))/nobs(model))    # Leverage threshold
-  dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model))  # DFFITS threshold
-
-  influential_cooks <- which(cooks_d > cooks_threshold)
-  influential_leverage <- which(leverage > leverage_threshold)
-  influential_dffits <- which(abs(dffits_val) > dffits_threshold)
-
-  print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5)))
-  print(paste("Influential observations (Cook's D):", length(influential_cooks)))
-  print(paste("Leverage threshold:", format(leverage_threshold, digits = 5)))
-  print(paste("High leverage observations:", length(influential_leverage)))
-  print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5)))
-  print(paste("Influential observations (DFFITS):", length(influential_dffits)))
-
-  if (length(influential_cooks) > 0) {
-    print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", ")))
-  }
-  if (length(influential_leverage) > 0) {
-    print(paste("High leverage cases:", paste(influential_leverage, collapse = ", ")))
-  }
-  if (length(influential_dffits) > 0) {
-    print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", ")))
-  }
-
-  return(list(influential_cooks = influential_cooks,
-              influential_leverage = influential_leverage,
-              influential_dffits = influential_dffits))
-}
-
-# Function to get comprehensive model summary
-get_model_summary <- function(model, model_name) {
-  print(paste("=== MODEL SUMMARY:", model_name, "==="))
-
-  # Basic model summary
-  summary_model <- summary(model)
-  print(summary_model)
-
-  # R-squared and adjusted R-squared
-  print(paste("R-squared:", format(summary_model$r.squared, digits = 5)))
-  print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5)))
-
-  # AIC and BIC
-  aic_val <- AIC(model)
-  bic_val <- BIC(model)
-  print(paste("AIC:", format(aic_val, digits = 5)))
-  print(paste("BIC:", format(bic_val, digits = 5)))
-
-  return(list(summary = summary_model, r_squared = summary_model$r.squared,
-              adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val))
-}
-```
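-
-Since `performance` is already loaded, it is worth noting its one-call alternative to the helpers above: `check_model()` draws the main diagnostic panels (linearity, homogeneity, normality, influence) at once. Its plots need the `see` package, which is commented out in the setup chunk, so this sketch is not evaluated:
-
-```{r check-model-example}
-#| label: check-model-example
-#| echo: true
-#| eval: false
-
-# Combined diagnostic panel from the performance package (plotting requires see)
-check_model(models$sex_eohiDGEN)
-```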
-
-## Model 1: Sex → EOHI-DGEN Mean
-
-```{r model1}
-#| label: model1
-#| echo: true
-
-model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-```
-
-## Model 2: Age → EOHI-DGEN Mean
-
-```{r model2}
-#| label: model2
-#| echo: true
-
-model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN")
-model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN")
-model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN")
-model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN")
-model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN")
-model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN")
-```
-
-## Model 3: Education → EOHI-DGEN Mean
-
-```{r model3}
-#| label: model3
-#| echo: true
-
-model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-```
-
-## Model 4: Sex → EHI-Global Mean
-
-```{r model4}
-#| label: model4
-#| echo: true
-
-model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global")
-model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global")
-model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global")
-model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global")
-model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global")
-model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global")
-```
-
-## Model 5: Age → EHI-Global Mean
-
-```{r model5}
-#| label: model5
-#| echo: true
-
-model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global")
-model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global")
-model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global")
-model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global")
-model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global")
-model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global")
-```
-
-## Model 6: Education → EHI-Global Mean
-
-```{r model6}
-#| label: model6
-#| echo: true
-
-model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global")
-model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global")
-model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global")
-model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global")
-model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global")
-model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global")
-```
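-
-The six chunks above repeat the same five checks; a sketch of how the same calls could instead be driven from the `models` list, collecting results in one named list. This is only an alternative pattern, not how the summary tables below are wired (they still read the numbered model1_..., model6_... objects):
-
-```{r loop-sketch}
-#| label: loop-sketch
-#| echo: true
-#| eval: false
-
-# Run every check for every model, keyed by model name
-all_checks <- lapply(names(models), function(nm) {
-  m <- models[[nm]]
-  list(summary      = get_model_summary(m, nm),
-       normality    = check_normality(m, nm),
-       homosced     = check_homoscedasticity(m, nm),
-       independence = check_independence(m, nm),
-       influence    = check_influence(m, nm))
-})
-names(all_checks) <- names(models)
-```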
-
-## Summary Tables
-
-### Assumption Violation Summary
-
-```{r violation-summary}
-#| label: violation-summary
-#| echo: true
-
-# Create summary table
-violation_summary <- data.frame(
-  Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN",
-            "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"),
-  Normality = c(
-    ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK")
-  ),
-  Homoscedasticity = c(
-    ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK")
-  ),
-  Independence = c(
-    ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK")
-  ),
-  Influential_Obs = c(
-    ifelse(length(model1_influence$influential_cooks) > 0,
-           paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model2_influence$influential_cooks) > 0,
-           paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model3_influence$influential_cooks) > 0,
-           paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model4_influence$influential_cooks) > 0,
-           paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model5_influence$influential_cooks) > 0,
-           paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model6_influence$influential_cooks) > 0,
-           paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO")
-  ),
-  stringsAsFactors = FALSE
-)
-
-print(violation_summary)
-```
-
-### Model Comparison Summary
-
-```{r comparison-summary}
-#| label: comparison-summary
-#| echo: true
-
-# Create model comparison table
-# Note: Significant flags the p-value of the first predictor only; for the two
-# education models use the joint F-test (summary(model)$fstatistic) instead.
-comparison_table <- data.frame(
-  Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN",
-            "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"),
-  R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared,
-                model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared),
-  Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared,
-                    model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared),
-  AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic,
-          model4_summary$aic, model5_summary$aic, model6_summary$aic),
-  BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic,
-          model4_summary$bic, model5_summary$bic, model6_summary$bic),
-  Significant = c(
-    ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO")
-  ),
-  stringsAsFactors = FALSE
-)
-
-print(comparison_table)
-```
-
-## Recommendations
-
-### For Assumption Violations:
-
-**1. Normality Violations:**
-- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox)
-- Alternative: Use robust regression methods or bootstrapping
-
-**2. Homoscedasticity Violations:**
-- If violated: Use weighted least squares or robust standard errors (see the sketch after this list)
-- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors
-
-**3. Independence Violations:**
-- If violated: Check for clustering or repeated measures structure
-- Alternative: Use mixed-effects models or clustered standard errors
-
-**4. Influential Observations:**
-- If present: Examine these cases for data entry errors
-- Consider: Running analysis with and without influential cases
-- Alternative: Use robust regression methods
-
-**5. Linearity Violations:**
-- If violated: Add polynomial terms or use splines
-- Alternative: Transform predictors or use non-parametric methods
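-
-For the heteroscedasticity remedy, robust (HC) standard errors can be obtained without refitting the models. A minimal sketch, assuming the `sandwich` package is installed (`coeftest()` comes from the already-loaded `lmtest`):
-
-```{r robust-se-example}
-#| label: robust-se-example
-#| echo: true
-#| eval: false
-
-# Coefficient tests with HC3 heteroscedasticity-consistent standard errors
-library(sandwich)
-coeftest(models$sex_eohiDGEN, vcov = vcovHC(models$sex_eohiDGEN, type = "HC3"))
-```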
-
----
-
-*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.*
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016154910.qmd b/.history/eohi1/regressions e1 - assumptions_20251016154910.qmd
deleted file mode 100644
index ab9db88..0000000
--- a/.history/eohi1/regressions e1 - assumptions_20251016154910.qmd
+++ /dev/null
@@ -1,504 +0,0 @@
----
-title: "Regression Analysis - Assumption Checking"
-subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean"
-author: "Irina"
-date: today
-format:
-  html:
-    theme: cosmo
-    toc: true
-    toc-depth: 3
-    code-fold: false
-    code-tools: true
-    fig-width: 8
-    fig-height: 6
-execute:
-  echo: true
-  warning: false
-  message: false
-  eval: true
-  output: true
-  results: 'markup'
-  cache: false
----
-
-## Setup and Data Preparation
-
-```{r setup}
-#| label: setup
-#| echo: true
-#| output: true
-#| results: 'markup'
-#| eval: true
-
-# Load required libraries
-library(dplyr)       # Must load first for %>% operator
-library(ggplot2)     # Load ggplot2 first to avoid version conflicts
-library(car)         # For crPlots and durbinWatsonTest
-library(performance)
-# library(see)       # Temporarily commented out due to version conflict
-library(gridExtra)
-library(lmtest)      # For bptest; durbinWatsonTest comes from car, not lmtest
-
-# Set options
-options(scipen = 999)
-
-# Set working directory and load data
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-data <- read.csv("ehi1.csv")
-
-print("test message")
-
-# Test chunk execution
-cat("This should appear when you run this chunk\n")
-print(paste("Current time:", Sys.time()))
-cat("If you see this output below this chunk, Quarto is working correctly!\n")
-```
-
-```{r data-prep}
-#| label: data-prep
-#| echo: true
-#| output: true
-
-# Check for missing values
-missing_summary <- data %>%
-  select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  summarise_all(~sum(is.na(.)))
-
-cat("Missing values check:\n")
-print(missing_summary)
-cat("\n")
-
-# Remove rows with missing values
-data_clean <- data %>%
-  select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  filter(complete.cases(.))
-
-print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x ")))
-
-# Recode demo_sex as numeric for regression (0 = Female, 1 = Male)
-data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0)
-
-# Check demo_edu levels and create dummy variables
-print("Education levels:")
-edu_table <- table(data_clean$demo_edu)
-print(edu_table)
-print(paste("Number of education levels:", length(unique(data_clean$demo_edu))))
-
-# Create dummy variables for education (k-1 coding: 7 levels = 6 dummy variables)
-# Using High School as reference category (excluded dummy)
-data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0)
-data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0)
-data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0)
-
-# Check what other education levels exist and create additional dummies
-edu_levels <- unique(data_clean$demo_edu)
-print("All education levels found:")
-for (i in seq_along(edu_levels)) {
-  print(paste(i, ":", edu_levels[i]))
-}
-
-# Create additional dummy variables for other education levels
-# (You'll need to adjust these based on your actual data)
-# Example for additional levels - adjust names and conditions as needed:
-# data_clean$edu_other1 <- ifelse(data_clean$demo_edu == "Other Level 1", 1, 0)
-# data_clean$edu_other2 <- ifelse(data_clean$demo_edu == "Other Level 2", 1, 0)
-# data_clean$edu_other3 <- ifelse(data_clean$demo_edu == "Other Level 3", 1, 0)
-
-# Note: Once you identify all 7 levels, create 6 dummy variables total (k-1)
-
-# Verify dummy coding (cat() is used below: print() takes a single object)
-print("Sex recoding (0=Female, 1=Male):")
-print(table(data_clean$demo_sex_numeric))
-print("Education dummy variables (k-1 coding with High School as reference):")
-cat("High School (reference):", sum(data_clean$demo_edu == "High School (or equivalent)"), "\n")
-cat("College:", sum(data_clean$edu_college), "\n")
-cat("Undergraduate:", sum(data_clean$edu_undergrad), "\n")
-cat("Graduate:", sum(data_clean$edu_grad), "\n")
-```
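-
-A quick sanity check that every education level is either the reference or has a dummy; the `covered` vector below restates the labels used above and is ours to adjust if the data differ:
-
-```{r edu-coverage-check}
-#| label: edu-coverage-check
-#| echo: true
-#| eval: false
-
-# Any label returned here still needs a dummy (or is a typo in covered)
-covered <- c("High School (or equivalent)", "College Diploma/Certificate",
-             "University - Undergraduate", "University - Graduate")
-setdiff(unique(data_clean$demo_edu), covered)
-```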
<- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > 
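-  # 4/n is a common rule-of-thumb cutoff for Cook's distance; cases above it
-  # merit inspection rather than automatic removal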
cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` - -## Model 1: Sex → EOHI-DGEN Mean - -```{r model1} -#| label: model1 -#| echo: true - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- 
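-# (check_influence returns the indices flagged by Cook's D, leverage, and DFFITS)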
check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean - -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") -``` - -## Model 6: Education → EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## Summary Tables - -### Assumption Violation Summary - -```{r violation-summary} -#| label: violation-summary -#| echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", 
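-      # the "YES (n)" label counts the cases flagged by the Cook's distance cutoff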
length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model Comparison Summary - -```{r comparison-summary} -#| label: comparison-summary -#| echo: true - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) -``` - -## Recommendations - -### For Assumption Violations: - -**1. Normality Violations:** -- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox) -- Alternative: Use robust regression methods or bootstrapping - -**2. Homoscedasticity Violations:** -- If violated: Use weighted least squares or robust standard errors -- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors - -**3. Independence Violations:** -- If violated: Check for clustering or repeated measures structure -- Alternative: Use mixed-effects models or clustered standard errors - -**4. Influential Observations:** -- If present: Examine these cases for data entry errors -- Consider: Running analysis with and without influential cases -- Alternative: Use robust regression methods - -**5. 
Linearity Violations:**
-- If violated: Add polynomial terms or use splines
-- Alternative: Transform predictors or use non-parametric methods
-
----
-
-*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.*
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016154925.qmd b/.history/eohi1/regressions e1 - assumptions_20251016154925.qmd
deleted file mode 100644
index 3735fd4..0000000
--- a/.history/eohi1/regressions e1 - assumptions_20251016154925.qmd
+++ /dev/null
@@ -1,516 +0,0 @@
----
-title: "Regression Analysis - Assumption Checking"
-subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean"
-author: "Irina"
-date: today
-format:
-  html:
-    theme: cosmo
-    toc: true
-    toc-depth: 3
-    code-fold: false
-    code-tools: true
-    fig-width: 8
-    fig-height: 6
-execute:
-  echo: true
-  warning: false
-  message: false
-  eval: true
-  output: true
-  results: 'markup'
-  cache: false
----
-
-## Setup and Data Preparation
-
-```{r simple-test}
-#| echo: true
-#| output: true
-#| results: 'markup'
-
-# Simple test to verify Quarto is working
-cat("=== QUARTO TEST ===\n")
-print("If you see this, Quarto is working!")
-2 + 2
-cat("=== END TEST ===\n")
-```
-
-```{r setup}
-#| label: setup
-#| echo: true
-#| output: true
-#| results: 'markup'
-#| eval: true
-
-# Load required libraries
-library(dplyr)    # Must load first for %>% operator
-library(ggplot2)  # Load ggplot2 first to avoid version conflicts
-library(car)
-library(performance)
-# library(see)  # Temporarily commented out due to version conflict
-library(gridExtra)
-library(lmtest)   # For bptest; durbinWatsonTest comes from car
-
-# Set options
-options(scipen = 999)
-
-# Set working directory and load data
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-data <- read.csv("ehi1.csv")
-
-print("test message")
-
-# Test chunk execution
-cat("This should appear when you run this chunk\n")
-print(paste("Current time:", Sys.time()))
-cat("If you see this output below this chunk, Quarto is working correctly!\n")
-```
-
-```{r data-prep}
-#| label: data-prep
-#| echo: true
-#| output: true
-
-# Check for missing values
-missing_summary <- data %>%
-  select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  summarise_all(~sum(is.na(.)))
-
-cat("Missing values check:\n")
-print(missing_summary)
-cat("\n")
-
-# Remove rows with missing values
-data_clean <- data %>%
-  select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  filter(complete.cases(.))
-
-print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x ")))
-
-# Recode demo_sex as numeric for regression (0 = Female, 1 = Male)
-data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0)
-
-# Check demo_edu levels and create dummy variables
-print("Education levels:")
-edu_table <- table(data_clean$demo_edu)
-print(edu_table)
-print(paste("Number of education levels:", length(unique(data_clean$demo_edu))))
-
-# Create dummy variables for education (k-1 coding: 7 levels = 6 dummy variables)
-# Using High School as reference category (excluded dummy)
-data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0)
-data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0)
-data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0)
-
-# Check what other education levels exist and create additional dummies
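-# Any level not given its own dummy below is pooled into the High School reference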
-edu_levels <- unique(data_clean$demo_edu)
-print("All education levels found:")
-for (i in seq_along(edu_levels)) {
-  print(paste(i, ":", edu_levels[i]))
-}
-
-# Create additional dummy variables for other education levels
-# (You'll need to adjust these based on your actual data)
-# Example for additional levels - adjust names and conditions as needed:
-# data_clean$edu_other1 <- ifelse(data_clean$demo_edu == "Other Level 1", 1, 0)
-# data_clean$edu_other2 <- ifelse(data_clean$demo_edu == "Other Level 2", 1, 0)
-# data_clean$edu_other3 <- ifelse(data_clean$demo_edu == "Other Level 3", 1, 0)
-
-# Note: Once you identify all 7 levels, create 6 dummy variables total (k-1)
-
-# Verify dummy coding
-print("Sex recoding (0=Female, 1=Male):")
-print(table(data_clean$demo_sex_numeric))
-print("Education dummy variables (k-1 coding with High School as reference):")
-cat("High School (reference):", sum(data_clean$demo_edu == "High School (or equivalent)"), "\n")
-cat("College:", sum(data_clean$edu_college), "\n")
-cat("Undergraduate:", sum(data_clean$edu_undergrad), "\n")
-cat("Graduate:", sum(data_clean$edu_grad), "\n")
-```
-
-## Regression Models
-
-```{r models}
-#| label: models
-#| echo: true
-#| output: true
-
-# Define the 6 regression models
-models <- list()
-
-# Model 1: demo_sex → eohiDGEN_mean
-models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 2: age (demo_age_1 is the age column kept in data_clean) → eohiDGEN_mean
-models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1, data = data_clean)
-
-# Model 3: education dummies → eohiDGEN_mean (k-1 coding, HS as reference)
-models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-
-# Model 4: demo_sex → ehi_global_mean
-models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 5: age (demo_age_1) → ehi_global_mean
-models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1, data = data_clean)
-
-# Model 6: education dummies → ehi_global_mean (k-1 coding, HS as reference)
-models$edu_ehi_global <- lm(ehi_global_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-```
-
-## Assumption Checking Functions
-
-```{r functions}
-#| label: functions
-#| echo: true
-
-# Function to check linearity assumption
-check_linearity <- function(model, model_name) {
-  print(paste("=== LINEARITY CHECK:", model_name, "==="))
-
-  # Residuals vs Fitted plot
-  plot(model, which = 1, main = paste("Linearity:", model_name))
-
-  # Component + residual plot (partial residual plot)
-  crPlots(model, main = paste("Component+Residual Plot:", model_name))
-
-  return(NULL)
-}
-
-# Function to check normality of residuals
-check_normality <- function(model, model_name) {
-  print(paste("=== NORMALITY CHECK:", model_name, "==="))
-
-  # Q-Q plot
-  plot(model, which = 2, main = paste("Q-Q Plot:", model_name))
-
-  # Shapiro-Wilk test
-  residuals <- residuals(model)
-  shapiro_test <- shapiro.test(residuals)
-  print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5)))
-
-  # Kolmogorov-Smirnov test
-  ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals))
-  print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5)))
-
-  # Histogram of residuals
-  hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) +
-    geom_histogram(bins = 30, fill = "lightblue", color = "black") +
-    ggtitle(paste("Residuals Histogram:", model_name)) +
-    theme_minimal()
-  print(hist_plot)
-
-  return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value))
-}
-
-# Function to check
homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - 
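-  # model_name is used only to label the printed output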
- # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` - -## Model 1: Sex → EOHI-DGEN Mean - -```{r model1} -#| label: model1 -#| echo: true - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean - -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") -``` - -## Model 6: Education 
→ EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## Summary Tables - -### Assumption Violation Summary - -```{r violation-summary} -#| label: violation-summary -#| echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model Comparison Summary - -```{r comparison-summary} -#| label: comparison-summary -#| echo: true - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = 
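-    # adjusted R-squared penalizes the extra education dummies, so it compares
-    # the one-predictor and three-predictor models on a more even footing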
c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) -``` - -## Recommendations - -### For Assumption Violations: - -**1. Normality Violations:** -- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox) -- Alternative: Use robust regression methods or bootstrapping - -**2. Homoscedasticity Violations:** -- If violated: Use weighted least squares or robust standard errors -- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors - -**3. Independence Violations:** -- If violated: Check for clustering or repeated measures structure -- Alternative: Use mixed-effects models or clustered standard errors - -**4. Influential Observations:** -- If present: Examine these cases for data entry errors -- Consider: Running analysis with and without influential cases -- Alternative: Use robust regression methods - -**5. 
Linearity Violations:**
-- If violated: Add polynomial terms or use splines
-- Alternative: Transform predictors or use non-parametric methods
-
----
-
-*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.*
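The recommendations above repeatedly point to heteroscedasticity-consistent standard errors without showing the mechanics. A minimal sketch of that remedy, assuming the `sandwich` package is installed (the listings above load only `lmtest`) and that the `models` chunk has been run:

```r
library(lmtest)    # coeftest()
library(sandwich)  # vcovHC(); extra dependency, not loaded in the listings above

# Re-test the sex coefficient with HC3 (heteroscedasticity-consistent) errors;
# the point estimates are unchanged, only the standard errors and p-values move.
coeftest(models$sex_eohiDGEN, vcov = vcovHC(models$sex_eohiDGEN, type = "HC3"))
```

Separately, the `Significant` column in the comparison table reads `coefficients[2, 4]`, which for the education models is the t-test of the first dummy (`edu_college`) alone, not of education as a whole. A joint F-test is the usual check; a sketch, again assuming the objects from the listings exist:

```r
# Compare the education model against an intercept-only baseline;
# the F-test asks whether the three dummies jointly improve fit.
reduced <- lm(eohiDGEN_mean ~ 1, data = data_clean)
anova(reduced, models$edu_eohiDGEN)
```

Because the dummies are the only predictors in that model, the same overall F statistic is also reported by `summary(models$edu_eohiDGEN)`.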
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016154946.qmd b/.history/eohi1/regressions e1 - assumptions_20251016154946.qmd
deleted file mode 100644
index 3735fd4..0000000
--- a/.history/eohi1/regressions e1 - assumptions_20251016154946.qmd
+++ /dev/null
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016155002.qmd b/.history/eohi1/regressions e1 - assumptions_20251016155002.qmd
deleted file mode 100644
index 3735fd4..0000000
--- a/.history/eohi1/regressions e1 - assumptions_20251016155002.qmd
+++ /dev/null
@@ -1,516 +0,0 @@
----
-title: "Regression Analysis - Assumption Checking"
-subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean"
-author: "Irina"
-date: today
-format:
-  html:
-    theme: cosmo
-    toc: true
-    toc-depth: 3
-    code-fold: false
-    code-tools: true
-    fig-width: 8
-    fig-height: 6
-execute:
-  echo: true
-  warning: false
-  message: false
-  eval: true
-  output: true
-  results: 'markup'
-  cache: false
----
-
-## Setup and Data Preparation
-
-```{r simple-test}
-#| echo: true
-#| output: true
-#| results: 'markup'
-
-# Simple test to verify Quarto is working
-cat("=== QUARTO TEST ===\n")
-print("If you see this, Quarto is working!")
-2 + 2
-cat("=== END TEST ===\n")
-```
-
-```{r setup}
-#| label: setup
-#| echo: true
-#| output: true
-#| results: 'markup'
-#| eval: true
-
-# Load required libraries
-library(dplyr)    # Must load first for %>% operator
-library(ggplot2)  # Load ggplot2 first to avoid version conflicts
-library(car)
-library(performance)
-# library(see)  # Temporarily commented out due to version conflict
-library(gridExtra)
-library(lmtest)   # For bptest; durbinWatsonTest comes from car
-
-# Set options
-options(scipen = 999)
-
-# Set working directory and load data
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-data <- read.csv("ehi1.csv")
-
-print("test message")
-
-# Test chunk execution
-cat("This should appear when you run this chunk\n")
-print(paste("Current time:", Sys.time()))
-cat("If you see this output below this chunk, Quarto is working correctly!\n")
-```
-
-```{r data-prep}
-#| label: data-prep
-#| echo: true
-#| output: true
-
-# Check for missing values
-missing_summary <- data %>%
-  select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  summarise_all(~sum(is.na(.)))
-
-cat("Missing values check:\n")
-print(missing_summary)
-cat("\n")
-
-# Remove rows with missing values
-data_clean <- data %>%
-  select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  filter(complete.cases(.))
-
-print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x ")))
-
-# Recode demo_sex as numeric for regression (0 = Female, 1 = Male)
-data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0)
-
-# Check demo_edu levels and create dummy variables
-print("Education levels:")
-edu_table <- table(data_clean$demo_edu)
-print(edu_table)
-print(paste("Number of education levels:", length(unique(data_clean$demo_edu))))
-
-# Create dummy variables for education (k-1 coding: 7 levels = 6 dummy variables)
-# Using High School as reference category (excluded dummy)
-data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0)
-data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0)
-data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0)
-
-# Check what other education levels exist and create additional dummies
-edu_levels <- unique(data_clean$demo_edu)
-print("All education levels found:")
-for (i in seq_along(edu_levels)) {
-  print(paste(i, ":", edu_levels[i]))
-}
-
-# Create additional dummy variables for other education levels
-# (You'll need to adjust these based on your actual data)
-# Example for additional levels - adjust names and conditions as needed:
-# data_clean$edu_other1 <- ifelse(data_clean$demo_edu == "Other Level 1", 1, 0)
-# data_clean$edu_other2 <- ifelse(data_clean$demo_edu == "Other Level 2", 1, 0)
-# data_clean$edu_other3 <- ifelse(data_clean$demo_edu == "Other Level 3", 1, 0)
-
-# Note: Once you identify all 7 levels, create 6 dummy variables total (k-1)
-
-# Verify dummy coding
-print("Sex recoding (0=Female, 1=Male):")
-print(table(data_clean$demo_sex_numeric))
-print("Education dummy variables (k-1 coding with High School as reference):")
-cat("High School (reference):", sum(data_clean$demo_edu == "High School (or equivalent)"), "\n")
-cat("College:", sum(data_clean$edu_college), "\n")
-cat("Undergraduate:", sum(data_clean$edu_undergrad), "\n")
-cat("Graduate:", sum(data_clean$edu_grad), "\n")
-```
-
-## Regression Models
-
-```{r models}
-#| label: models
-#| echo: true
-#| output: true
-
-# Define the 6 regression models
-models <- list()
-
-# Model 1: demo_sex → eohiDGEN_mean
-models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 2: age (demo_age_1 is the age column kept in data_clean) → eohiDGEN_mean
-models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1, data = data_clean)
-
-# Model 3: education dummies → eohiDGEN_mean (k-1 coding, HS as reference)
-models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-
-# Model 4: demo_sex → ehi_global_mean
-models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 5: age (demo_age_1) → ehi_global_mean
-models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1, data = data_clean)
-
-# Model 6: education dummies → ehi_global_mean (k-1 coding, HS as reference)
-models$edu_ehi_global <- lm(ehi_global_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-```
-
-## Assumption Checking Functions
-
-```{r functions}
-#| label: functions
-#| echo: true
-
-# Function to check linearity assumption
-check_linearity <- function(model, model_name) {
-  print(paste("=== LINEARITY CHECK:", model_name, "==="))
-
-  # Residuals vs Fitted plot
-  plot(model, which = 1, main = paste("Linearity:", model_name))
-
-  # Component + residual plot (partial residual plot)
-  crPlots(model, main = paste("Component+Residual Plot:", model_name))
-
-  return(NULL)
-}
-
-# Function to check normality of residuals
-check_normality <- function(model, model_name) {
-  print(paste("=== NORMALITY CHECK:", model_name, "==="))
-
-  # Q-Q plot
-  plot(model, which = 2, main = paste("Q-Q Plot:", model_name))
-
-  # Shapiro-Wilk test
-  residuals <- residuals(model)
-  shapiro_test <- shapiro.test(residuals)
-  print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5)))
-
-  # Kolmogorov-Smirnov test
-  ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals))
-  print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5)))
-
-  # Histogram of residuals
-  hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) +
-    geom_histogram(bins = 30, fill = "lightblue", color = "black") +
-    ggtitle(paste("Residuals Histogram:", model_name)) +
-    theme_minimal()
-  print(hist_plot)
-
-  return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value))
-}
-
-# Function to check
homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - 
- # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` - -## Model 1: Sex → EOHI-DGEN Mean - -```{r model1} -#| label: model1 -#| echo: true - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean - -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") -``` - -## Model 6: Education 
→ EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## Summary Tables - -### Assumption Violation Summary - -```{r violation-summary} -#| label: violation-summary -#| echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model Comparison Summary - -```{r comparison-summary} -#| label: comparison-summary -#| echo: true - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = 
c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared,
-                    model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared),
-  AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic,
-          model4_summary$aic, model5_summary$aic, model6_summary$aic),
-  BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic,
-          model4_summary$bic, model5_summary$bic, model6_summary$bic),
-  # Overall F-test p-value per model; the original coefficients[2, 4] lookup
-  # tested only the first slope, which is wrong for the education models
-  # with three dummy predictors
-  Significant = sapply(list(model1_summary, model2_summary, model3_summary,
-                            model4_summary, model5_summary, model6_summary),
-                       function(m) {
-                         f <- m$summary$fstatistic
-                         p <- unname(pf(f[1], f[2], f[3], lower.tail = FALSE))
-                         if (p < 0.05) "YES" else "NO"
-                       }),
-  stringsAsFactors = FALSE
-)
-
-print(comparison_table)
-```
-
-## Recommendations
-
-### For Assumption Violations:
-
-**1. Normality Violations:**
-- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox)
-- Alternative: Use robust regression methods or bootstrapping
-
-**2. Homoscedasticity Violations:**
-- If violated: Use weighted least squares or robust standard errors
-- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors
-
-**3. Independence Violations:**
-- If violated: Check for clustering or repeated-measures structure
-- Alternative: Use mixed-effects models or clustered standard errors
-
-**4. Influential Observations:**
-- If present: Examine these cases for data entry errors
-- Consider: Running the analysis with and without influential cases
-- Alternative: Use robust regression methods
-
-**5. 
Linearity Violations:** -- If violated: Add polynomial terms or use splines -- Alternative: Transform predictors or use non-parametric methods - ---- - -*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.* diff --git a/.history/eohi1/regressions e1 - assumptions_20251016155036.qmd b/.history/eohi1/regressions e1 - assumptions_20251016155036.qmd deleted file mode 100644 index 768a4ae..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016155036.qmd +++ /dev/null @@ -1,518 +0,0 @@ ---- -title: "Regression Analysis - Assumption Checking" -subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean" -author: "Irina" -date: today -format: - html: - theme: cosmo - toc: true - toc-depth: 3 - code-fold: false - code-tools: true - fig-width: 8 - fig-height: 6 -execute: - echo: true - warning: false - message: false - eval: true - output: true - results: 'markup' - cache: false ---- - -## Setup and Data Preparation - -```{r simple-test} -#| echo: true -#| output: true -#| results: 'markup' -#| eval: true -#| include: true - -# Simple test to verify Quarto is working -cat("=== QUARTO TEST ===\n") -print("If you see this, Quarto is working!") -2 + 2 -cat("=== END TEST ===\n") -``` - -```{r setup} -#| label: setup -#| echo: true -#| output: true -#| results: 'markup' -#| eval: true - -# Load required libraries -library(dplyr) # Must load first for %>% operator -library(ggplot2) # Load ggplot2 first to avoid version conflicts -library(car) -library(performance) -# library(see) # Temporarily commented out due to version conflict -library(gridExtra) -library(lmtest) # For bptest and durbinWatsonTest - -# Set options -options(scipen = 999) - -# Set working directory and load data -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -print("test message") - -# Test chunk execution -cat("This should appear when you run this chunk\n") -print(paste("Current time:", Sys.time())) -cat("If you see this output below this chunk, Quarto is working correctly!\n") -``` - -```{r data-prep} -#| label: data-prep -#| echo: true -#| output: true - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) - -cat("Missing values check:\n") -print(missing_summary) -cat("\n") - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and create dummy variables -print("Education levels:") -edu_table <- table(data_clean$demo_edu) -print(edu_table) -print(paste("Number of education levels:", length(unique(data_clean$demo_edu)))) - -# Create dummy variables for education (k-1 coding: 7 levels = 6 dummy variables) -# Using High School as reference category (excluded dummy) -data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0) -data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0) -data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0) - -# Check what other education levels exist and create additional 
dummies
-edu_levels <- unique(data_clean$demo_edu)
-print("All education levels found:")
-for (i in seq_along(edu_levels)) {
-  print(paste(i, ":", edu_levels[i]))
-}
-
-# Create additional dummy variables for other education levels
-# (You'll need to adjust these based on your actual data)
-# Example for additional levels - adjust names and conditions as needed:
-# data_clean$edu_other1 <- ifelse(data_clean$demo_edu == "Other Level 1", 1, 0)
-# data_clean$edu_other2 <- ifelse(data_clean$demo_edu == "Other Level 2", 1, 0)
-# data_clean$edu_other3 <- ifelse(data_clean$demo_edu == "Other Level 3", 1, 0)
-
-# Note: Once you identify all 7 levels, create 6 dummy variables total (k-1)
-
-# Verify dummy coding
-print("Sex recoding (0=Female, 1=Male):")
-print(table(data_clean$demo_sex_numeric))
-print("Education dummy variables (k-1 coding with High School as reference):")
-# cat() rather than print(): print() treats a second argument as `digits`,
-# so the counts were never actually displayed
-cat("High School (reference):", sum(data_clean$demo_edu == "High School (or equivalent)"), "\n")
-cat("College:", sum(data_clean$edu_college), "\n")
-cat("Undergraduate:", sum(data_clean$edu_undergrad), "\n")
-cat("Graduate:", sum(data_clean$edu_grad), "\n")
-```
-
-## Regression Models
-
-```{r models}
-#| label: models
-#| echo: true
-#| output: true
-
-# Define the 6 regression models
-models <- list()
-
-# Model 1: demo_sex → eohiDGEN_mean
-models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 2: age → eohiDGEN_mean (the age column kept in data_clean is demo_age_1,
-# not demo_age, so the formula must use demo_age_1)
-models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1, data = data_clean)
-
-# Model 3: education dummies → eohiDGEN_mean (k-1 coding, HS as reference)
-models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-
-# Model 4: demo_sex → ehi_global_mean
-models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 5: age → ehi_global_mean (demo_age_1, as above)
-models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1, data = data_clean)
-
-# Model 6: education dummies → ehi_global_mean (k-1 coding, HS as reference)
-models$edu_ehi_global <- lm(ehi_global_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-```
-
-## Assumption Checking Functions
-
-```{r functions}
-#| label: functions
-#| echo: true
-
-# Function to check linearity assumption
-check_linearity <- function(model, model_name) {
-  print(paste("=== LINEARITY CHECK:", model_name, "==="))
-
-  # Residuals vs Fitted plot
-  plot(model, which = 1, main = paste("Linearity:", model_name))
-
-  # Component + residual plot (partial residual plot)
-  crPlots(model, main = paste("Component+Residual Plot:", model_name))
-
-  return(NULL)
-}
-
-# Function to check normality of residuals
-check_normality <- function(model, model_name) {
-  print(paste("=== NORMALITY CHECK:", model_name, "==="))
-
-  # Q-Q plot
-  plot(model, which = 2, main = paste("Q-Q Plot:", model_name))
-
-  # Shapiro-Wilk test
-  residuals <- residuals(model)
-  shapiro_test <- shapiro.test(residuals)
-  print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5)))
-
-  # Kolmogorov-Smirnov test (null parameters are estimated from the same
-  # residuals, so this p-value is approximate)
-  ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals))
-  print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5)))
-
-  # Histogram of residuals
-  hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) +
-    geom_histogram(bins = 30, fill = "lightblue", color = "black") +
-    ggtitle(paste("Residuals Histogram:", model_name)) +
-    theme_minimal()
-  print(hist_plot)
-
-  return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value))
-}
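# Illustrative sketch (editorial addition, not part of the original analysis):
# the Recommendations section below suggests heteroscedasticity-consistent
# standard errors when the Breusch-Pagan test flags a violation. A minimal way
# to report them, assuming the `sandwich` package is installed (`lmtest` is
# already loaded in the setup chunk; `sandwich` is not):
check_robust_se <- function(model, model_name) {
  if (!requireNamespace("sandwich", quietly = TRUE)) {
    print("Package 'sandwich' not installed; skipping robust SEs")
    return(invisible(NULL))
  }
  print(paste("=== HC3 ROBUST SEs:", model_name, "==="))
  # coeftest() from lmtest, with an HC3 covariance matrix from sandwich
  print(lmtest::coeftest(model, vcov = sandwich::vcovHC(model, type = "HC3")))
}
# Example use: check_robust_se(models$sex_eohiDGEN, "Sex → EOHI-DGEN")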
- -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL 
SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` - -## Model 1: Sex → EOHI-DGEN Mean - -```{r model1} -#| label: model1 -#| echo: true - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean - -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → 
EHI-Global") -``` - -## Model 6: Education → EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## Summary Tables - -### Assumption Violation Summary - -```{r violation-summary} -#| label: violation-summary -#| echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model Comparison Summary - -```{r comparison-summary} -#| label: comparison-summary -#| echo: true - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, 
model6_summary$r_squared),
-  Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared,
-                    model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared),
-  AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic,
-          model4_summary$aic, model5_summary$aic, model6_summary$aic),
-  BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic,
-          model4_summary$bic, model5_summary$bic, model6_summary$bic),
-  # Overall F-test p-value per model; the original coefficients[2, 4] lookup
-  # tested only the first slope, which is wrong for the education models
-  # with three dummy predictors
-  Significant = sapply(list(model1_summary, model2_summary, model3_summary,
-                            model4_summary, model5_summary, model6_summary),
-                       function(m) {
-                         f <- m$summary$fstatistic
-                         p <- unname(pf(f[1], f[2], f[3], lower.tail = FALSE))
-                         if (p < 0.05) "YES" else "NO"
-                       }),
-  stringsAsFactors = FALSE
-)
-
-print(comparison_table)
-```
-
-## Recommendations
-
-### For Assumption Violations:
-
-**1. Normality Violations:**
-- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox)
-- Alternative: Use robust regression methods or bootstrapping
-
-**2. Homoscedasticity Violations:**
-- If violated: Use weighted least squares or robust standard errors
-- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors
-
-**3. Independence Violations:**
-- If violated: Check for clustering or repeated-measures structure
-- Alternative: Use mixed-effects models or clustered standard errors
-
-**4. Influential Observations:**
-- If present: Examine these cases for data entry errors
-- Consider: Running the analysis with and without influential cases
-- Alternative: Use robust regression methods
-
-**5. 
Linearity Violations:** -- If violated: Add polynomial terms or use splines -- Alternative: Transform predictors or use non-parametric methods - ---- - -*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.* diff --git a/.history/eohi1/regressions e1 - assumptions_20251016155057.qmd b/.history/eohi1/regressions e1 - assumptions_20251016155057.qmd deleted file mode 100644 index 768a4ae..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016155057.qmd +++ /dev/null @@ -1,518 +0,0 @@ ---- -title: "Regression Analysis - Assumption Checking" -subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean" -author: "Irina" -date: today -format: - html: - theme: cosmo - toc: true - toc-depth: 3 - code-fold: false - code-tools: true - fig-width: 8 - fig-height: 6 -execute: - echo: true - warning: false - message: false - eval: true - output: true - results: 'markup' - cache: false ---- - -## Setup and Data Preparation - -```{r simple-test} -#| echo: true -#| output: true -#| results: 'markup' -#| eval: true -#| include: true - -# Simple test to verify Quarto is working -cat("=== QUARTO TEST ===\n") -print("If you see this, Quarto is working!") -2 + 2 -cat("=== END TEST ===\n") -``` - -```{r setup} -#| label: setup -#| echo: true -#| output: true -#| results: 'markup' -#| eval: true - -# Load required libraries -library(dplyr) # Must load first for %>% operator -library(ggplot2) # Load ggplot2 first to avoid version conflicts -library(car) -library(performance) -# library(see) # Temporarily commented out due to version conflict -library(gridExtra) -library(lmtest) # For bptest and durbinWatsonTest - -# Set options -options(scipen = 999) - -# Set working directory and load data -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -print("test message") - -# Test chunk execution -cat("This should appear when you run this chunk\n") -print(paste("Current time:", Sys.time())) -cat("If you see this output below this chunk, Quarto is working correctly!\n") -``` - -```{r data-prep} -#| label: data-prep -#| echo: true -#| output: true - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) - -cat("Missing values check:\n") -print(missing_summary) -cat("\n") - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and create dummy variables -print("Education levels:") -edu_table <- table(data_clean$demo_edu) -print(edu_table) -print(paste("Number of education levels:", length(unique(data_clean$demo_edu)))) - -# Create dummy variables for education (k-1 coding: 7 levels = 6 dummy variables) -# Using High School as reference category (excluded dummy) -data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0) -data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0) -data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0) - -# Check what other education levels exist and create additional 
dummies
-edu_levels <- unique(data_clean$demo_edu)
-print("All education levels found:")
-for (i in seq_along(edu_levels)) {
-  print(paste(i, ":", edu_levels[i]))
-}
-
-# Create additional dummy variables for other education levels
-# (You'll need to adjust these based on your actual data)
-# Example for additional levels - adjust names and conditions as needed:
-# data_clean$edu_other1 <- ifelse(data_clean$demo_edu == "Other Level 1", 1, 0)
-# data_clean$edu_other2 <- ifelse(data_clean$demo_edu == "Other Level 2", 1, 0)
-# data_clean$edu_other3 <- ifelse(data_clean$demo_edu == "Other Level 3", 1, 0)
-
-# Note: Once you identify all 7 levels, create 6 dummy variables total (k-1)
-
-# Verify dummy coding
-print("Sex recoding (0=Female, 1=Male):")
-print(table(data_clean$demo_sex_numeric))
-print("Education dummy variables (k-1 coding with High School as reference):")
-# cat() rather than print(): print() treats a second argument as `digits`,
-# so the counts were never actually displayed
-cat("High School (reference):", sum(data_clean$demo_edu == "High School (or equivalent)"), "\n")
-cat("College:", sum(data_clean$edu_college), "\n")
-cat("Undergraduate:", sum(data_clean$edu_undergrad), "\n")
-cat("Graduate:", sum(data_clean$edu_grad), "\n")
-```
-
-## Regression Models
-
-```{r models}
-#| label: models
-#| echo: true
-#| output: true
-
-# Define the 6 regression models
-models <- list()
-
-# Model 1: demo_sex → eohiDGEN_mean
-models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 2: age → eohiDGEN_mean (the age column kept in data_clean is demo_age_1,
-# not demo_age, so the formula must use demo_age_1)
-models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1, data = data_clean)
-
-# Model 3: education dummies → eohiDGEN_mean (k-1 coding, HS as reference)
-models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-
-# Model 4: demo_sex → ehi_global_mean
-models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 5: age → ehi_global_mean (demo_age_1, as above)
-models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1, data = data_clean)
-
-# Model 6: education dummies → ehi_global_mean (k-1 coding, HS as reference)
-models$edu_ehi_global <- lm(ehi_global_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-```
-
-## Assumption Checking Functions
-
-```{r functions}
-#| label: functions
-#| echo: true
-
-# Function to check linearity assumption
-check_linearity <- function(model, model_name) {
-  print(paste("=== LINEARITY CHECK:", model_name, "==="))
-
-  # Residuals vs Fitted plot
-  plot(model, which = 1, main = paste("Linearity:", model_name))
-
-  # Component + residual plot (partial residual plot)
-  crPlots(model, main = paste("Component+Residual Plot:", model_name))
-
-  return(NULL)
-}
-
-# Function to check normality of residuals
-check_normality <- function(model, model_name) {
-  print(paste("=== NORMALITY CHECK:", model_name, "==="))
-
-  # Q-Q plot
-  plot(model, which = 2, main = paste("Q-Q Plot:", model_name))
-
-  # Shapiro-Wilk test
-  residuals <- residuals(model)
-  shapiro_test <- shapiro.test(residuals)
-  print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5)))
-
-  # Kolmogorov-Smirnov test (null parameters are estimated from the same
-  # residuals, so this p-value is approximate)
-  ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals))
-  print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5)))
-
-  # Histogram of residuals
-  hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) +
-    geom_histogram(bins = 30, fill = "lightblue", color = "black") +
-    ggtitle(paste("Residuals Histogram:", model_name)) +
-    theme_minimal()
-  print(hist_plot)
-
-  return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value))
-}
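# Illustrative sketch (editorial addition, not part of the original analysis):
# rather than hand-built ifelse() dummies, R generates the full k-1 coding
# automatically from a factor, which also covers the education levels left as
# TODOs in the data-prep chunk. Assumes the reference label below matches the
# data exactly:
fit_edu_factor <- function(dv, data) {
  data$demo_edu_f <- relevel(factor(data$demo_edu),
                             ref = "High School (or equivalent)")
  # lm() expands demo_edu_f into one dummy per non-reference level
  lm(reformulate("demo_edu_f", response = dv), data = data)
}
# Example use: summary(fit_edu_factor("eohiDGEN_mean", data_clean))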
- -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL 
SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` - -## Model 1: Sex → EOHI-DGEN Mean - -```{r model1} -#| label: model1 -#| echo: true - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean - -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → 
EHI-Global") -``` - -## Model 6: Education → EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## Summary Tables - -### Assumption Violation Summary - -```{r violation-summary} -#| label: violation-summary -#| echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model Comparison Summary - -```{r comparison-summary} -#| label: comparison-summary -#| echo: true - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, 
model6_summary$r_squared),
-  Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared,
-                    model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared),
-  AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic,
-          model4_summary$aic, model5_summary$aic, model6_summary$aic),
-  BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic,
-          model4_summary$bic, model5_summary$bic, model6_summary$bic),
-  # Overall F-test p-value per model; the original coefficients[2, 4] lookup
-  # tested only the first slope, which is wrong for the education models
-  # with three dummy predictors
-  Significant = sapply(list(model1_summary, model2_summary, model3_summary,
-                            model4_summary, model5_summary, model6_summary),
-                       function(m) {
-                         f <- m$summary$fstatistic
-                         p <- unname(pf(f[1], f[2], f[3], lower.tail = FALSE))
-                         if (p < 0.05) "YES" else "NO"
-                       }),
-  stringsAsFactors = FALSE
-)
-
-print(comparison_table)
-```
-
-## Recommendations
-
-### For Assumption Violations:
-
-**1. Normality Violations:**
-- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox)
-- Alternative: Use robust regression methods or bootstrapping
-
-**2. Homoscedasticity Violations:**
-- If violated: Use weighted least squares or robust standard errors
-- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors
-
-**3. Independence Violations:**
-- If violated: Check for clustering or repeated-measures structure
-- Alternative: Use mixed-effects models or clustered standard errors
-
-**4. Influential Observations:**
-- If present: Examine these cases for data entry errors
-- Consider: Running the analysis with and without influential cases
-- Alternative: Use robust regression methods
-
-**5. 
Linearity Violations:** -- If violated: Add polynomial terms or use splines -- Alternative: Transform predictors or use non-parametric methods - ---- - -*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.* diff --git a/.history/eohi1/regressions e1 - assumptions_20251016155747.qmd b/.history/eohi1/regressions e1 - assumptions_20251016155747.qmd deleted file mode 100644 index 768a4ae..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016155747.qmd +++ /dev/null @@ -1,518 +0,0 @@ ---- -title: "Regression Analysis - Assumption Checking" -subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean" -author: "Irina" -date: today -format: - html: - theme: cosmo - toc: true - toc-depth: 3 - code-fold: false - code-tools: true - fig-width: 8 - fig-height: 6 -execute: - echo: true - warning: false - message: false - eval: true - output: true - results: 'markup' - cache: false ---- - -## Setup and Data Preparation - -```{r simple-test} -#| echo: true -#| output: true -#| results: 'markup' -#| eval: true -#| include: true - -# Simple test to verify Quarto is working -cat("=== QUARTO TEST ===\n") -print("If you see this, Quarto is working!") -2 + 2 -cat("=== END TEST ===\n") -``` - -```{r setup} -#| label: setup -#| echo: true -#| output: true -#| results: 'markup' -#| eval: true - -# Load required libraries -library(dplyr) # Must load first for %>% operator -library(ggplot2) # Load ggplot2 first to avoid version conflicts -library(car) -library(performance) -# library(see) # Temporarily commented out due to version conflict -library(gridExtra) -library(lmtest) # For bptest and durbinWatsonTest - -# Set options -options(scipen = 999) - -# Set working directory and load data -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -print("test message") - -# Test chunk execution -cat("This should appear when you run this chunk\n") -print(paste("Current time:", Sys.time())) -cat("If you see this output below this chunk, Quarto is working correctly!\n") -``` - -```{r data-prep} -#| label: data-prep -#| echo: true -#| output: true - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) - -cat("Missing values check:\n") -print(missing_summary) -cat("\n") - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and create dummy variables -print("Education levels:") -edu_table <- table(data_clean$demo_edu) -print(edu_table) -print(paste("Number of education levels:", length(unique(data_clean$demo_edu)))) - -# Create dummy variables for education (k-1 coding: 7 levels = 6 dummy variables) -# Using High School as reference category (excluded dummy) -data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0) -data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0) -data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0) - -# Check what other education levels exist and create additional 
dummies
-edu_levels <- unique(data_clean$demo_edu)
-print("All education levels found:")
-for (i in seq_along(edu_levels)) {
-  print(paste(i, ":", edu_levels[i]))
-}
-
-# Create additional dummy variables for other education levels
-# (You'll need to adjust these based on your actual data)
-# Example for additional levels - adjust names and conditions as needed:
-# data_clean$edu_other1 <- ifelse(data_clean$demo_edu == "Other Level 1", 1, 0)
-# data_clean$edu_other2 <- ifelse(data_clean$demo_edu == "Other Level 2", 1, 0)
-# data_clean$edu_other3 <- ifelse(data_clean$demo_edu == "Other Level 3", 1, 0)
-
-# Note: Once you identify all 7 levels, create 6 dummy variables total (k-1)
-
-# Verify dummy coding
-print("Sex recoding (0=Female, 1=Male):")
-print(table(data_clean$demo_sex_numeric))
-print("Education dummy variables (k-1 coding with High School as reference):")
-# cat() rather than print(): print() treats a second argument as `digits`,
-# so the counts were never actually displayed
-cat("High School (reference):", sum(data_clean$demo_edu == "High School (or equivalent)"), "\n")
-cat("College:", sum(data_clean$edu_college), "\n")
-cat("Undergraduate:", sum(data_clean$edu_undergrad), "\n")
-cat("Graduate:", sum(data_clean$edu_grad), "\n")
-```
-
-## Regression Models
-
-```{r models}
-#| label: models
-#| echo: true
-#| output: true
-
-# Define the 6 regression models
-models <- list()
-
-# Model 1: demo_sex → eohiDGEN_mean
-models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 2: age → eohiDGEN_mean (the age column kept in data_clean is demo_age_1,
-# not demo_age, so the formula must use demo_age_1)
-models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1, data = data_clean)
-
-# Model 3: education dummies → eohiDGEN_mean (k-1 coding, HS as reference)
-models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-
-# Model 4: demo_sex → ehi_global_mean
-models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 5: age → ehi_global_mean (demo_age_1, as above)
-models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1, data = data_clean)
-
-# Model 6: education dummies → ehi_global_mean (k-1 coding, HS as reference)
-models$edu_ehi_global <- lm(ehi_global_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-```
-
-## Assumption Checking Functions
-
-```{r functions}
-#| label: functions
-#| echo: true
-
-# Function to check linearity assumption
-check_linearity <- function(model, model_name) {
-  print(paste("=== LINEARITY CHECK:", model_name, "==="))
-
-  # Residuals vs Fitted plot
-  plot(model, which = 1, main = paste("Linearity:", model_name))
-
-  # Component + residual plot (partial residual plot)
-  crPlots(model, main = paste("Component+Residual Plot:", model_name))
-
-  return(NULL)
-}
-
-# Function to check normality of residuals
-check_normality <- function(model, model_name) {
-  print(paste("=== NORMALITY CHECK:", model_name, "==="))
-
-  # Q-Q plot
-  plot(model, which = 2, main = paste("Q-Q Plot:", model_name))
-
-  # Shapiro-Wilk test
-  residuals <- residuals(model)
-  shapiro_test <- shapiro.test(residuals)
-  print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5)))
-
-  # Kolmogorov-Smirnov test (null parameters are estimated from the same
-  # residuals, so this p-value is approximate)
-  ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals))
-  print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5)))
-
-  # Histogram of residuals
-  hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) +
-    geom_histogram(bins = 30, fill = "lightblue", color = "black") +
-    ggtitle(paste("Residuals Histogram:", model_name)) +
-    theme_minimal()
-  print(hist_plot)
-
-  return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value))
-}
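# Illustrative sketch (editorial addition, not part of the original analysis):
# for the education models with several dummies, a single coefficient p-value
# does not test the model as a whole; the overall F-test p-value can be
# recovered from summary():
overall_f_p <- function(model) {
  f <- summary(model)$fstatistic  # value, numdf, dendf
  unname(pf(f[1], f[2], f[3], lower.tail = FALSE))
}
# Example use: overall_f_p(models$edu_eohiDGEN)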
- -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL 
SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` - -## Model 1: Sex → EOHI-DGEN Mean - -```{r model1} -#| label: model1 -#| echo: true - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean - -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → 
EHI-Global") -``` - -## Model 6: Education → EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## Summary Tables - -### Assumption Violation Summary - -```{r violation-summary} -#| label: violation-summary -#| echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model Comparison Summary - -```{r comparison-summary} -#| label: comparison-summary -#| echo: true - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, 
model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) -``` - -## Recommendations - -### For Assumption Violations: - -**1. Normality Violations:** -- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox) -- Alternative: Use robust regression methods or bootstrapping - -**2. Homoscedasticity Violations:** -- If violated: Use weighted least squares or robust standard errors -- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors - -**3. Independence Violations:** -- If violated: Check for clustering or repeated measures structure -- Alternative: Use mixed-effects models or clustered standard errors - -**4. Influential Observations:** -- If present: Examine these cases for data entry errors -- Consider: Running analysis with and without influential cases -- Alternative: Use robust regression methods - -**5. 
Linearity Violations:** -- If violated: Add polynomial terms or use splines -- Alternative: Transform predictors or use non-parametric methods - ---- - -*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.* diff --git a/.history/eohi1/regressions e1 - assumptions_20251016161118.qmd b/.history/eohi1/regressions e1 - assumptions_20251016161118.qmd deleted file mode 100644 index 7ef393f..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016161118.qmd +++ /dev/null @@ -1,519 +0,0 @@ ---- -title: "Regression Analysis - Assumption Checking" -subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean" -author: "Irina" -date: today -format: - html: - theme: cosmo - toc: true - toc-depth: 3 - code-fold: false - code-tools: true - fig-width: 8 - fig-height: 6 -execute: - echo: true - warning: false - message: false - eval: true - output: true - results: 'markup' - cache: false - freeze: false ---- - -## Setup and Data Preparation - -```{r simple-test} -#| echo: true -#| output: true -#| results: 'markup' -#| eval: true -#| include: true - -# Simple test to verify Quarto is working -cat("=== QUARTO TEST ===\n") -print("If you see this, Quarto is working!") -2 + 2 -cat("=== END TEST ===\n") -``` - -```{r setup} -#| label: setup -#| echo: true -#| output: true -#| results: 'markup' -#| eval: true - -# Load required libraries -library(dplyr) # Must load first for %>% operator -library(ggplot2) # Load ggplot2 first to avoid version conflicts -library(car) -library(performance) -# library(see) # Temporarily commented out due to version conflict -library(gridExtra) -library(lmtest) # For bptest and durbinWatsonTest - -# Set options -options(scipen = 999) - -# Set working directory and load data -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -print("test message") - -# Test chunk execution -cat("This should appear when you run this chunk\n") -print(paste("Current time:", Sys.time())) -cat("If you see this output below this chunk, Quarto is working correctly!\n") -``` - -```{r data-prep} -#| label: data-prep -#| echo: true -#| output: true - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) - -cat("Missing values check:\n") -print(missing_summary) -cat("\n") - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and create dummy variables -print("Education levels:") -edu_table <- table(data_clean$demo_edu) -print(edu_table) -print(paste("Number of education levels:", length(unique(data_clean$demo_edu)))) - -# Create dummy variables for education (k-1 coding: 7 levels = 6 dummy variables) -# Using High School as reference category (excluded dummy) -data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0) -data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0) -data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0) - -# Check what other education levels exist and create 
additional dummies
-edu_levels <- unique(data_clean$demo_edu)
-print("All education levels found:")
-for (i in seq_along(edu_levels)) {
-  print(paste(i, ":", edu_levels[i]))
-}
-
-# Create additional dummy variables for other education levels
-# (You'll need to adjust these based on your actual data)
-# Example for additional levels - adjust names and conditions as needed:
-# data_clean$edu_other1 <- ifelse(data_clean$demo_edu == "Other Level 1", 1, 0)
-# data_clean$edu_other2 <- ifelse(data_clean$demo_edu == "Other Level 2", 1, 0)
-# data_clean$edu_other3 <- ifelse(data_clean$demo_edu == "Other Level 3", 1, 0)
-
-# Note: Once you identify all 7 levels, create 6 dummy variables total (k-1)
-
-# Verify dummy coding (cat() prints each label together with its count;
-# print() with a second argument would silently drop the count)
-print("Sex recoding (0=Female, 1=Male):")
-print(table(data_clean$demo_sex_numeric))
-print("Education dummy variables (k-1 coding with High School as reference):")
-cat("High School (reference):", sum(data_clean$demo_edu == "High School (or equivalent)"), "\n")
-cat("College:", sum(data_clean$edu_college), "\n")
-cat("Undergraduate:", sum(data_clean$edu_undergrad), "\n")
-cat("Graduate:", sum(data_clean$edu_grad), "\n")
-```
-
-## Regression Models
-
-```{r models}
-#| label: models
-#| echo: true
-#| output: true
-
-# Define the 6 regression models
-models <- list()
-
-# Model 1: demo_sex → eohiDGEN_mean
-models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 2: demo_age_1 → eohiDGEN_mean (demo_age_1 is the age column kept in data_clean)
-models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1, data = data_clean)
-
-# Model 3: education dummies → eohiDGEN_mean (k-1 coding, HS as reference)
-models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-
-# Model 4: demo_sex → ehi_global_mean
-models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 5: demo_age_1 → ehi_global_mean
-models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1, data = data_clean)
-
-# Model 6: education dummies → ehi_global_mean (k-1 coding, HS as reference)
-models$edu_ehi_global <- lm(ehi_global_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-```
-
-## Assumption Checking Functions
-
-```{r functions}
-#| label: functions
-#| echo: true
-
-# Function to check linearity assumption
-check_linearity <- function(model, model_name) {
-  print(paste("=== LINEARITY CHECK:", model_name, "==="))
-
-  # Residuals vs Fitted plot
-  plot(model, which = 1, main = paste("Linearity:", model_name))
-
-  # Component + residual plot (partial residual plot)
-  crPlots(model, main = paste("Component+Residual Plot:", model_name))
-
-  return(NULL)
-}
-
-# Function to check normality of residuals
-check_normality <- function(model, model_name) {
-  print(paste("=== NORMALITY CHECK:", model_name, "==="))
-
-  # Q-Q plot
-  plot(model, which = 2, main = paste("Q-Q Plot:", model_name))
-
-  # Shapiro-Wilk test
-  residuals <- residuals(model)
-  shapiro_test <- shapiro.test(residuals)
-  print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5)))
-
-  # Kolmogorov-Smirnov test
-  ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals))
-  print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5)))
-
-  # Histogram of residuals
-  hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) +
-    geom_histogram(bins = 30, fill = "lightblue", color = "black") +
-    ggtitle(paste("Residuals Histogram:", model_name)) +
-    theme_minimal()
-  print(hist_plot)
-
-  return(list(shapiro_p = shapiro_test$p.value, ks_p = 
ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - 
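-  # Prints the lm() summary plus fit indices (R-squared, adjusted R-squared,
-  # AIC, BIC) and returns them in a list that the summary tables below reuse.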
print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` - -## Model 1: Sex → EOHI-DGEN Mean - -```{r model1} -#| label: model1 -#| echo: true - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean - -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- 
check_linearity(models$age_ehi_global, "Age → EHI-Global") -``` - -## Model 6: Education → EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## Summary Tables - -### Assumption Violation Summary - -```{r violation-summary} -#| label: violation-summary -#| echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model Comparison Summary - -```{r comparison-summary} -#| label: comparison-summary -#| echo: true - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - 
model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) -``` - -## Recommendations - -### For Assumption Violations: - -**1. Normality Violations:** -- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox) -- Alternative: Use robust regression methods or bootstrapping - -**2. Homoscedasticity Violations:** -- If violated: Use weighted least squares or robust standard errors -- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors - -**3. Independence Violations:** -- If violated: Check for clustering or repeated measures structure -- Alternative: Use mixed-effects models or clustered standard errors - -**4. Influential Observations:** -- If present: Examine these cases for data entry errors -- Consider: Running analysis with and without influential cases -- Alternative: Use robust regression methods - -**5. 
Linearity Violations:**
-- If violated: Add polynomial terms or use splines
-- Alternative: Transform predictors or use non-parametric methods
-
----
-
-*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.*
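A note on two recurring issues in these snapshots: only three of the stated seven education levels are hand-coded, and the comparison tables read "Significant" off `coefficients[2, 4]`, which for the education models tests only the first dummy. A minimal sketch of an alternative, assuming the `data_clean`, `demo_edu`, and `eohiDGEN_mean` objects defined in the chunks above: relevel a factor so `lm()` expands the full k-1 dummy set itself, then take overall significance from the model F-test.

```r
# Sketch under the column-name assumptions above, not code from the original
# files. relevel() keeps "High School (or equivalent)" as the reference
# category; lm() then builds all k-1 dummies from the factor automatically.
data_clean$demo_edu_f <- relevel(factor(data_clean$demo_edu),
                                 ref = "High School (or equivalent)")
edu_model <- lm(eohiDGEN_mean ~ demo_edu_f, data = data_clean)

# Overall significance of the education block: the model F-test rather than
# the first dummy's t-test (summary(...)$coefficients[2, 4]).
f_stat <- summary(edu_model)$fstatistic   # named c(value, numdf, dendf)
overall_p <- pf(f_stat["value"], f_stat["numdf"], f_stat["dendf"],
                lower.tail = FALSE)
print(overall_p)
```

`anova(update(edu_model, . ~ 1), edu_model)` gives an equivalent overall test; either way the education block gets a single p-value instead of the first dummy's t-test.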
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016161154.qmd b/.history/eohi1/regressions e1 - assumptions_20251016161154.qmd
deleted file mode 100644
index 438c056..0000000
--- a/.history/eohi1/regressions e1 - assumptions_20251016161154.qmd
+++ /dev/null
@@ -1,517 +0,0 @@
----
-title: "Regression Analysis - Assumption Checking"
-subtitle: "IVs: demo_sex, demo_age, demo_edu | DVs: eohiDGEN_mean, ehi_global_mean"
-author: "Irina"
-date: today
-format:
-  html:
-    theme: cosmo
-    toc: true
-    toc-depth: 3
-    code-fold: false
-    code-tools: true
-    fig-width: 8
-    fig-height: 6
-execute:
-  echo: true
-  warning: false
-  message: false
-  eval: true
-  output: true
-  results: 'markup'
-  cache: false
-  freeze: false
----
-
-## Setup and Data Preparation
-
-```{r}
-#| label: simple-test
-#| echo: true
-#| output: true
-
-# Simple test to verify Quarto is working
-cat("=== QUARTO TEST ===\n")
-print("If you see this, Quarto is working!")
-2 + 2
-cat("=== END TEST ===\n")
-```
-
-```{r setup}
-#| label: setup
-#| echo: true
-#| output: true
-#| results: 'markup'
-#| eval: true
-
-# Load required libraries
-library(dplyr)     # Must load first for %>% operator
-library(ggplot2)   # Load ggplot2 first to avoid version conflicts
-library(car)
-library(performance)
-# library(see)     # Temporarily commented out due to version conflict
-library(gridExtra)
-library(lmtest)    # For bptest and durbinWatsonTest
-
-# Set options
-options(scipen = 999)
-
-# Set working directory and load data
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-data <- read.csv("ehi1.csv")
-
-print("test message")
-
-# Test chunk execution
-cat("This should appear when you run this chunk\n")
-print(paste("Current time:", Sys.time()))
-cat("If you see this output below this chunk, Quarto is working correctly!\n")
-```
-
-```{r data-prep}
-#| label: data-prep
-#| echo: true
-#| output: true
-
-# Check for missing values
-missing_summary <- data %>%
-  select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  summarise_all(~sum(is.na(.)))
-
-cat("Missing values check:\n")
-print(missing_summary)
-cat("\n")
-
-# Remove rows with missing values
-data_clean <- data %>%
-  select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  filter(complete.cases(.))
-
-print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x ")))
-
-# Recode demo_sex as numeric for regression (0 = Female, 1 = Male)
-data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0)
-
-# Check demo_edu levels and create dummy variables
-print("Education levels:")
-edu_table <- table(data_clean$demo_edu)
-print(edu_table)
-print(paste("Number of education levels:", length(unique(data_clean$demo_edu))))
-
-# Create dummy variables for education (k-1 coding: 7 levels = 6 dummy variables)
-# Using High School as reference category (excluded dummy)
-data_clean$edu_college <- ifelse(data_clean$demo_edu == "College Diploma/Certificate", 1, 0)
-data_clean$edu_undergrad <- ifelse(data_clean$demo_edu == "University - Undergraduate", 1, 0)
-data_clean$edu_grad <- ifelse(data_clean$demo_edu == "University - Graduate", 1, 0)
-
-# Check what other education levels exist and create additional dummies
-edu_levels <- 
unique(data_clean$demo_edu)
-print("All education levels found:")
-for (i in seq_along(edu_levels)) {
-  print(paste(i, ":", edu_levels[i]))
-}
-
-# Create additional dummy variables for other education levels
-# (You'll need to adjust these based on your actual data)
-# Example for additional levels - adjust names and conditions as needed:
-# data_clean$edu_other1 <- ifelse(data_clean$demo_edu == "Other Level 1", 1, 0)
-# data_clean$edu_other2 <- ifelse(data_clean$demo_edu == "Other Level 2", 1, 0)
-# data_clean$edu_other3 <- ifelse(data_clean$demo_edu == "Other Level 3", 1, 0)
-
-# Note: Once you identify all 7 levels, create 6 dummy variables total (k-1)
-
-# Verify dummy coding (cat() prints each label together with its count;
-# print() with a second argument would silently drop the count)
-print("Sex recoding (0=Female, 1=Male):")
-print(table(data_clean$demo_sex_numeric))
-print("Education dummy variables (k-1 coding with High School as reference):")
-cat("High School (reference):", sum(data_clean$demo_edu == "High School (or equivalent)"), "\n")
-cat("College:", sum(data_clean$edu_college), "\n")
-cat("Undergraduate:", sum(data_clean$edu_undergrad), "\n")
-cat("Graduate:", sum(data_clean$edu_grad), "\n")
-```
-
-## Regression Models
-
-```{r models}
-#| label: models
-#| echo: true
-#| output: true
-
-# Define the 6 regression models
-models <- list()
-
-# Model 1: demo_sex → eohiDGEN_mean
-models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 2: demo_age_1 → eohiDGEN_mean (demo_age_1 is the age column kept in data_clean)
-models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1, data = data_clean)
-
-# Model 3: education dummies → eohiDGEN_mean (k-1 coding, HS as reference)
-models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-
-# Model 4: demo_sex → ehi_global_mean
-models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 5: demo_age_1 → ehi_global_mean
-models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1, data = data_clean)
-
-# Model 6: education dummies → ehi_global_mean (k-1 coding, HS as reference)
-models$edu_ehi_global <- lm(ehi_global_mean ~ edu_college + edu_undergrad + edu_grad, data = data_clean)
-```
-
-## Assumption Checking Functions
-
-```{r functions}
-#| label: functions
-#| echo: true
-
-# Function to check linearity assumption
-check_linearity <- function(model, model_name) {
-  print(paste("=== LINEARITY CHECK:", model_name, "==="))
-
-  # Residuals vs Fitted plot
-  plot(model, which = 1, main = paste("Linearity:", model_name))
-
-  # Component + residual plot (partial residual plot)
-  crPlots(model, main = paste("Component+Residual Plot:", model_name))
-
-  return(NULL)
-}
-
-# Function to check normality of residuals
-check_normality <- function(model, model_name) {
-  print(paste("=== NORMALITY CHECK:", model_name, "==="))
-
-  # Q-Q plot
-  plot(model, which = 2, main = paste("Q-Q Plot:", model_name))
-
-  # Shapiro-Wilk test
-  residuals <- residuals(model)
-  shapiro_test <- shapiro.test(residuals)
-  print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5)))
-
-  # Kolmogorov-Smirnov test
-  ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals))
-  print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5)))
-
-  # Histogram of residuals
-  hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) +
-    geom_histogram(bins = 30, fill = "lightblue", color = "black") +
-    ggtitle(paste("Residuals Histogram:", model_name)) +
-    theme_minimal()
-  print(hist_plot)
-
-  return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value))
-}
-
-# Function to check 
homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - 
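  # The comparison tables further down flag "Significant" from coefficients[2, 4],
  # which tests only the first slope; for the education model with several dummies
  # the omnibus F-test is the safer overall check. A minimal sketch of that extra
  # step (fstat and f_p are illustrative names), using only the fitted lm object:
  fstat <- summary(model)$fstatistic
  if (!is.null(fstat)) {
    f_p <- pf(fstat["value"], fstat["numdf"], fstat["dendf"], lower.tail = FALSE)
    print(paste("Overall F-test p-value:", format(f_p, digits = 5)))
  }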
- # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} -``` - -## Model 1: Sex → EOHI-DGEN Mean - -```{r model1} -#| label: model1 -#| echo: true - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -``` - -## Model 2: Age → EOHI-DGEN Mean - -```{r model2} -#| label: model2 -#| echo: true - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") -``` - -## Model 3: Education → EOHI-DGEN Mean - -```{r model3} -#| label: model3 -#| echo: true - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -``` - -## Model 4: Sex → EHI-Global Mean - -```{r model4} -#| label: model4 -#| echo: true - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") -``` - -## Model 5: Age → EHI-Global Mean - -```{r model5} -#| label: model5 -#| echo: true - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") -``` - -## Model 6: Education 
→ EHI-Global Mean - -```{r model6} -#| label: model6 -#| echo: true - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") -``` - -## Summary Tables - -### Assumption Violation Summary - -```{r violation-summary} -#| label: violation-summary -#| echo: true - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) -``` - -### Model Comparison Summary - -```{r comparison-summary} -#| label: comparison-summary -#| echo: true - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = 
c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) -``` - -## Recommendations - -### For Assumption Violations: - -**1. Normality Violations:** -- If violated: Consider transforming the dependent variable (log, sqrt, Box-Cox) -- Alternative: Use robust regression methods or bootstrapping - -**2. Homoscedasticity Violations:** -- If violated: Use weighted least squares or robust standard errors -- Alternative: Transform the dependent variable or use heteroscedasticity-consistent standard errors - -**3. Independence Violations:** -- If violated: Check for clustering or repeated measures structure -- Alternative: Use mixed-effects models or clustered standard errors - -**4. Influential Observations:** -- If present: Examine these cases for data entry errors -- Consider: Running analysis with and without influential cases -- Alternative: Use robust regression methods - -**5. 
Linearity Violations:** -- If violated: Add polynomial terms or use splines -- Alternative: Transform predictors or use non-parametric methods - ---- - -*Analysis completed for 6 regression models examining the relationship between demographic variables (sex, age, education) and EOHI measures.* diff --git a/.history/eohi1/regressions e1 - assumptions_20251016173803.r b/.history/eohi1/regressions e1 - assumptions_20251016173803.r deleted file mode 100644 index f647e9c..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016173803.r +++ /dev/null @@ -1,398 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest (durbinWatsonTest comes from car) -library(rmarkdown) # For HTML rendering - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Education levels:") -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):") -print(table(data_clean$demo_edu_numeric)) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age → eohiDGEN_mean -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age → ehi_global_mean -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - #
Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- 
which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - -print("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS") -print("=============================================") - -# Model 1: Sex → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 1: Sex → EOHI-DGEN Mean") -print("=============================================") - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 2: Age → EOHI-DGEN Mean") -print("=============================================") - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean 
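# demo_edu_numeric scores education 1-4, which assumes the four levels are
# evenly spaced. A hedged alternative sketch (demo_edu_factor and
# edu_factor_model are illustrative names), assuming the labels in edu_levels
# match the raw data exactly, is to let lm() build the k-1 dummies from an
# unordered factor with High School as the reference level:
data_clean$demo_edu_factor <- factor(data_clean$demo_edu, levels = edu_levels)
edu_factor_model <- lm(eohiDGEN_mean ~ demo_edu_factor, data = data_clean)
print(summary(edu_factor_model))  # each coefficient contrasts one level with HS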
-print("\n=============================================") -print("MODEL 3: Education → EOHI-DGEN Mean") -print("=============================================") - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean -print("\n=============================================") -print("MODEL 4: Sex → EHI-Global Mean") -print("=============================================") - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age → EHI-Global Mean -print("\n=============================================") -print("MODEL 5: Age → EHI-Global Mean") -print("=============================================") - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") - -# Model 6: Education → EHI-Global Mean -print("\n=============================================") -print("MODEL 6: Education → EHI-Global Mean") -print("=============================================") - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - -print("\n=============================================") -print("ASSUMPTION VIOLATION SUMMARY") -print("=============================================") - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - 
ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - -print("\n=============================================") -print("MODEL COMPARISON SUMMARY") -print("=============================================") - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - 
stringsAsFactors = FALSE -) - -print(comparison_table) - -print("\n=============================================") -print("ANALYSIS COMPLETE") -print("=============================================") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016173806.r b/.history/eohi1/regressions e1 - assumptions_20251016173806.r deleted file mode 100644 index b1c6d34..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016173806.r +++ /dev/null @@ -1,411 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest (durbinWatsonTest comes from car) -library(rmarkdown) # For HTML rendering - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Education levels:") -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):") -print(table(data_clean$demo_edu_numeric)) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age → eohiDGEN_mean -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age → ehi_global_mean -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - #
Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", 
format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - -print("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS") -print("=============================================") - -# Model 1: Sex → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 1: Sex → EOHI-DGEN Mean") -print("=============================================") - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 2: Age → EOHI-DGEN Mean") -print("=============================================") - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 3: Education → EOHI-DGEN Mean") 
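# If the Breusch-Pagan test flags this model, one remedy from the
# recommendations is heteroscedasticity-consistent (HC) standard errors.
# A minimal sketch, assuming the sandwich package is installed
# (coeftest() comes from lmtest, which is already loaded):
if (requireNamespace("sandwich", quietly = TRUE)) {
  print(coeftest(models$edu_eohiDGEN,
                 vcov = sandwich::vcovHC(models$edu_eohiDGEN, type = "HC3")))
}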
-print("=============================================") - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean -print("\n=============================================") -print("MODEL 4: Sex → EHI-Global Mean") -print("=============================================") - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age → EHI-Global Mean -print("\n=============================================") -print("MODEL 5: Age → EHI-Global Mean") -print("=============================================") - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") - -# Model 6: Education → EHI-Global Mean -print("\n=============================================") -print("MODEL 6: Education → EHI-Global Mean") -print("=============================================") - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - -print("\n=============================================") -print("ASSUMPTION VIOLATION SUMMARY") -print("=============================================") - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, 
"VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - -print("\n=============================================") -print("MODEL COMPARISON SUMMARY") -print("=============================================") - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - -print("\n=============================================") 
-print("ANALYSIS COMPLETE") -print("=============================================") - -# ============================================================================= -# RENDER TO HTML -# ============================================================================= - -# Method 1: Render the current R script to HTML -# This will create an HTML file with all outputs, plots, and results -rmarkdown::render("regressions e1 - assumptions.r", - output_format = "html_document", - output_file = "regressions_e1_assumptions.html", - output_dir = ".") - -print("HTML file created: regressions_e1_assumptions.html") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016173814.r b/.history/eohi1/regressions e1 - assumptions_20251016173814.r deleted file mode 100644 index b1c6d34..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016173814.r +++ /dev/null @@ -1,411 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest -library(rmarkdown) # For HTML rendering - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Education levels:") -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):") -print(table(data_clean$demo_edu_numeric)) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age → eohiDGEN_mean -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age → ehi_global_mean -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# 
============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 
4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - -print("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS") -print("=============================================") - -# Model 1: Sex → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 1: Sex → EOHI-DGEN Mean") -print("=============================================") - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 2: Age → EOHI-DGEN Mean") -print("=============================================") - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- 
check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 3: Education → EOHI-DGEN Mean") -print("=============================================") - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean -print("\n=============================================") -print("MODEL 4: Sex → EHI-Global Mean") -print("=============================================") - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age → EHI-Global Mean -print("\n=============================================") -print("MODEL 5: Age → EHI-Global Mean") -print("=============================================") - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") - -# Model 6: Education → EHI-Global Mean -print("\n=============================================") -print("MODEL 6: Education → EHI-Global Mean") -print("=============================================") - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - -print("\n=============================================") -print("ASSUMPTION VIOLATION SUMMARY") -print("=============================================") - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex 
→ EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - -print("\n=============================================") -print("MODEL COMPARISON SUMMARY") -print("=============================================") - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 
4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - -print("\n=============================================") -print("ANALYSIS COMPLETE") -print("=============================================") - -# ============================================================================= -# RENDER TO HTML -# ============================================================================= - -# Method 1: Render the current R script to HTML -# This will create an HTML file with all outputs, plots, and results -rmarkdown::render("regressions e1 - assumptions.r", - output_format = "html_document", - output_file = "regressions_e1_assumptions.html", - output_dir = ".") - -print("HTML file created: regressions_e1_assumptions.html") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016173817.r b/.history/eohi1/regressions e1 - assumptions_20251016173817.r deleted file mode 100644 index be00f2a..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016173817.r +++ /dev/null @@ -1,397 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Education levels:") -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):") -print(table(data_clean$demo_edu_numeric)) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age → eohiDGEN_mean -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: 
demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age → ehi_global_mean -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) 
{ - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - -print("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS") -print("=============================================") - -# Model 1: Sex → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 1: Sex → EOHI-DGEN Mean") -print("=============================================") - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age → EOHI-DGEN Mean 
-print("\n=============================================") -print("MODEL 2: Age → EOHI-DGEN Mean") -print("=============================================") - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 3: Education → EOHI-DGEN Mean") -print("=============================================") - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean -print("\n=============================================") -print("MODEL 4: Sex → EHI-Global Mean") -print("=============================================") - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age → EHI-Global Mean -print("\n=============================================") -print("MODEL 5: Age → EHI-Global Mean") -print("=============================================") - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") - -# Model 6: Education → EHI-Global Mean -print("\n=============================================") -print("MODEL 6: Education → EHI-Global Mean") -print("=============================================") - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF 
ASSUMPTION VIOLATIONS -# ============================================================================= - -print("\n=============================================") -print("ASSUMPTION VIOLATION SUMMARY") -print("=============================================") - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - -print("\n=============================================") -print("MODEL COMPARISON SUMMARY") -print("=============================================") - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - 
model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - -print("\n=============================================") -print("ANALYSIS COMPLETE") -print("=============================================") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016174209.r b/.history/eohi1/regressions e1 - assumptions_20251016174209.r deleted file mode 100644 index 4dcb7e9..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016174209.r +++ /dev/null @@ -1,397 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Education levels:") -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):") -print(table(data_clean$demo_edu_numeric)) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age → eohiDGEN_mean -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age → ehi_global_mean -models$age_ehi_global <- 
lm(ehi_global_mean ~ demo_age, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate 
influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - -print("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS") -print("=============================================") - -# Model 1: Sex → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 1: Sex → EOHI-DGEN Mean") -print("=============================================") - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 2: Age → EOHI-DGEN Mean") -print("=============================================") - -model2_summary <- 
get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 3: Education → EOHI-DGEN Mean") -print("=============================================") - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean -print("\n=============================================") -print("MODEL 4: Sex → EHI-Global Mean") -print("=============================================") - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age → EHI-Global Mean -print("\n=============================================") -print("MODEL 5: Age → EHI-Global Mean") -print("=============================================") - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") - -# Model 6: Education → EHI-Global Mean -print("\n=============================================") -print("MODEL 6: Education → EHI-Global Mean") -print("=============================================") - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - -print("\n=============================================") 
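[Editorial aside: assuming the hypothetical `all_checks` and `model_labels` objects from the earlier sketch, the six repeated ifelse() calls per column of the violation table below reduce to one vapply() per assumption.]

# Sketch only: build the violation table programmatically from all_checks.
flag <- function(p) ifelse(p < 0.05, "VIOLATED", "OK")  # hypothetical helper
violation_summary_alt <- data.frame(
  Model            = unname(model_labels),
  Normality        = unname(vapply(all_checks, function(x) flag(x$normality$shapiro_p), character(1))),
  Homoscedasticity = unname(vapply(all_checks, function(x) flag(x$homosced$bp_p), character(1))),
  Independence     = unname(vapply(all_checks, function(x) flag(x$independence$dw_p), character(1))),
  stringsAsFactors = FALSE
)
print(violation_summary_alt)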
-print("ASSUMPTION VIOLATION SUMMARY") -print("=============================================") - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - -print("\n=============================================") -print("MODEL COMPARISON SUMMARY") -print("=============================================") - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, 
model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - -print("\n=============================================") -print("ANALYSIS COMPLETE") -print("=============================================") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016174256.r b/.history/eohi1/regressions e1 - assumptions_20251016174256.r deleted file mode 100644 index bf40217..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016174256.r +++ /dev/null @@ -1,403 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Education levels:") -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Center the age variable for regression analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):") -print(table(data_clean$demo_edu_numeric)) -print("Age variable centered:") -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age → eohiDGEN_mean -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: 
demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age → ehi_global_mean -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) 
{ - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - -print("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS") -print("=============================================") - -# Model 1: Sex → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 1: Sex → EOHI-DGEN Mean") -print("=============================================") - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age → EOHI-DGEN Mean 
-print("\n=============================================") -print("MODEL 2: Age → EOHI-DGEN Mean") -print("=============================================") - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 3: Education → EOHI-DGEN Mean") -print("=============================================") - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean -print("\n=============================================") -print("MODEL 4: Sex → EHI-Global Mean") -print("=============================================") - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age → EHI-Global Mean -print("\n=============================================") -print("MODEL 5: Age → EHI-Global Mean") -print("=============================================") - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") - -# Model 6: Education → EHI-Global Mean -print("\n=============================================") -print("MODEL 6: Education → EHI-Global Mean") -print("=============================================") - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF 
ASSUMPTION VIOLATIONS -# ============================================================================= - -print("\n=============================================") -print("ASSUMPTION VIOLATION SUMMARY") -print("=============================================") - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - -print("\n=============================================") -print("MODEL COMPARISON SUMMARY") -print("=============================================") - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - 
model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - -print("\n=============================================") -print("ANALYSIS COMPLETE") -print("=============================================") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016174304.r b/.history/eohi1/regressions e1 - assumptions_20251016174304.r deleted file mode 100644 index b829ba1..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016174304.r +++ /dev/null @@ -1,403 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Education levels:") -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Center the age variable for regression analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):") -print(table(data_clean$demo_edu_numeric)) -print("Age variable centered:") -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- 
lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + -
theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - -print("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS") -print("=============================================") - -# Model 1: Sex → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 1: Sex → EOHI-DGEN Mean") -print("=============================================") - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → 
EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 2: Age → EOHI-DGEN Mean") -print("=============================================") - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 3: Education → EOHI-DGEN Mean") -print("=============================================") - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean -print("\n=============================================") -print("MODEL 4: Sex → EHI-Global Mean") -print("=============================================") - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age → EHI-Global Mean -print("\n=============================================") -print("MODEL 5: Age → EHI-Global Mean") -print("=============================================") - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") - -# Model 6: Education → EHI-Global Mean -print("\n=============================================") -print("MODEL 6: Education → EHI-Global Mean") -print("=============================================") - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → 
EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - -print("\n=============================================") -print("ASSUMPTION VIOLATION SUMMARY") -print("=============================================") - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - -print("\n=============================================") -print("MODEL COMPARISON SUMMARY") -print("=============================================") - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, 
model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - -print("\n=============================================") -print("ANALYSIS COMPLETE") -print("=============================================") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016174308.r b/.history/eohi1/regressions e1 - assumptions_20251016174308.r deleted file mode 100644 index b28df06..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016174308.r +++ /dev/null @@ -1,403 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Education levels:") -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Center the age variable for regression analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):") -print(table(data_clean$demo_edu_numeric)) -print("Age variable centered:") -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models 
-models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - 
order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - -print("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS") -print("=============================================") - -# Model 1: Sex → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 1: Sex → EOHI-DGEN Mean") -print("=============================================") - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") 
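# [Editor's note, added for clarity; not part of the deleted snapshot] Each
# check_* call below returns the named list built in its definition above
# (e.g. check_normality() -> list(shapiro_p, ks_p); check_independence() ->
# list(dw_stat, dw_p)). Those elements are exactly what the violation_summary
# and comparison_table at the end of the script index into.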
-model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 2: Age → EOHI-DGEN Mean") -print("=============================================") - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 3: Education → EOHI-DGEN Mean") -print("=============================================") - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean -print("\n=============================================") -print("MODEL 4: Sex → EHI-Global Mean") -print("=============================================") - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age → EHI-Global Mean -print("\n=============================================") -print("MODEL 5: Age → EHI-Global Mean") -print("=============================================") - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") - -# Model 6: Education → EHI-Global Mean -print("\n=============================================") -print("MODEL 6: Education → EHI-Global Mean") -print("=============================================") - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- 
check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - -print("\n=============================================") -print("ASSUMPTION VIOLATION SUMMARY") -print("=============================================") - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - -print("\n=============================================") -print("MODEL COMPARISON SUMMARY") -print("=============================================") - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - 
R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - -print("\n=============================================") -print("ANALYSIS COMPLETE") -print("=============================================") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016174332.r b/.history/eohi1/regressions e1 - assumptions_20251016174332.r deleted file mode 100644 index 6055151..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016174332.r +++ /dev/null @@ -1,403 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Education levels:") -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Center the age variable for regression analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):") -print(table(data_clean$demo_edu_numeric)) -print("Age variable centered:") -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) 
- -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson 
statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - -print("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS") -print("=============================================") - -# Model 1: Sex → EOHI-DGEN Mean 
-print("\n=============================================") -print("MODEL 1: Sex → EOHI-DGEN Mean") -print("=============================================") - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age (centered) → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 2: Age (centered) → EOHI-DGEN Mean") -print("=============================================") - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 3: Education → EOHI-DGEN Mean") -print("=============================================") - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean -print("\n=============================================") -print("MODEL 4: Sex → EHI-Global Mean") -print("=============================================") - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age → EHI-Global Mean -print("\n=============================================") -print("MODEL 5: Age → EHI-Global Mean") -print("=============================================") - -model5_summary <- get_model_summary(models$age_ehi_global, "Age → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age → EHI-Global") - -# Model 6: Education → EHI-Global Mean 
-print("\n=============================================") -print("MODEL 6: Education → EHI-Global Mean") -print("=============================================") - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - -print("\n=============================================") -print("ASSUMPTION VIOLATION SUMMARY") -print("=============================================") - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - 
-print("\n=============================================") -print("MODEL COMPARISON SUMMARY") -print("=============================================") - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - -print("\n=============================================") -print("ANALYSIS COMPLETE") -print("=============================================") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016174341.r b/.history/eohi1/regressions e1 - assumptions_20251016174341.r deleted file mode 100644 index bc8b49d..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016174341.r +++ /dev/null @@ -1,403 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Education levels:") -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Center the age variable for regression analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print("Sex recoding 
(0=Female, 1=Male):")
-print(table(data_clean$demo_sex_numeric))
-print("Education recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):")
-print(table(data_clean$demo_edu_numeric))
-print("Age variable centered:")
-print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2)))
-print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2)))
-
-# =============================================================================
-# REGRESSION MODELS
-# =============================================================================
-
-# Define the 6 regression models
-models <- list()
-
-# Model 1: demo_sex → eohiDGEN_mean
-models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 2: demo_age_1 → eohiDGEN_mean (centered)
-models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean)
-
-# Model 3: demo_edu → eohiDGEN_mean
-models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean)
-
-# Model 4: demo_sex → ehi_global_mean
-models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 5: demo_age_1 → ehi_global_mean (centered)
-models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean)
-
-# Model 6: demo_edu → ehi_global_mean
-models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean)
-
-# =============================================================================
-# ASSUMPTION CHECKING FUNCTIONS
-# =============================================================================
-
-# Function to check linearity assumption
-check_linearity <- function(model, model_name) {
-  print(paste("=== LINEARITY CHECK:", model_name, "==="))
-
-  # Residuals vs Fitted plot
-  plot(model, which = 1, main = paste("Linearity:", model_name))
-
-  # Component + residual plot (partial residual plot)
-  crPlots(model, main = paste("Component+Residual Plot:", model_name))
-
-  return(NULL)
-}
-
-# Function to check normality of residuals
-check_normality <- function(model, model_name) {
-  print(paste("=== NORMALITY CHECK:", model_name, "==="))
-
-  # Q-Q plot
-  plot(model, which = 2, main = paste("Q-Q Plot:", model_name))
-
-  # Shapiro-Wilk test
-  residuals <- residuals(model)
-  shapiro_test <- shapiro.test(residuals)
-  print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5)))
-
-  # Kolmogorov-Smirnov test
-  ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals))
-  print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5)))
-
-  # Histogram of residuals
-  hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) +
-    geom_histogram(bins = 30, fill = "lightblue", color = "black") +
-    ggtitle(paste("Residuals Histogram:", model_name)) +
-    theme_minimal()
-  print(hist_plot)
-
-  return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value))
-}
-
-# Function to check homoscedasticity (constant variance)
-check_homoscedasticity <- function(model, model_name) {
-  print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "==="))
-
-  # Scale-Location plot
-  plot(model, which = 3, main = paste("Scale-Location Plot:", model_name))
-
-  # Breusch-Pagan test
-  bp_test <- bptest(model)
-  print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5)))
-
-  # White test (if available)
-  tryCatch({
-    white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2))
-    print(paste("White test p-value:", format(white_test$p.value, digits = 5)))
-  }, error = function(e) {
-    print("White test not available for this model")
-  })
-
-  return(list(bp_p = bp_test$p.value))
-}
-
-# Function to check independence (no autocorrelation)
-check_independence <- function(model, model_name) {
-  print(paste("=== INDEPENDENCE CHECK:", model_name, "==="))
-
-  # Durbin-Watson test
-  dw_test <- durbinWatsonTest(model)
-  print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5)))
-  print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5)))
-
-  # Residuals vs Order plot
-  residuals_vs_order <- ggplot(data.frame(
-    residuals = residuals(model),
-    order = seq_along(residuals(model))
-  ), aes(x = order, y = residuals)) +
-    geom_point(color = "black") +
-    geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
-    ggtitle(paste("Residuals vs Order:", model_name)) +
-    theme_minimal()
-  print(residuals_vs_order)
-
-  return(list(dw_stat = dw_test$dw, dw_p = dw_test$p))
-}
-
-# Function to check for influential observations
-check_influence <- function(model, model_name) {
-  print(paste("=== INFLUENCE CHECK:", model_name, "==="))
-
-  # Cook's Distance plot
-  plot(model, which = 4, main = paste("Cook's Distance:", model_name))
-
-  # Calculate influence measures
-  cooks_d <- cooks.distance(model)
-  leverage <- hatvalues(model)
-  dffits_val <- dffits(model)
-
-  # Identify influential observations
-  cooks_threshold <- 4/length(cooks_d)  # Cook's D threshold
-  leverage_threshold <- 2 * (length(coef(model))/nobs(model))  # Leverage threshold
-  dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model))  # DFFITS threshold
-
-  influential_cooks <- which(cooks_d > cooks_threshold)
-  influential_leverage <- which(leverage > leverage_threshold)
-  influential_dffits <- which(abs(dffits_val) > dffits_threshold)
-
-  print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5)))
-  print(paste("Influential observations (Cook's D):", length(influential_cooks)))
-  print(paste("Leverage threshold:", format(leverage_threshold, digits = 5)))
-  print(paste("High leverage observations:", length(influential_leverage)))
-  print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5)))
-  print(paste("Influential observations (DFFITS):", length(influential_dffits)))
-
-  if (length(influential_cooks) > 0) {
-    print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", ")))
-  }
-  if (length(influential_leverage) > 0) {
-    print(paste("High leverage cases:", paste(influential_leverage, collapse = ", ")))
-  }
-  if (length(influential_dffits) > 0) {
-    print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", ")))
-  }
-
-  return(list(influential_cooks = influential_cooks,
-              influential_leverage = influential_leverage,
-              influential_dffits = influential_dffits))
-}
-
-# Function to get comprehensive model summary
-get_model_summary <- function(model, model_name) {
-  print(paste("=== MODEL SUMMARY:", model_name, "==="))
-
-  # Basic model summary
-  summary_model <- summary(model)
-  print(summary_model)
-
-  # R-squared and adjusted R-squared
-  print(paste("R-squared:", format(summary_model$r.squared, digits = 5)))
-  print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5)))
-
-  # AIC and BIC
-  aic_val <- AIC(model)
-  bic_val <- BIC(model)
-  print(paste("AIC:", format(aic_val, digits = 5)))
-  print(paste("BIC:", format(bic_val, digits = 5)))
-
-  return(list(summary = summary_model, r_squared = summary_model$r.squared,
-              adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val))
-}
-
-# =============================================================================
-# RUN ASSUMPTION CHECKS - MODEL BY MODEL
-# =============================================================================
-
-print("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS")
-print("=============================================")
-
-# Model 1: Sex → EOHI-DGEN Mean
-print("\n=============================================")
-print("MODEL 1: Sex → EOHI-DGEN Mean")
-print("=============================================")
-
-model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-
-# Model 2: Age (centered) → EOHI-DGEN Mean
-print("\n=============================================")
-print("MODEL 2: Age (centered) → EOHI-DGEN Mean")
-print("=============================================")
-
-model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
-model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
-model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
-model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
-model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
-model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
-
-# Model 3: Education → EOHI-DGEN Mean
-print("\n=============================================")
-print("MODEL 3: Education → EOHI-DGEN Mean")
-print("=============================================")
-
-model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-
-# Model 4: Sex → EHI-Global Mean
-print("\n=============================================")
-print("MODEL 4: Sex → EHI-Global Mean")
-print("=============================================")
-
-model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global")
-model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global")
-model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global")
-model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global")
-model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global")
-model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global")
-
-# Model 5: Age (centered) → EHI-Global Mean
-print("\n=============================================")
-print("MODEL 5: Age (centered) → EHI-Global Mean")
-print("=============================================")
-
-model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global")
-model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global")
-model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global")
-model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global")
-model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global")
-model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global")
-
-# Model 6: Education → EHI-Global Mean
-print("\n=============================================")
-print("MODEL 6: Education → EHI-Global Mean")
-print("=============================================")
-
-model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global")
-model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global")
-model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global")
-model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global")
-model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global")
-model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global")
-
-# =============================================================================
-# SUMMARY TABLE OF ASSUMPTION VIOLATIONS
-# =============================================================================
-
-print("\n=============================================")
-print("ASSUMPTION VIOLATION SUMMARY")
-print("=============================================")
-
-# Create summary table
-violation_summary <- data.frame(
-  Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN",
-            "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"),
-  Normality = c(
-    ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK")
-  ),
-  Homoscedasticity = c(
-    ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK")
-  ),
-  Independence = c(
-    ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK")
-  ),
-  Influential_Obs = c(
-    ifelse(length(model1_influence$influential_cooks) > 0,
-           paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model2_influence$influential_cooks) > 0,
-           paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model3_influence$influential_cooks) > 0,
-           paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model4_influence$influential_cooks) > 0,
-           paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model5_influence$influential_cooks) > 0,
-           paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model6_influence$influential_cooks) > 0,
-           paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO")
-  ),
-  stringsAsFactors = FALSE
-)
-
-print(violation_summary)
-
-# =============================================================================
-# MODEL COMPARISON TABLE
-# =============================================================================
-
-print("\n=============================================")
-print("MODEL COMPARISON SUMMARY")
-print("=============================================")
-
-# Create model comparison table
-comparison_table <- data.frame(
-  Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN",
-            "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"),
-  R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared,
-                model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared),
-  Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared,
-                    model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared),
-  AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic,
-          model4_summary$aic, model5_summary$aic, model6_summary$aic),
-  BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic,
-          model4_summary$bic, model5_summary$bic, model6_summary$bic),
-  Significant = c(
-    ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO")
-  ),
-  stringsAsFactors = FALSE
-)
-
-print(comparison_table)
-
-print("\n=============================================")
-print("ANALYSIS COMPLETE")
-print("=============================================")
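Note on the deleted script above: when its Shapiro-Wilk or Breusch-Pagan checks flag violations, a common follow-up is to keep the same lm() fit but report heteroscedasticity-consistent standard errors. A minimal sketch of that pattern, not part of the repository; the simulated data and names below are illustrative only:

library(sandwich)  # vcovHC(): heteroscedasticity-consistent covariance estimators
library(lmtest)    # bptest() and coeftest()

set.seed(1)
# Toy data whose error variance grows with the predictor (heteroscedastic by design)
x <- runif(200, 0, 10)
y <- 2 + 0.5 * x + rnorm(200, sd = 0.3 * x)
m <- lm(y ~ x)

bptest(m)                                      # Breusch-Pagan should reject homoscedasticity here
coeftest(m, vcov. = vcovHC(m, type = "HC3"))   # same coefficients, robust (HC3) standard errors

The point estimates are unchanged; only the standard errors, and hence the t- and p-values, are recomputed.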
<- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Center the age variable for regression analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):") -print(table(data_clean$demo_edu_numeric)) -print("Age variable centered:") -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = 
paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - 
print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - -print("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS") -print("=============================================") - -# Model 1: Sex → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 1: Sex → EOHI-DGEN Mean") -print("=============================================") - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age (centered) → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 2: Age (centered) → EOHI-DGEN Mean") -print("=============================================") - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 3: Education → EOHI-DGEN Mean") -print("=============================================") - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean -print("\n=============================================") -print("MODEL 4: Sex → EHI-Global Mean") -print("=============================================") - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# 
Model 5: Age (centered) → EHI-Global Mean -print("\n=============================================") -print("MODEL 5: Age (centered) → EHI-Global Mean") -print("=============================================") - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") - -# Model 6: Education → EHI-Global Mean -print("\n=============================================") -print("MODEL 6: Education → EHI-Global Mean") -print("=============================================") - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - -print("\n=============================================") -print("ASSUMPTION VIOLATION SUMMARY") -print("=============================================") - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - 
ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - -print("\n=============================================") -print("MODEL COMPARISON SUMMARY") -print("=============================================") - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - -print("\n=============================================") -print("ANALYSIS COMPLETE") -print("=============================================") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016174349.r b/.history/eohi1/regressions e1 - assumptions_20251016174349.r deleted file mode 100644 index 35e8c92..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016174349.r +++ /dev/null @@ -1,403 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - 
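One correction worth noting for anyone resurrecting these snapshots: durbinWatsonTest() is exported by car, not lmtest (lmtest's equivalent is dwtest()), so the script only works because car is loaded first. A minimal sketch showing both calls on the same toy model (illustrative data only):

library(car)     # durbinWatsonTest(): DW statistic with a bootstrap p-value
library(lmtest)  # dwtest(): same statistic, htest-style output

set.seed(2)
d <- data.frame(x = rnorm(100))
d$y <- 1 + 2 * d$x + rnorm(100)
m <- lm(y ~ x, data = d)

car::durbinWatsonTest(m)  # returns components $dw and $p, as the deleted script expects
lmtest::dwtest(m)         # one-sided test by default; see alternative = "two.sided"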
-print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Education levels:") -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Center the age variable for regression analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):") -print(table(data_clean$demo_edu_numeric)) -print("Age variable centered:") -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = 
"black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - 
influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - -print("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS") -print("=============================================") - -# Model 1: Sex → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 1: Sex → EOHI-DGEN Mean") -print("=============================================") - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age (centered) → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 2: Age (centered) → EOHI-DGEN Mean") -print("=============================================") - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 3: Education → EOHI-DGEN Mean") -print("=============================================") - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean -print("\n=============================================") -print("MODEL 4: Sex → EHI-Global Mean") -print("=============================================") - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality 
<- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age (centered) → EHI-Global Mean -print("\n=============================================") -print("MODEL 5: Age (centered) → EHI-Global Mean") -print("=============================================") - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") - -# Model 6: Education → EHI-Global Mean -print("\n=============================================") -print("MODEL 6: Education → EHI-Global Mean") -print("=============================================") - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - -print("\n=============================================") -print("ASSUMPTION VIOLATION SUMMARY") -print("=============================================") - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p 
< 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - -print("\n=============================================") -print("MODEL COMPARISON SUMMARY") -print("=============================================") - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - -print("\n=============================================") -print("ANALYSIS COMPLETE") -print("=============================================") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016174357.r b/.history/eohi1/regressions e1 - assumptions_20251016174357.r deleted file mode 100644 index 35e8c92..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016174357.r +++ /dev/null @@ -1,403 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- 
read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Education levels:") -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Center the age variable for regression analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):") -print(table(data_clean$demo_edu_numeric)) -print("Age variable centered:") -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk 
test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", 
paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - -print("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS") -print("=============================================") - -# Model 1: Sex → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 1: Sex → EOHI-DGEN Mean") -print("=============================================") - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age (centered) → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 2: Age (centered) → EOHI-DGEN Mean") -print("=============================================") - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 3: Education → EOHI-DGEN Mean") -print("=============================================") - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- 
check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean -print("\n=============================================") -print("MODEL 4: Sex → EHI-Global Mean") -print("=============================================") - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age (centered) → EHI-Global Mean -print("\n=============================================") -print("MODEL 5: Age (centered) → EHI-Global Mean") -print("=============================================") - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") - -# Model 6: Education → EHI-Global Mean -print("\n=============================================") -print("MODEL 6: Education → EHI-Global Mean") -print("=============================================") - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - -print("\n=============================================") -print("ASSUMPTION VIOLATION SUMMARY") -print("=============================================") - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, 
"VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - -print("\n=============================================") -print("MODEL COMPARISON SUMMARY") -print("=============================================") - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - -print("\n=============================================") -print("ANALYSIS COMPLETE") -print("=============================================") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016174402.r b/.history/eohi1/regressions e1 - assumptions_20251016174402.r deleted file mode 100644 index 35e8c92..0000000 --- 
a/.history/eohi1/regressions e1 - assumptions_20251016174402.r +++ /dev/null @@ -1,403 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest; durbinWatsonTest comes from car - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print("Missing values check:") -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Education levels:") -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Center the age variable for regression analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):") -print(table(data_clean$demo_edu_numeric)) -print("Age variable centered:") -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial 
residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - 
print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - -print("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS") -print("=============================================") - -# Model 1: Sex → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 1: Sex → EOHI-DGEN Mean") -print("=============================================") - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age (centered) → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 2: Age (centered) → EOHI-DGEN Mean") -print("=============================================") - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 
3: Education → EOHI-DGEN Mean") -print("=============================================") - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean -print("\n=============================================") -print("MODEL 4: Sex → EHI-Global Mean") -print("=============================================") - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age (centered) → EHI-Global Mean -print("\n=============================================") -print("MODEL 5: Age (centered) → EHI-Global Mean") -print("=============================================") - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") - -# Model 6: Education → EHI-Global Mean -print("\n=============================================") -print("MODEL 6: Education → EHI-Global Mean") -print("=============================================") - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - -print("\n=============================================") -print("ASSUMPTION VIOLATION SUMMARY") -print("=============================================") - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - 
ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - -print("\n=============================================") -print("MODEL COMPARISON SUMMARY") -print("=============================================") - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - 
ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - -print("\n=============================================") -print("ANALYSIS COMPLETE") -print("=============================================") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016174430.r b/.history/eohi1/regressions e1 - assumptions_20251016174430.r deleted file mode 100644 index f8d6f2e..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016174430.r +++ /dev/null @@ -1,402 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest; durbinWatsonTest comes from car - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Education levels:") -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Center the age variable for regression analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):") -print(table(data_clean$demo_edu_numeric)) -print("Age variable centered:") -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# 
============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 
4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - -print("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS") -print("=============================================") - -# Model 1: Sex → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 1: Sex → EOHI-DGEN Mean") -print("=============================================") - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age (centered) → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 2: Age (centered) → EOHI-DGEN Mean") -print("=============================================") - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") 
-model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 3: Education → EOHI-DGEN Mean") -print("=============================================") - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean -print("\n=============================================") -print("MODEL 4: Sex → EHI-Global Mean") -print("=============================================") - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age (centered) → EHI-Global Mean -print("\n=============================================") -print("MODEL 5: Age (centered) → EHI-Global Mean") -print("=============================================") - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") - -# Model 6: Education → EHI-Global Mean -print("\n=============================================") -print("MODEL 6: Education → EHI-Global Mean") -print("=============================================") - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - -print("\n=============================================") -print("ASSUMPTION 
VIOLATION SUMMARY") -print("=============================================") - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - -print("\n=============================================") -print("MODEL COMPARISON SUMMARY") -print("=============================================") - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - 
model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - -print("\n=============================================") -print("ANALYSIS COMPLETE") -print("=============================================") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016174433.r b/.history/eohi1/regressions e1 - assumptions_20251016174433.r deleted file mode 100644 index ce6ea8d..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016174433.r +++ /dev/null @@ -1,401 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest; durbinWatsonTest comes from car - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Center the age variable for regression analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education recoding (1=HS, 2=College, 3=Undergrad, 4=Grad):") -print(table(data_clean$demo_edu_numeric)) -print("Age variable centered:") -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → 
ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- 
function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - -print("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS") -print("=============================================") - -# Model 1: Sex → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 1: Sex → EOHI-DGEN Mean") -print("=============================================") - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 
2: Age (centered) → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 2: Age (centered) → EOHI-DGEN Mean") -print("=============================================") - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 3: Education → EOHI-DGEN Mean") -print("=============================================") - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean -print("\n=============================================") -print("MODEL 4: Sex → EHI-Global Mean") -print("=============================================") - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age (centered) → EHI-Global Mean -print("\n=============================================") -print("MODEL 5: Age (centered) → EHI-Global Mean") -print("=============================================") - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") - -# Model 6: Education → EHI-Global Mean -print("\n=============================================") -print("MODEL 6: Education → EHI-Global Mean") -print("=============================================") - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → 
EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - -print("\n=============================================") -print("ASSUMPTION VIOLATION SUMMARY") -print("=============================================") - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - -print("\n=============================================") -print("MODEL COMPARISON SUMMARY") -print("=============================================") - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = 
c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - -print("\n=============================================") -print("ANALYSIS COMPLETE") -print("=============================================") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016174436.r b/.history/eohi1/regressions e1 - assumptions_20251016174436.r deleted file mode 100644 index b5bbe19..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016174436.r +++ /dev/null @@ -1,398 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Center the age variable for regression analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print(table(data_clean$demo_sex_numeric)) -print(table(data_clean$demo_edu_numeric)) -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → 
eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - print(paste("=== LINEARITY CHECK:", model_name, "===")) - - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = 
"red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - -print("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS") -print("=============================================") - -# Model 1: Sex → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 1: Sex → EOHI-DGEN Mean") -print("=============================================") - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") 
-model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age (centered) → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 2: Age (centered) → EOHI-DGEN Mean") -print("=============================================") - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 3: Education → EOHI-DGEN Mean") -print("=============================================") - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean -print("\n=============================================") -print("MODEL 4: Sex → EHI-Global Mean") -print("=============================================") - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age (centered) → EHI-Global Mean -print("\n=============================================") -print("MODEL 5: Age (centered) → EHI-Global Mean") -print("=============================================") - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") - -# Model 6: Education → EHI-Global Mean -print("\n=============================================") -print("MODEL 6: Education → EHI-Global Mean") -print("=============================================") - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced 
<- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - -print("\n=============================================") -print("ASSUMPTION VIOLATION SUMMARY") -print("=============================================") - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - -print("\n=============================================") -print("MODEL COMPARISON SUMMARY") -print("=============================================") - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → 
EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - -print("\n=============================================") -print("ANALYSIS COMPLETE") -print("=============================================") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016174439.r b/.history/eohi1/regressions e1 - assumptions_20251016174439.r deleted file mode 100644 index dbd7c99..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016174439.r +++ /dev/null @@ -1,396 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Center the age variable for regression analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print(table(data_clean$demo_sex_numeric)) -print(table(data_clean$demo_edu_numeric)) -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# 
============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - print(paste("=== NORMALITY CHECK:", model_name, "===")) - - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- 
ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - -print("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS") -print("=============================================") - -# Model 1: Sex → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 1: Sex → EOHI-DGEN Mean") -print("=============================================") - -model1_summary <- 
get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age (centered) → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 2: Age (centered) → EOHI-DGEN Mean") -print("=============================================") - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 3: Education → EOHI-DGEN Mean") -print("=============================================") - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean -print("\n=============================================") -print("MODEL 4: Sex → EHI-Global Mean") -print("=============================================") - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age (centered) → EHI-Global Mean -print("\n=============================================") -print("MODEL 5: Age (centered) → EHI-Global Mean") -print("=============================================") - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") - -# Model 6: Education → EHI-Global Mean -print("\n=============================================") -print("MODEL 6: Education → EHI-Global Mean") 
-print("=============================================") - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - -print("\n=============================================") -print("ASSUMPTION VIOLATION SUMMARY") -print("=============================================") - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - -print("\n=============================================") -print("MODEL COMPARISON SUMMARY") 
-print("=============================================") - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - -print("\n=============================================") -print("ANALYSIS COMPLETE") -print("=============================================") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016174444.r b/.history/eohi1/regressions e1 - assumptions_20251016174444.r deleted file mode 100644 index 213953d..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016174444.r +++ /dev/null @@ -1,394 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Center the age variable for regression analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print(table(data_clean$demo_sex_numeric)) -print(table(data_clean$demo_edu_numeric)) -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 
2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - print(paste("=== HOMOSCEDASTICITY CHECK:", model_name, "===")) - - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits 
= 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - -print("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS") -print("=============================================") - -# Model 1: Sex → EOHI-DGEN Mean 
-print("\n=============================================") -print("MODEL 1: Sex → EOHI-DGEN Mean") -print("=============================================") - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age (centered) → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 2: Age (centered) → EOHI-DGEN Mean") -print("=============================================") - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 3: Education → EOHI-DGEN Mean") -print("=============================================") - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean -print("\n=============================================") -print("MODEL 4: Sex → EHI-Global Mean") -print("=============================================") - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age (centered) → EHI-Global Mean -print("\n=============================================") -print("MODEL 5: Age (centered) → EHI-Global Mean") -print("=============================================") - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") 
- -# Model 6: Education → EHI-Global Mean -print("\n=============================================") -print("MODEL 6: Education → EHI-Global Mean") -print("=============================================") - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - -print("\n=============================================") -print("ASSUMPTION VIOLATION SUMMARY") -print("=============================================") - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# 
============================================================================= - -print("\n=============================================") -print("MODEL COMPARISON SUMMARY") -print("=============================================") - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - -print("\n=============================================") -print("ANALYSIS COMPLETE") -print("=============================================") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016174451.r b/.history/eohi1/regressions e1 - assumptions_20251016174451.r deleted file mode 100644 index f2b56f2..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016174451.r +++ /dev/null @@ -1,392 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Center the age variable for regression analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# 
Verify recoding -print(table(data_clean$demo_sex_numeric)) -print(table(data_clean$demo_edu_numeric)) -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - print(paste("=== INDEPENDENCE CHECK:", model_name, "===")) - - # Durbin-Watson test 
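  # Note: durbinWatsonTest() is provided by the car package, not lmtest
  # (lmtest's equivalent is dwtest()), so the "# For bptest and
  # durbinWatsonTest" comment on library(lmtest) above is slightly off.
  # Also, with cross-sectional survey data the row order is arbitrary, so a
  # significant Durbin-Watson result here reflects how the file happens to be
  # sorted rather than a substantive independence violation.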
- dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - -print("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS") 
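# --- Editorial aside (not part of the original script) ----------------------
# The influence cut-offs computed in check_influence() above are sample-size
# dependent. A worked example under the hypothetical assumption of n = 200
# observations and p = 2 coefficients (intercept + one predictor, matching
# the simple regressions fitted here):
n_hyp <- 200; p_hyp <- 2
4 / n_hyp              # Cook's D threshold : 0.02
2 * p_hyp / n_hyp      # leverage threshold : 0.02
2 * sqrt(p_hyp / n_hyp) # DFFITS threshold  : 0.2
# -----------------------------------------------------------------------------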
-print("=============================================") - -# Model 1: Sex → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 1: Sex → EOHI-DGEN Mean") -print("=============================================") - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age (centered) → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 2: Age (centered) → EOHI-DGEN Mean") -print("=============================================") - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 3: Education → EOHI-DGEN Mean") -print("=============================================") - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean -print("\n=============================================") -print("MODEL 4: Sex → EHI-Global Mean") -print("=============================================") - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age (centered) → EHI-Global Mean -print("\n=============================================") -print("MODEL 5: Age (centered) → EHI-Global Mean") -print("=============================================") - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") 
-model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") - -# Model 6: Education → EHI-Global Mean -print("\n=============================================") -print("MODEL 6: Education → EHI-Global Mean") -print("=============================================") - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - -print("\n=============================================") -print("ASSUMPTION VIOLATION SUMMARY") -print("=============================================") - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# 
============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - -print("\n=============================================") -print("MODEL COMPARISON SUMMARY") -print("=============================================") - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - -print("\n=============================================") -print("ANALYSIS COMPLETE") -print("=============================================") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016174501.r b/.history/eohi1/regressions e1 - assumptions_20251016174501.r deleted file mode 100644 index 31166d8..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016174501.r +++ /dev/null @@ -1,390 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Center the age variable for regression 
analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print(table(data_clean$demo_sex_numeric)) -print(table(data_clean$demo_edu_numeric)) -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- 
function(model, model_name) { - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - print(paste("=== INFLUENCE CHECK:", model_name, "===")) - - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - -print("COMPREHENSIVE REGRESSION 
ASSUMPTION ANALYSIS") -print("=============================================") - -# Model 1: Sex → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 1: Sex → EOHI-DGEN Mean") -print("=============================================") - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age (centered) → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 2: Age (centered) → EOHI-DGEN Mean") -print("=============================================") - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 3: Education → EOHI-DGEN Mean") -print("=============================================") - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean -print("\n=============================================") -print("MODEL 4: Sex → EHI-Global Mean") -print("=============================================") - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age (centered) → EHI-Global Mean -print("\n=============================================") -print("MODEL 5: Age (centered) → EHI-Global Mean") -print("=============================================") - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age 
(centered) → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") - -# Model 6: Education → EHI-Global Mean -print("\n=============================================") -print("MODEL 6: Education → EHI-Global Mean") -print("=============================================") - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - -print("\n=============================================") -print("ASSUMPTION VIOLATION SUMMARY") -print("=============================================") - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# 
============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - -print("\n=============================================") -print("MODEL COMPARISON SUMMARY") -print("=============================================") - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - -print("\n=============================================") -print("ANALYSIS COMPLETE") -print("=============================================") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016174512.r b/.history/eohi1/regressions e1 - assumptions_20251016174512.r deleted file mode 100644 index 8ef951b..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016174512.r +++ /dev/null @@ -1,388 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Center the age variable for regression 
analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print(table(data_clean$demo_sex_numeric)) -print(table(data_clean$demo_edu_numeric)) -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- 
function(model, model_name) { - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - print(paste("=== MODEL SUMMARY:", model_name, "===")) - - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - -print("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS") 
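# --- Editorial sketch (not part of the original script) ---------------------
# If the Breusch-Pagan test run by check_homoscedasticity() flags
# heteroscedasticity, one common remedy is to keep the OLS fit but report
# heteroscedasticity-consistent (HC) standard errors. This assumes the
# sandwich package is installed; coeftest() comes from lmtest, which the
# script already loads.
library(sandwich)
robust_test <- coeftest(models$sex_eohiDGEN,
                        vcov. = vcovHC(models$sex_eohiDGEN, type = "HC3"))
print(robust_test)  # same slope estimates, robust SEs / t / p
# -----------------------------------------------------------------------------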
-print("=============================================") - -# Model 1: Sex → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 1: Sex → EOHI-DGEN Mean") -print("=============================================") - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age (centered) → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 2: Age (centered) → EOHI-DGEN Mean") -print("=============================================") - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 3: Education → EOHI-DGEN Mean") -print("=============================================") - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean -print("\n=============================================") -print("MODEL 4: Sex → EHI-Global Mean") -print("=============================================") - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age (centered) → EHI-Global Mean -print("\n=============================================") -print("MODEL 5: Age (centered) → EHI-Global Mean") -print("=============================================") - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") 
-model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") - -# Model 6: Education → EHI-Global Mean -print("\n=============================================") -print("MODEL 6: Education → EHI-Global Mean") -print("=============================================") - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - -print("\n=============================================") -print("ASSUMPTION VIOLATION SUMMARY") -print("=============================================") - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# 
============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - -print("\n=============================================") -print("MODEL COMPARISON SUMMARY") -print("=============================================") - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - -print("\n=============================================") -print("ANALYSIS COMPLETE") -print("=============================================") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016174519.r b/.history/eohi1/regressions e1 - assumptions_20251016174519.r deleted file mode 100644 index 647e123..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016174519.r +++ /dev/null @@ -1,386 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Center the age variable for regression 
analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print(table(data_clean$demo_sex_numeric)) -print(table(data_clean$demo_edu_numeric)) -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- 
function(model, model_name) { - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - -print("COMPREHENSIVE REGRESSION ASSUMPTION ANALYSIS") -print("=============================================") - -# Model 1: Sex → EOHI-DGEN Mean 
-print("\n=============================================") -print("MODEL 1: Sex → EOHI-DGEN Mean") -print("=============================================") - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age (centered) → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 2: Age (centered) → EOHI-DGEN Mean") -print("=============================================") - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean -print("\n=============================================") -print("MODEL 3: Education → EOHI-DGEN Mean") -print("=============================================") - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean -print("\n=============================================") -print("MODEL 4: Sex → EHI-Global Mean") -print("=============================================") - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age (centered) → EHI-Global Mean -print("\n=============================================") -print("MODEL 5: Age (centered) → EHI-Global Mean") -print("=============================================") - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") 
-
-# Model 6: Education → EHI-Global Mean
-print("\n=============================================")
-print("MODEL 6: Education → EHI-Global Mean")
-print("=============================================")
-
-model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global")
-model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global")
-model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global")
-model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global")
-model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global")
-model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global")
-
-# =============================================================================
-# SUMMARY TABLE OF ASSUMPTION VIOLATIONS
-# =============================================================================
-
-print("\n=============================================")
-print("ASSUMPTION VIOLATION SUMMARY")
-print("=============================================")
-
-# Create summary table
-violation_summary <- data.frame(
-  Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN",
-            "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"),
-  Normality = c(
-    ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK")
-  ),
-  Homoscedasticity = c(
-    ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK")
-  ),
-  Independence = c(
-    ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK")
-  ),
-  Influential_Obs = c(
-    ifelse(length(model1_influence$influential_cooks) > 0,
-           paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model2_influence$influential_cooks) > 0,
-           paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model3_influence$influential_cooks) > 0,
-           paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model4_influence$influential_cooks) > 0,
-           paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model5_influence$influential_cooks) > 0,
-           paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model6_influence$influential_cooks) > 0,
-           paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO")
  ),
-  stringsAsFactors = FALSE
-)
-
-print(violation_summary)
-
-# =============================================================================
-# MODEL COMPARISON TABLE
-# =============================================================================
-
-print("\n=============================================")
-print("MODEL COMPARISON SUMMARY")
-print("=============================================")
-
-# Create model comparison table
-comparison_table <- data.frame(
-  Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN",
-            "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"),
-  R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared,
-                model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared),
-  Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared,
-                    model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared),
-  AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic,
-          model4_summary$aic, model5_summary$aic, model6_summary$aic),
-  BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic,
-          model4_summary$bic, model5_summary$bic, model6_summary$bic),
-  Significant = c(
-    ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO")
-  ),
-  stringsAsFactors = FALSE
-)
-
-print(comparison_table)
-
-print("\n=============================================")
-print("ANALYSIS COMPLETE")
-print("=============================================")
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016174521.r b/.history/eohi1/regressions e1 - assumptions_20251016174521.r
deleted file mode 100644
index bf8aa83..0000000
--- a/.history/eohi1/regressions e1 - assumptions_20251016174521.r
+++ /dev/null
@@ -1,384 +0,0 @@
-# Regression Analysis - Assumption Checking
-# IVs: demo_sex, demo_age, demo_edu
-# DVs: eohiDGEN_mean, ehi_global_mean
-
-options(scipen = 999)
-
-library(car)
-library(performance)
-#library(see)
-library(ggplot2)
-library(gridExtra)
-library(dplyr)
-library(lmtest) # For bptest and durbinWatsonTest
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-data <- read.csv("ehi1.csv")
-
-# Check for missing values
-missing_summary <- data %>%
-  select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  summarise_all(~sum(is.na(.)))
-print(missing_summary)
-
-# Remove rows with missing values
-data_clean <- data %>%
-  select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  filter(complete.cases(.))
-
-print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x ")))
-
-# Recode demo_sex as numeric for regression (0 = Female, 1 = Male)
-data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0)
-
-# Check demo_edu levels and recode if needed
-print(table(data_clean$demo_edu))
-
-# Recode education as ordinal (assuming higher values = more education)
-edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate",
-                "University - Undergraduate", "University - Graduate")
-data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels)
-
-# Center the age variable for regression analysis
-data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1]
-
-# Verify recoding
-print(table(data_clean$demo_sex_numeric))
-print(table(data_clean$demo_edu_numeric))
-print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2)))
-print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2)))
-
-# =============================================================================
-# REGRESSION MODELS
-# =============================================================================
-
-# Define the 6 regression models
-models <- list()
-
-# Model 1: demo_sex → eohiDGEN_mean
-models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 2: demo_age_1 → eohiDGEN_mean (centered)
-models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean)
-
-# Model 3: demo_edu → eohiDGEN_mean
-models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean)
-
-# Model 4: demo_sex → ehi_global_mean
-models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 5: demo_age_1 → ehi_global_mean (centered)
-models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean)
-
-# Model 6: demo_edu → ehi_global_mean
-models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean)
-
-# =============================================================================
-# ASSUMPTION CHECKING FUNCTIONS
-# =============================================================================
-
-# Function to check linearity assumption
-check_linearity <- function(model, model_name) {
-  # Residuals vs Fitted plot
-  plot(model, which = 1, main = paste("Linearity:", model_name))
-
-  # Component + residual plot (partial residual plot)
-  crPlots(model, main = paste("Component+Residual Plot:", model_name))
-
-  return(NULL)
-}
-
-# Function to check normality of residuals
-check_normality <- function(model, model_name) {
-  # Q-Q plot
-  plot(model, which = 2, main = paste("Q-Q Plot:", model_name))
-
-  # Shapiro-Wilk test
-  residuals <- residuals(model)
-  shapiro_test <- shapiro.test(residuals)
-  print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5)))
-
-  # Kolmogorov-Smirnov test
-  ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals))
-  print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5)))
-
-  # Histogram of residuals
-  hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) +
-    geom_histogram(bins = 30, fill = "lightblue", color = "black") +
-    ggtitle(paste("Residuals Histogram:", model_name)) +
-    theme_minimal()
-  print(hist_plot)
-
-  return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value))
-}
-
-# Function to check homoscedasticity (constant variance)
-check_homoscedasticity <- function(model, model_name) {
-  # Scale-Location plot
-  plot(model, which = 3, main = paste("Scale-Location Plot:", model_name))
-
-  # Breusch-Pagan test
-  bp_test <- bptest(model)
-  print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5)))
-
-  # White test (if available)
-  tryCatch({
-    white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2))
-    print(paste("White test p-value:", format(white_test$p.value, digits = 5)))
-  }, error = function(e) {
-    print("White test not available for this model")
-  })
-
-  return(list(bp_p = bp_test$p.value))
-}
-
-# Function to check independence (no autocorrelation)
-check_independence <- function(model, model_name) {
-  # Durbin-Watson test
-  dw_test <- durbinWatsonTest(model)
-  print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5)))
-  print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5)))
-
-  # Residuals vs Order plot
-  residuals_vs_order <- ggplot(data.frame(
-    residuals = residuals(model),
-    order = seq_along(residuals(model))
-  ), aes(x = order, y = residuals)) +
-    geom_point(color = "black") +
-    geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
-    ggtitle(paste("Residuals vs Order:", model_name)) +
-    theme_minimal()
-  print(residuals_vs_order)
-
-  return(list(dw_stat = dw_test$dw, dw_p = dw_test$p))
-}
-
-# Function to check for influential observations
-check_influence <- function(model, model_name) {
-  # Cook's Distance plot
-  plot(model, which = 4, main = paste("Cook's Distance:", model_name))
-
-  # Calculate influence measures
-  cooks_d <- cooks.distance(model)
-  leverage <- hatvalues(model)
-  dffits_val <- dffits(model)
-
-  # Identify influential observations
-  cooks_threshold <- 4/length(cooks_d) # Cook's D threshold
-  leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold
-  dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold
-
-  influential_cooks <- which(cooks_d > cooks_threshold)
-  influential_leverage <- which(leverage > leverage_threshold)
-  influential_dffits <- which(abs(dffits_val) > dffits_threshold)
-
-  print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5)))
-  print(paste("Influential observations (Cook's D):", length(influential_cooks)))
-  print(paste("Leverage threshold:", format(leverage_threshold, digits = 5)))
-  print(paste("High leverage observations:", length(influential_leverage)))
-  print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5)))
-  print(paste("Influential observations (DFFITS):", length(influential_dffits)))
-
-  if (length(influential_cooks) > 0) {
-    print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", ")))
-  }
-  if (length(influential_leverage) > 0) {
-    print(paste("High leverage cases:", paste(influential_leverage, collapse = ", ")))
-  }
-  if (length(influential_dffits) > 0) {
-    print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", ")))
-  }
-
-  return(list(influential_cooks = influential_cooks,
-              influential_leverage = influential_leverage,
-              influential_dffits = influential_dffits))
-}
-
-# Function to get comprehensive model summary
-get_model_summary <- function(model, model_name) {
-  # Basic model summary
-  summary_model <- summary(model)
-  print(summary_model)
-
-  # R-squared and adjusted R-squared
-  print(paste("R-squared:", format(summary_model$r.squared, digits = 5)))
-  print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5)))
-
-  # AIC and BIC
-  aic_val <- AIC(model)
-  bic_val <- BIC(model)
-  print(paste("AIC:", format(aic_val, digits = 5)))
-  print(paste("BIC:", format(bic_val, digits = 5)))
-
-  return(list(summary = summary_model, r_squared = summary_model$r.squared,
-              adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val))
-}
-
-# =============================================================================
-# RUN ASSUMPTION CHECKS - MODEL BY MODEL
-# =============================================================================
-
-
-# Model 1: Sex → EOHI-DGEN Mean
-print("\n=============================================")
-print("MODEL 1: Sex → EOHI-DGEN Mean")
-print("=============================================")
-
-model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-
-# Model 2: Age (centered) → EOHI-DGEN Mean
-print("\n=============================================")
-print("MODEL 2: Age (centered) → EOHI-DGEN Mean")
-print("=============================================")
-
-model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
-model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
-model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
-model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
-model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
-model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
-
-# Model 3: Education → EOHI-DGEN Mean
-print("\n=============================================")
-print("MODEL 3: Education → EOHI-DGEN Mean")
-print("=============================================")
-
-model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-
-# Model 4: Sex → EHI-Global Mean
-print("\n=============================================")
-print("MODEL 4: Sex → EHI-Global Mean")
-print("=============================================")
-
-model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global")
-model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global")
-model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global")
-model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global")
-model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global")
-model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global")
-
-# Model 5: Age (centered) → EHI-Global Mean
-print("\n=============================================")
-print("MODEL 5: Age (centered) → EHI-Global Mean")
-print("=============================================")
-
-model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global")
-model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global")
-model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global")
-model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global")
-model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global")
-model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global")
-
-# Model 6: Education → EHI-Global Mean
-print("\n=============================================")
-print("MODEL 6: Education → EHI-Global Mean")
-print("=============================================") - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - -print("\n=============================================") -print("ASSUMPTION VIOLATION SUMMARY") -print("=============================================") - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - -print("\n=============================================") -print("MODEL COMPARISON SUMMARY") 
-print("=============================================") - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - -print("\n=============================================") -print("ANALYSIS COMPLETE") -print("=============================================") diff --git a/.history/eohi1/regressions e1 - assumptions_20251016174524.r b/.history/eohi1/regressions e1 - assumptions_20251016174524.r deleted file mode 100644 index e60d7c3..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016174524.r +++ /dev/null @@ -1,381 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Center the age variable for regression analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print(table(data_clean$demo_sex_numeric)) -print(table(data_clean$demo_edu_numeric)) -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 
2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order 
<- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - - -# Model 1: Sex → EOHI-DGEN Mean - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") 
-model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-
-# Model 2: Age (centered) → EOHI-DGEN Mean
-print("\n=============================================")
-print("MODEL 2: Age (centered) → EOHI-DGEN Mean")
-print("=============================================")
-
-model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
-model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
-model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
-model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
-model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
-model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
-
-# Model 3: Education → EOHI-DGEN Mean
-print("\n=============================================")
-print("MODEL 3: Education → EOHI-DGEN Mean")
-print("=============================================")
-
-model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-
-# Model 4: Sex → EHI-Global Mean
-print("\n=============================================")
-print("MODEL 4: Sex → EHI-Global Mean")
-print("=============================================")
-
-model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global")
-model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global")
-model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global")
-model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global")
-model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global")
-model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global")
-
-# Model 5: Age (centered) → EHI-Global Mean
-print("\n=============================================")
-print("MODEL 5: Age (centered) → EHI-Global Mean")
-print("=============================================")
-
-model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global")
-model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global")
-model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global")
-model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global")
-model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global")
-model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global")
-
-# Model 6: Education → EHI-Global Mean
-print("\n=============================================")
-print("MODEL 6: Education → EHI-Global Mean")
-print("=============================================")
-
-model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global")
-model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global")
-model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global")
-model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global")
-model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global")
-model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global")
-
-# =============================================================================
-# SUMMARY TABLE OF ASSUMPTION VIOLATIONS
-# =============================================================================
-
-print("\n=============================================")
-print("ASSUMPTION VIOLATION SUMMARY")
-print("=============================================")
-
-# Create summary table
-violation_summary <- data.frame(
-  Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN",
-            "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"),
-  Normality = c(
-    ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK")
-  ),
-  Homoscedasticity = c(
-    ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK")
-  ),
-  Independence = c(
-    ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK")
-  ),
-  Influential_Obs = c(
-    ifelse(length(model1_influence$influential_cooks) > 0,
-           paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model2_influence$influential_cooks) > 0,
-           paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model3_influence$influential_cooks) > 0,
-           paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model4_influence$influential_cooks) > 0,
-           paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model5_influence$influential_cooks) > 0,
-           paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model6_influence$influential_cooks) > 0,
-           paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO")
-  ),
-  stringsAsFactors = FALSE
-)
-
-print(violation_summary)
-
-# =============================================================================
-# MODEL COMPARISON TABLE
-# =============================================================================
-
-print("\n=============================================")
-print("MODEL COMPARISON SUMMARY")
-print("=============================================")
-
-# Create model comparison table
-comparison_table <- data.frame(
-  Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN",
-            "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"),
-  R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared,
-                model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared),
-  Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared,
-                    model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared),
-  AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic,
-          model4_summary$aic, model5_summary$aic, model6_summary$aic),
-  BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic,
-          model4_summary$bic, model5_summary$bic, model6_summary$bic),
-  Significant = c(
-    ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO")
-  ),
-  stringsAsFactors = FALSE
-)
-
-print(comparison_table)
-
-print("\n=============================================")
-print("ANALYSIS COMPLETE")
-print("=============================================")
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016174526.r b/.history/eohi1/regressions e1 - assumptions_20251016174526.r
deleted file mode 100644
index aa497c4..0000000
--- a/.history/eohi1/regressions e1 - assumptions_20251016174526.r
+++ /dev/null
@@ -1,378 +0,0 @@
-# Regression Analysis - Assumption Checking
-# IVs: demo_sex, demo_age, demo_edu
-# DVs: eohiDGEN_mean, ehi_global_mean
-
-options(scipen = 999)
-
-library(car)
-library(performance)
-#library(see)
-library(ggplot2)
-library(gridExtra)
-library(dplyr)
-library(lmtest) # For bptest and durbinWatsonTest
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-data <- read.csv("ehi1.csv")
-
-# Check for missing values
-missing_summary <- data %>%
-  select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  summarise_all(~sum(is.na(.)))
-print(missing_summary)
-
-# Remove rows with missing values
-data_clean <- data %>%
-  select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  filter(complete.cases(.))
-
-print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x ")))
-
-# Recode demo_sex as numeric for regression (0 = Female, 1 = Male)
-data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0)
-
-# Check demo_edu levels and recode if needed
-print(table(data_clean$demo_edu))
-
-# Recode education as ordinal (assuming higher values = more education)
-edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate",
-                "University - Undergraduate", "University - Graduate")
-data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels)
-
-# Center the age variable for regression analysis
-data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1]
-
-# Verify recoding
-print(table(data_clean$demo_sex_numeric))
-print(table(data_clean$demo_edu_numeric))
-print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2)))
-print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2)))
-
-# =============================================================================
-# REGRESSION MODELS
-# =============================================================================
-
-# Define the 6 regression models
-models <- list()
-
-# Model 1: demo_sex → eohiDGEN_mean
-models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 2: demo_age_1 → eohiDGEN_mean (centered)
-models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean)
-
-# Model 3: demo_edu → eohiDGEN_mean
-models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean)
-
-# Model 4: demo_sex → ehi_global_mean
-models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 5: demo_age_1 → ehi_global_mean (centered)
-models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean)
-
-# Model 6: demo_edu → ehi_global_mean
-models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean)
-
-# =============================================================================
-# ASSUMPTION CHECKING FUNCTIONS
-# =============================================================================
-
-# Function to check linearity assumption
-check_linearity <- function(model, model_name) {
-  # Residuals vs Fitted plot
-  plot(model, which = 1, main = paste("Linearity:", model_name))
-
-  # Component + residual plot (partial residual plot)
-  crPlots(model, main = paste("Component+Residual Plot:", model_name))
-
-  return(NULL)
-}
-
-# Function to check normality of residuals
-check_normality <- function(model, model_name) {
-  # Q-Q plot
-  plot(model, which = 2, main = paste("Q-Q Plot:", model_name))
-
-  # Shapiro-Wilk test
-  residuals <- residuals(model)
-  shapiro_test <- shapiro.test(residuals)
-  print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5)))
-
-  # Kolmogorov-Smirnov test
-  ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals))
-  print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5)))
-
-  # Histogram of residuals
-  hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) +
-    geom_histogram(bins = 30, fill = "lightblue", color = "black") +
-    ggtitle(paste("Residuals Histogram:", model_name)) +
-    theme_minimal()
-  print(hist_plot)
-
-  return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value))
-}
-
-# Function to check homoscedasticity (constant variance)
-check_homoscedasticity <- function(model, model_name) {
-  # Scale-Location plot
-  plot(model, which = 3, main = paste("Scale-Location Plot:", model_name))
-
-  # Breusch-Pagan test
-  bp_test <- bptest(model)
-  print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5)))
-
-  # White test (if available)
-  tryCatch({
-    white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2))
-    print(paste("White test p-value:", format(white_test$p.value, digits = 5)))
-  }, error = function(e) {
-    print("White test not available for this model")
-  })
-
-  return(list(bp_p = bp_test$p.value))
-}
-
-# Function to check independence (no autocorrelation)
-check_independence <- function(model, model_name) {
-  # Durbin-Watson test
-  dw_test <- durbinWatsonTest(model)
-  print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5)))
-  print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5)))
-
-  # Residuals vs Order plot
-  residuals_vs_order <- ggplot(data.frame(
-    residuals = residuals(model),
-    order = seq_along(residuals(model))
-  ), aes(x = order, y = residuals)) +
-    geom_point(color = "black") +
-    geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
-    ggtitle(paste("Residuals vs Order:", model_name)) +
-    theme_minimal()
-  print(residuals_vs_order)
-
-  return(list(dw_stat = dw_test$dw, dw_p = dw_test$p))
-}
-
-# Function to check for influential observations
-check_influence <- function(model, model_name) {
-  # Cook's Distance plot
-  plot(model, which = 4, main = paste("Cook's Distance:", model_name))
-
-  # Calculate influence measures
-  cooks_d <- cooks.distance(model)
-  leverage <- hatvalues(model)
-  dffits_val <- dffits(model)
-
-  # Identify influential observations
-  cooks_threshold <- 4/length(cooks_d) # Cook's D threshold
-  leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold
-  dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold
-
-  influential_cooks <- which(cooks_d > cooks_threshold)
-  influential_leverage <- which(leverage > leverage_threshold)
-  influential_dffits <- which(abs(dffits_val) > dffits_threshold)
-
-  print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5)))
-  print(paste("Influential observations (Cook's D):", length(influential_cooks)))
-  print(paste("Leverage threshold:", format(leverage_threshold, digits = 5)))
-  print(paste("High leverage observations:", length(influential_leverage)))
-  print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5)))
-  print(paste("Influential observations (DFFITS):", length(influential_dffits)))
-
-  if (length(influential_cooks) > 0) {
-    print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", ")))
-  }
-  if (length(influential_leverage) > 0) {
-    print(paste("High leverage cases:", paste(influential_leverage, collapse = ", ")))
-  }
-  if (length(influential_dffits) > 0) {
-    print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", ")))
-  }
-
-  return(list(influential_cooks = influential_cooks,
-              influential_leverage = influential_leverage,
-              influential_dffits = influential_dffits))
-}
-
-# Function to get comprehensive model summary
-get_model_summary <- function(model, model_name) {
-  # Basic model summary
-  summary_model <- summary(model)
-  print(summary_model)
-
-  # R-squared and adjusted R-squared
-  print(paste("R-squared:", format(summary_model$r.squared, digits = 5)))
-  print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5)))
-
-  # AIC and BIC
-  aic_val <- AIC(model)
-  bic_val <- BIC(model)
-  print(paste("AIC:", format(aic_val, digits = 5)))
-  print(paste("BIC:", format(bic_val, digits = 5)))
-
-  return(list(summary = summary_model, r_squared = summary_model$r.squared,
-              adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val))
-}
-
-# =============================================================================
-# RUN ASSUMPTION CHECKS - MODEL BY MODEL
-# =============================================================================
-
-
-# Model 1: Sex → EOHI-DGEN Mean
-
-model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-
-# Model 2: Age (centered) → EOHI-DGEN Mean
-
-model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
-model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
-model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
-model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
-model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
-model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
-
-# Model 3: Education → EOHI-DGEN Mean
-print("\n=============================================")
-print("MODEL 3: Education → EOHI-DGEN Mean")
-print("=============================================")
-
-model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-
-# Model 4: Sex → EHI-Global Mean
-print("\n=============================================")
-print("MODEL 4: Sex → EHI-Global Mean")
-print("=============================================")
-
-model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global")
-model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global")
-model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global")
-model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global")
-model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global")
-model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global")
-
-# Model 5: Age (centered) → EHI-Global Mean
-print("\n=============================================")
-print("MODEL 5: Age (centered) → EHI-Global Mean")
-print("=============================================")
-
-model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global")
-model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global")
-model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global")
-model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global")
-model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global")
-model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global")
-
-# Model 6: Education → EHI-Global Mean
-print("\n=============================================")
-print("MODEL 6: Education → EHI-Global Mean")
-print("=============================================")
-
-model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global")
-model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global")
-model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global")
-model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global")
-model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global")
-model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global")
-
-# =============================================================================
-# SUMMARY TABLE OF ASSUMPTION VIOLATIONS
-# =============================================================================
-
-print("\n=============================================")
-print("ASSUMPTION VIOLATION SUMMARY")
-print("=============================================")
-
-# Create summary table
-violation_summary <- data.frame(
-  Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN",
-            "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"),
-  Normality = c(
-    ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK")
-  ),
-  Homoscedasticity = c(
-    ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK")
-  ),
-  Independence = c(
-    ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK")
-  ),
-  Influential_Obs = c(
-    ifelse(length(model1_influence$influential_cooks) > 0,
-           paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model2_influence$influential_cooks) > 0,
-           paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model3_influence$influential_cooks) > 0,
-           paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model4_influence$influential_cooks) > 0,
-           paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model5_influence$influential_cooks) > 0,
-           paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model6_influence$influential_cooks) > 0,
-           paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO")
-  ),
-  stringsAsFactors = FALSE
-)
-
-print(violation_summary)
-
-# =============================================================================
-# MODEL COMPARISON TABLE
-# =============================================================================
-
-print("\n=============================================")
-print("MODEL COMPARISON SUMMARY")
-print("=============================================")
-
-# Create model comparison table
-comparison_table <- data.frame(
-  Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN",
-            "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"),
-  R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared,
-                model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared),
-  Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared,
-                    model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared),
-  AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic,
-          model4_summary$aic, model5_summary$aic, model6_summary$aic),
-  BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic,
-          model4_summary$bic, model5_summary$bic, model6_summary$bic),
-  Significant = c(
-    ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO")
-  ),
-  stringsAsFactors = FALSE
-)
-
-print(comparison_table)
-
-print("\n=============================================")
-print("ANALYSIS COMPLETE")
-print("=============================================")
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016174528.r b/.history/eohi1/regressions e1 - assumptions_20251016174528.r
deleted file mode 100644
index 7dc7139..0000000
--- a/.history/eohi1/regressions e1 - assumptions_20251016174528.r
+++ /dev/null
@@ -1,375 +0,0 @@
-# Regression Analysis - Assumption Checking
-# IVs: demo_sex, demo_age, demo_edu
-# DVs: eohiDGEN_mean, ehi_global_mean
-
-options(scipen = 999)
-
-library(car)
-library(performance)
-#library(see)
-library(ggplot2)
-library(gridExtra)
-library(dplyr)
-library(lmtest) # For bptest and durbinWatsonTest
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-data <- read.csv("ehi1.csv")
-
-# Check for missing values
-missing_summary <- data %>%
-  select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  summarise_all(~sum(is.na(.)))
-print(missing_summary)
-
-# Remove rows with missing values
-data_clean <- data %>%
-  select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  filter(complete.cases(.))
-
-print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x ")))
-
-# Recode demo_sex as numeric for regression (0 = Female, 1 = Male)
-data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0)
-
-# Check demo_edu levels and recode if needed
-print(table(data_clean$demo_edu))
-
-# Recode education as ordinal (assuming higher values = more education)
-edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate",
-                "University - Undergraduate", "University - Graduate")
-data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels)
-
-# Center the age variable for regression analysis
-data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1]
-
-# Verify recoding
-print(table(data_clean$demo_sex_numeric))
-print(table(data_clean$demo_edu_numeric))
-print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2)))
-print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2)))
-
-# =============================================================================
-# REGRESSION MODELS
-# =============================================================================
-
-# Define the 6 regression models
-models <- list()
-
-# Model 1: demo_sex → eohiDGEN_mean
-models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 2: demo_age_1 → eohiDGEN_mean (centered)
-models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean)
-
-# Model 3: demo_edu → eohiDGEN_mean
-models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean)
-
-# Model 4: demo_sex → ehi_global_mean
-models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 5: demo_age_1 → ehi_global_mean (centered)
-models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean)
-
-# Model 6: demo_edu → ehi_global_mean
-models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean)
-
-# =============================================================================
-# ASSUMPTION CHECKING FUNCTIONS
-# =============================================================================
-
-# Function to check linearity assumption
-check_linearity <- function(model, model_name) {
-  # Residuals vs Fitted plot
-  plot(model, which = 1, main = paste("Linearity:", model_name))
-
-  # Component + residual plot (partial residual plot)
-  crPlots(model, main = paste("Component+Residual Plot:", model_name))
-
-  return(NULL)
-}
-
-# Function to check normality of residuals
-check_normality <- function(model, model_name) {
-  # Q-Q plot
-  plot(model, which = 2, main = paste("Q-Q Plot:", model_name))
-
-  # Shapiro-Wilk test
-  residuals <- residuals(model)
-  shapiro_test <- shapiro.test(residuals)
-  print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5)))
-
-  # Kolmogorov-Smirnov test
-  ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals))
-  print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5)))
-
-  # Histogram of residuals
-  hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) +
-    geom_histogram(bins = 30, fill = "lightblue", color = "black") +
-    ggtitle(paste("Residuals Histogram:", model_name)) +
-    theme_minimal()
-  print(hist_plot)
-
-  return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value))
-}
-
-# Function to check homoscedasticity (constant variance)
-check_homoscedasticity <- function(model, model_name) {
-  # Scale-Location plot
-  plot(model, which = 3, main = paste("Scale-Location Plot:", model_name))
-
-  # Breusch-Pagan test
-  bp_test <- bptest(model)
-  print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5)))
-
-  # White test (if available)
-  tryCatch({
-    white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2))
-    print(paste("White test p-value:", format(white_test$p.value, digits = 5)))
-  }, error = function(e) {
-    print("White test not available for this model")
-  })
-
-  return(list(bp_p = bp_test$p.value))
-}
-
-# Function to check independence (no autocorrelation)
-check_independence <- function(model, model_name) {
-  # Durbin-Watson test
-  dw_test <- durbinWatsonTest(model)
-  print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5)))
-  print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5)))
-
-  # Residuals vs Order plot
-  residuals_vs_order <- ggplot(data.frame(
-    residuals = residuals(model),
-    order = seq_along(residuals(model))
-  ), aes(x = order, y = residuals)) +
-    geom_point(color = "black") +
-    geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
-    ggtitle(paste("Residuals vs Order:", model_name)) +
-    theme_minimal()
-  print(residuals_vs_order)
-
-  return(list(dw_stat = dw_test$dw, dw_p = dw_test$p))
-}
-
-# Function to check for influential observations
-check_influence <- function(model, model_name) {
-  # Cook's Distance plot
-  plot(model, which = 4, main = paste("Cook's Distance:", model_name))
-
-  # Calculate influence measures
-  cooks_d <- cooks.distance(model)
-  leverage <- hatvalues(model)
-  dffits_val <- dffits(model)
-
-  # Identify influential observations
-  cooks_threshold <- 4/length(cooks_d) # Cook's D threshold
-  leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold
-  dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold
-
-  influential_cooks <- which(cooks_d > cooks_threshold)
-  influential_leverage <- which(leverage > leverage_threshold)
-  influential_dffits <- which(abs(dffits_val) > dffits_threshold)
-
-  print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5)))
-  print(paste("Influential observations (Cook's D):", length(influential_cooks)))
-  print(paste("Leverage threshold:", format(leverage_threshold, digits = 5)))
-  print(paste("High leverage observations:", length(influential_leverage)))
-  print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5)))
-  print(paste("Influential observations (DFFITS):", length(influential_dffits)))
-
-  if (length(influential_cooks) > 0) {
-    print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", ")))
-  }
-  if (length(influential_leverage) > 0) {
-    print(paste("High leverage cases:", paste(influential_leverage, collapse = ", ")))
-  }
-  if (length(influential_dffits) > 0) {
-    print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", ")))
-  }
-
-  return(list(influential_cooks = influential_cooks,
-              influential_leverage = influential_leverage,
-              influential_dffits = influential_dffits))
-}
-
-# Function to get comprehensive model summary
-get_model_summary <- function(model, model_name) {
-  # Basic model summary
-  summary_model <- summary(model)
-  print(summary_model)
-
-  # R-squared and adjusted R-squared
-  print(paste("R-squared:", format(summary_model$r.squared, digits = 5)))
-  print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5)))
-
-  # AIC and BIC
-  aic_val <- AIC(model)
-  bic_val <- BIC(model)
-  print(paste("AIC:", format(aic_val, digits = 5)))
-  print(paste("BIC:", format(bic_val, digits = 5)))
-
-  return(list(summary = summary_model, r_squared = summary_model$r.squared,
-              adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val))
-}
-
-# =============================================================================
-# RUN ASSUMPTION CHECKS - MODEL BY MODEL
-# =============================================================================
-
-
-# Model 1: Sex → EOHI-DGEN Mean
-
-model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN")
-
-# Model 2: Age (centered) → EOHI-DGEN Mean
-
-model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
-model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
-model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
-model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
-model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
-model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN")
-
-# Model 3: Education → EOHI-DGEN Mean
-
-model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN")
-model3_homosced <-
check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean -print("\n=============================================") -print("MODEL 4: Sex → EHI-Global Mean") -print("=============================================") - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age (centered) → EHI-Global Mean -print("\n=============================================") -print("MODEL 5: Age (centered) → EHI-Global Mean") -print("=============================================") - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") - -# Model 6: Education → EHI-Global Mean -print("\n=============================================") -print("MODEL 6: Education → EHI-Global Mean") -print("=============================================") - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - -print("\n=============================================") -print("ASSUMPTION VIOLATION SUMMARY") -print("=============================================") - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p 
< 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - -print("\n=============================================") -print("MODEL COMPARISON SUMMARY") -print("=============================================") - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - -print("\n=============================================") -print("ANALYSIS COMPLETE") -print("=============================================") diff --git 
ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO")
-  ),
-  stringsAsFactors = FALSE
-)
-
-print(comparison_table)
-
-print("\n=============================================")
-print("ANALYSIS COMPLETE")
-print("=============================================")
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016174545.r b/.history/eohi1/regressions e1 - assumptions_20251016174545.r
deleted file mode 100644
index 3d8ebf2..0000000
--- a/.history/eohi1/regressions e1 - assumptions_20251016174545.r
+++ /dev/null
@@ -1,363 +0,0 @@
-# Regression Analysis - Assumption Checking
-# IVs: demo_sex, demo_age, demo_edu
-# DVs: eohiDGEN_mean, ehi_global_mean
-
-options(scipen = 999)
-
-library(car)
-library(performance)
-#library(see)
-library(ggplot2)
-library(gridExtra)
-library(dplyr)
-library(lmtest) # For bptest and durbinWatsonTest
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-data <- read.csv("ehi1.csv")
-
-# Check for missing values
-missing_summary <- data %>%
-  select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  summarise_all(~sum(is.na(.)))
-print(missing_summary)
-
-# Remove rows with missing values
-data_clean <- data %>%
-  select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  filter(complete.cases(.))
-
-print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x ")))
-
-# Recode demo_sex as numeric for regression (0 = Female, 1 = Male)
-data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0)
-
-# Check demo_edu levels and recode if needed
-print(table(data_clean$demo_edu))
-
-# Recode education as ordinal (assuming higher values = more education)
-edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate",
-                "University - Undergraduate", "University - Graduate")
-data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels)
-
-# Center the age variable for regression analysis
-data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1]
-
-# Verify recoding
-print(table(data_clean$demo_sex_numeric))
-print(table(data_clean$demo_edu_numeric))
-print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2)))
-print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2)))
-
-# =============================================================================
-# REGRESSION MODELS
-# =============================================================================
-
-# Define the 6 regression models
-models <- list()
-
-# Model 1: demo_sex → eohiDGEN_mean
-models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 2: demo_age_1 → eohiDGEN_mean (centered)
-models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean)
-
-# Model 3: demo_edu → eohiDGEN_mean
-models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean)
-
-# Model 4: demo_sex → ehi_global_mean
-models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean)
-
-# Model 5: demo_age_1 → ehi_global_mean (centered)
-models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean)
-
-# Model 6: demo_edu → ehi_global_mean
-models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean)
-
-# =============================================================================
-# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - 
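  # --- Editor's note (sketch, not in the original): 4/n for Cook's D, 2p/n for
  # leverage, and 2*sqrt(p/n) for DFFITS are conventional rules of thumb, not
  # formal tests. Base R's influence.measures() applies its own criteria and is
  # a quick cross-check on the hand-rolled thresholds above:
  #   infl <- influence.measures(model)
  #   summary(infl)  # prints only the observations flagged as influential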
print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - - -# Model 1: Sex → EOHI-DGEN Mean - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age (centered) → EOHI-DGEN Mean - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- 
check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age (centered) → EHI-Global Mean - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") - -# Model 6: Education → EHI-Global Mean - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - 
ifelse(length(model2_influence$influential_cooks) > 0,
-           paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model3_influence$influential_cooks) > 0,
-           paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model4_influence$influential_cooks) > 0,
-           paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model5_influence$influential_cooks) > 0,
-           paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model6_influence$influential_cooks) > 0,
-           paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO")
-  ),
-  stringsAsFactors = FALSE
-)
-
-print(violation_summary)
-
-# =============================================================================
-# MODEL COMPARISON TABLE
-# =============================================================================
-
-print("\n=============================================")
-print("MODEL COMPARISON SUMMARY")
-print("=============================================")
-
-# Create model comparison table
-comparison_table <- data.frame(
-  Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN",
-            "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"),
-  R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared,
-                model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared),
-  Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared,
-                    model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared),
-  AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic,
-          model4_summary$aic, model5_summary$aic, model6_summary$aic),
-  BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic,
-          model4_summary$bic, model5_summary$bic, model6_summary$bic),
-  Significant = c(
-    ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO")
-  ),
-  stringsAsFactors = FALSE
-)
-
-print(comparison_table)
-
-print("\n=============================================")
-print("ANALYSIS COMPLETE")
-print("=============================================")
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016174548.r b/.history/eohi1/regressions e1 - assumptions_20251016174548.r
deleted file mode 100644
index 1ade369..0000000
--- a/.history/eohi1/regressions e1 - assumptions_20251016174548.r
+++ /dev/null
@@ -1,360 +0,0 @@
-# Regression Analysis - Assumption Checking
-# IVs: demo_sex, demo_age, demo_edu
-# DVs: eohiDGEN_mean, ehi_global_mean
-
-options(scipen = 999)
-
-library(car)
-library(performance)
-#library(see)
-library(ggplot2)
-library(gridExtra)
-library(dplyr)
-library(lmtest) # For bptest and durbinWatsonTest
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-data <- read.csv("ehi1.csv")
-
-# Check for missing values
-missing_summary <- data %>%
-  select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  summarise_all(~sum(is.na(.)))
-print(missing_summary)
-
-# Remove rows with missing values
-data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Center the age variable for regression analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print(table(data_clean$demo_sex_numeric)) -print(table(data_clean$demo_edu_numeric)) -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = 
ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - 
print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - - -# Model 1: Sex → EOHI-DGEN Mean - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age (centered) → EOHI-DGEN Mean - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age (centered) → EHI-Global Mean - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") - -# Model 6: Education → EHI-Global Mean - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") 
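# --- Editor's note (not in the original): AIC/BIC in the comparison table
# below are only comparable between models fitted to the same response on the
# same rows, i.e. within the three EOHI-DGEN models or within the three
# EHI-Global models, never across the two DVs. A quick within-DV comparison:
#   AIC(models$sex_eohiDGEN, models$age_eohiDGEN, models$edu_eohiDGEN)
#   AIC(models$sex_ehi_global, models$age_ehi_global, models$edu_ehi_global)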
-model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, 
model6_summary$r_squared),
-  Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared,
-                    model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared),
-  AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic,
-          model4_summary$aic, model5_summary$aic, model6_summary$aic),
-  BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic,
-          model4_summary$bic, model5_summary$bic, model6_summary$bic),
-  Significant = c(
-    ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO")
-  ),
-  stringsAsFactors = FALSE
-)
-
-print(comparison_table)
-
-print("\n=============================================")
-print("ANALYSIS COMPLETE")
-print("=============================================")
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016174550.r b/.history/eohi1/regressions e1 - assumptions_20251016174550.r
deleted file mode 100644
index d61d94c..0000000
--- a/.history/eohi1/regressions e1 - assumptions_20251016174550.r
+++ /dev/null
@@ -1,357 +0,0 @@
-# Regression Analysis - Assumption Checking
-# IVs: demo_sex, demo_age, demo_edu
-# DVs: eohiDGEN_mean, ehi_global_mean
-
-options(scipen = 999)
-
-library(car)
-library(performance)
-#library(see)
-library(ggplot2)
-library(gridExtra)
-library(dplyr)
-library(lmtest) # For bptest and durbinWatsonTest
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-data <- read.csv("ehi1.csv")
-
-# Check for missing values
-missing_summary <- data %>%
-  select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  summarise_all(~sum(is.na(.)))
-print(missing_summary)
-
-# Remove rows with missing values
-data_clean <- data %>%
-  select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  filter(complete.cases(.))
-
-print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x ")))
-
-# Recode demo_sex as numeric for regression (0 = Female, 1 = Male)
-data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0)
-
-# Check demo_edu levels and recode if needed
-print(table(data_clean$demo_edu))
-
-# Recode education as ordinal (assuming higher values = more education)
-edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate",
-                "University - Undergraduate", "University - Graduate")
-data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels)
-
-# Center the age variable for regression analysis
-data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1]
-
-# Verify recoding
-print(table(data_clean$demo_sex_numeric))
-print(table(data_clean$demo_edu_numeric))
-print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2)))
-print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2)))
-
-# =============================================================================
-# REGRESSION MODELS
-# =============================================================================
-
-# Define the 6 regression models
-models <- list()
-
-# Model 1: demo_sex → eohiDGEN_mean
-models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data =
data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential 
observations -check_influence <- function(model, model_name) { - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - - -# Model 1: Sex → EOHI-DGEN Mean - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age (centered) → EOHI-DGEN Mean - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- 
check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age (centered) → EHI-Global Mean - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") - -# Model 6: Education → EHI-Global Mean - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - 
ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK")
-  ),
-  Independence = c(
-    ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"),
-    ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK")
-  ),
-  Influential_Obs = c(
-    ifelse(length(model1_influence$influential_cooks) > 0,
-           paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model2_influence$influential_cooks) > 0,
-           paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model3_influence$influential_cooks) > 0,
-           paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model4_influence$influential_cooks) > 0,
-           paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model5_influence$influential_cooks) > 0,
-           paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model6_influence$influential_cooks) > 0,
-           paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO")
-  ),
-  stringsAsFactors = FALSE
-)
-
-print(violation_summary)
-
-# =============================================================================
-# MODEL COMPARISON TABLE
-# =============================================================================
-
-
-# Create model comparison table
-comparison_table <- data.frame(
-  Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN",
-            "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"),
-  R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared,
-                model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared),
-  Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared,
-                    model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared),
-  AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic,
-          model4_summary$aic, model5_summary$aic, model6_summary$aic),
-  BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic,
-          model4_summary$bic, model5_summary$bic, model6_summary$bic),
-  Significant = c(
-    ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO")
-  ),
-  stringsAsFactors = FALSE
-)
-
-print(comparison_table)
-
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016174557.r b/.history/eohi1/regressions e1 - assumptions_20251016174557.r
deleted file mode 100644
index d61d94c..0000000
--- a/.history/eohi1/regressions e1 - assumptions_20251016174557.r
+++ /dev/null
@@ -1,357 +0,0 @@
-# Regression Analysis - Assumption Checking
-# IVs: demo_sex, demo_age, demo_edu
-# DVs: eohiDGEN_mean, ehi_global_mean
-
-options(scipen = 999)
-
-library(car)
-library(performance)
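# --- Editor's note (not in the original): of the two tests used below,
# bptest() does come from lmtest, but durbinWatsonTest() comes from car
# (attached above), so the comment on the library(lmtest) line below
# overstates it; lmtest's own Durbin-Watson implementation is dwtest().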
-#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Center the age variable for regression analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print(table(data_clean$demo_sex_numeric)) -print(table(data_clean$demo_edu_numeric)) -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", 
mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - 
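  # --- Editor's sketch (not in the original): cases flagged by more than one
  # measure are the strongest candidates for a sensitivity check; refitting
  # without them shows how much the coefficients depend on those rows:
  #   flagged <- Reduce(intersect, list(influential_cooks,
  #                                     influential_leverage, influential_dffits))
  #   if (length(flagged) > 0) {
  #     refit <- update(model, data = model.frame(model)[-flagged, ])
  #     print(rbind(full = coef(model), refit = coef(refit)))
  #   }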
return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - - -# Model 1: Sex → EOHI-DGEN Mean - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age (centered) → EOHI-DGEN Mean - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age (centered) → EHI-Global Mean - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age 
(centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") - -# Model 6: Education → EHI-Global Mean - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# 
============================================================================= - - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - diff --git a/.history/eohi1/regressions e1 - assumptions_20251016174601.r b/.history/eohi1/regressions e1 - assumptions_20251016174601.r deleted file mode 100644 index d61d94c..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016174601.r +++ /dev/null @@ -1,357 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print(table(data_clean$demo_edu)) - -# Recode education as ordinal (assuming higher values = more education) -edu_levels <- c("High School (or equivalent)", "College Diploma/Certificate", - "University - Undergraduate", "University - Graduate") -data_clean$demo_edu_numeric <- match(data_clean$demo_edu, edu_levels) - -# Center the age variable for regression analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print(table(data_clean$demo_sex_numeric)) -print(table(data_clean$demo_edu_numeric)) -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# 
============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - 
), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - - -# Model 1: Sex → EOHI-DGEN Mean - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age 
(centered) → EOHI-DGEN Mean - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age (centered) → EHI-Global Mean - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") - -# Model 6: Education → EHI-Global Mean - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - 
ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - diff --git a/.history/eohi1/regressions e1 - assumptions_20251016175238.r b/.history/eohi1/regressions e1 - assumptions_20251016175238.r deleted 
file mode 100644 index 090886f..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016175238.r +++ /dev/null @@ -1,367 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Original education levels:") -print(table(data_clean$demo_edu)) -print("Education levels:") -print(levels(factor(data_clean$demo_edu))) - -# Create dummy variables for education (k-1 dummy variables) -# First, convert to factor to ensure proper level ordering -data_clean$demo_edu_factor <- factor(data_clean$demo_edu) - -# Create dummy variables (k-1), using the first level as reference -edu_dummies <- model.matrix(~ demo_edu_factor, data = data_clean) -edu_dummies <- edu_dummies[, -1] # Remove intercept column - -# Add dummy variables to dataset -colnames(edu_dummies) <- paste0("edu_", 1:ncol(edu_dummies)) -data_clean <- cbind(data_clean, edu_dummies) - -# Center the age variable for regression analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print(table(data_clean$demo_sex_numeric)) -print(table(data_clean$demo_edu_numeric)) -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - # 
Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", 
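# EDITOR NOTE (annotation, not a line of the deleted snapshot): the three
# cutoffs computed above are conventional rules of thumb, not formal tests.
# For these simple regressions (p = 2 coefficients) with, say, n = 200
# complete cases, they work out to:
#   cooks_threshold    = 4/200           = 0.02
#   leverage_threshold = 2 * (2/200)     = 0.02
#   dffits_threshold   = 2 * sqrt(2/200) = 0.2
# Cutoffs this low will flag a handful of cases even in well-behaved data,
# so the counts printed below are best read alongside the Cook's distance
# plot (which = 4) rather than on their own.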
format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - - -# Model 1: Sex → EOHI-DGEN Mean - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age (centered) → EOHI-DGEN Mean - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- 
check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age (centered) → EHI-Global Mean - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") - -# Model 6: Education → EHI-Global Mean - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", 
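# EDITOR NOTE (annotation, not a line of the deleted snapshot): these six
# branches repeat the same paste(..., sep = "") pattern; sprintf() expresses
# it more compactly, e.g. (n_flagged is a hypothetical stand-in for
# length(modelX_influence$influential_cooks)):
#   ifelse(n_flagged > 0, sprintf("YES (%d)", n_flagged), "NO")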
length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - diff --git a/.history/eohi1/regressions e1 - assumptions_20251016175247.r b/.history/eohi1/regressions e1 - assumptions_20251016175247.r deleted file mode 100644 index 6e3b04d..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016175247.r +++ /dev/null @@ -1,373 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Original education levels:") -print(table(data_clean$demo_edu)) -print("Education levels:") 
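# EDITOR NOTE (annotation, not a line of the deleted snapshot): factor()
# sorts its levels alphabetically, so the reference category of the dummy
# coding created below is whichever label sorts first ("College
# Diploma/Certificate" among the levels listed in the earlier snapshots),
# not the lowest education level. A sketch of an explicit ordering, reusing
# the edu_levels vector defined in those earlier snapshots:
#   data_clean$demo_edu_factor <- factor(data_clean$demo_edu, levels = edu_levels)
# Note also that the model definitions further down in this snapshot still
# reference demo_edu_numeric, which this version no longer creates.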
-print(levels(factor(data_clean$demo_edu))) - -# Create dummy variables for education (k-1 dummy variables) -# First, convert to factor to ensure proper level ordering -data_clean$demo_edu_factor <- factor(data_clean$demo_edu) - -# Create dummy variables (k-1), using the first level as reference -edu_dummies <- model.matrix(~ demo_edu_factor, data = data_clean) -edu_dummies <- edu_dummies[, -1] # Remove intercept column - -# Add dummy variables to dataset -colnames(edu_dummies) <- paste0("edu_", 1:ncol(edu_dummies)) -data_clean <- cbind(data_clean, edu_dummies) - -# Center the age variable for regression analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education dummy variables created:") -print(paste("Number of dummy variables:", ncol(edu_dummies))) -print("Dummy variable names:") -print(colnames(edu_dummies)) -print("Dummy variable summary:") -print(summary(edu_dummies)) -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ demo_edu_numeric, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - 
print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - 
print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - - -# Model 1: Sex → EOHI-DGEN Mean - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age (centered) → EOHI-DGEN Mean - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age (centered) → EHI-Global Mean - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") - -# Model 6: Education → EHI-Global Mean - -model6_summary <- 
get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - 
model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - diff --git a/.history/eohi1/regressions e1 - assumptions_20251016175258.r b/.history/eohi1/regressions e1 - assumptions_20251016175258.r deleted file mode 100644 index a5bb837..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016175258.r +++ /dev/null @@ -1,373 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Original education levels:") -print(table(data_clean$demo_edu)) -print("Education levels:") -print(levels(factor(data_clean$demo_edu))) - -# Create dummy variables for education (k-1 dummy variables) -# First, convert to factor to ensure proper level ordering -data_clean$demo_edu_factor <- factor(data_clean$demo_edu) - -# Create dummy variables (k-1), using the first level as reference -edu_dummies <- model.matrix(~ demo_edu_factor, data = data_clean) -edu_dummies <- edu_dummies[, -1] # Remove intercept column - -# Add dummy variables to dataset -colnames(edu_dummies) <- paste0("edu_", 1:ncol(edu_dummies)) -data_clean <- cbind(data_clean, edu_dummies) - -# Center the age variable for regression analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education dummy variables created:") -print(paste("Number of dummy variables:", ncol(edu_dummies))) -print("Dummy variable names:") -print(colnames(edu_dummies)) -print("Dummy variable summary:") -print(summary(edu_dummies)) 
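# EDITOR NOTE (annotation, not a line of the deleted snapshot): in this
# final snapshot Model 3 below is refit on the dummy columns, but Model 6
# still references demo_edu_numeric, which this version never creates, so
# fitting it would stop with "object 'demo_edu_numeric' not found". The
# matching dummy-based form would be:
#   models$edu_ehi_global <- lm(ehi_global_mean ~ .,
#     data = data_clean[, c("ehi_global_mean", colnames(edu_dummies))])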
-print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean (using dummy variables) -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ ., data = data_clean[, c("eohiDGEN_mean", colnames(edu_dummies))]) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean -models$edu_ehi_global <- lm(ehi_global_mean ~ demo_edu_numeric, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 
5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - - -# Model 1: Sex → EOHI-DGEN Mean - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- 
check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age (centered) → EOHI-DGEN Mean - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age (centered) → EHI-Global Mean - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") - -# Model 6: Education → EHI-Global Mean - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 
0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", 
"NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - diff --git a/.history/eohi1/regressions e1 - assumptions_20251016175309.r b/.history/eohi1/regressions e1 - assumptions_20251016175309.r deleted file mode 100644 index 051b1ca..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016175309.r +++ /dev/null @@ -1,373 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Original education levels:") -print(table(data_clean$demo_edu)) -print("Education levels:") -print(levels(factor(data_clean$demo_edu))) - -# Create dummy variables for education (k-1 dummy variables) -# First, convert to factor to ensure proper level ordering -data_clean$demo_edu_factor <- factor(data_clean$demo_edu) - -# Create dummy variables (k-1), using the first level as reference -edu_dummies <- model.matrix(~ demo_edu_factor, data = data_clean) -edu_dummies <- edu_dummies[, -1] # Remove intercept column - -# Add dummy variables to dataset -colnames(edu_dummies) <- paste0("edu_", 1:ncol(edu_dummies)) -data_clean <- cbind(data_clean, edu_dummies) - -# Center the age variable for regression analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education dummy variables created:") -print(paste("Number of dummy variables:", ncol(edu_dummies))) -print("Dummy variable names:") -print(colnames(edu_dummies)) -print("Dummy variable summary:") -print(summary(edu_dummies)) -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean (using dummy variables) -models$edu_eohiDGEN <- lm(eohiDGEN_mean ~ ., data = data_clean[, c("eohiDGEN_mean", colnames(edu_dummies))]) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) 
-models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean (using dummy variables) -models$edu_ehi_global <- lm(ehi_global_mean ~ ., data = data_clean[, c("ehi_global_mean", colnames(edu_dummies))]) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - 
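# (Aside, not from the snapshots: these are the usual size-adjusted cutoffs.
# For a hypothetical n = 200 observations and a one-predictor model, so that
# p = length(coef(model)) = 2, they evaluate to
#   4/n         = 4/200         = 0.02  for Cook's distance,
#   2*p/n       = 2*2/200       = 0.02  for leverage (hat values),
#   2*sqrt(p/n) = 2*sqrt(2/200) = 0.20  for |DFFITS|.)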
leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - - -# Model 1: Sex → EOHI-DGEN Mean - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age (centered) → EOHI-DGEN Mean - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- 
check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age (centered) → EHI-Global Mean - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") - -# Model 6: Education → EHI-Global Mean - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p 
< 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - diff --git a/.history/eohi1/regressions e1 - assumptions_20251016175319.r b/.history/eohi1/regressions e1 - assumptions_20251016175319.r deleted file mode 100644 index 6815bf0..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016175319.r +++ /dev/null @@ -1,374 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print(missing_summary) - -# Remove rows 
with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Original education levels:") -print(table(data_clean$demo_edu)) -print("Education levels:") -print(levels(factor(data_clean$demo_edu))) - -# Create dummy variables for education (k-1 dummy variables) -# First, convert to factor to ensure proper level ordering -data_clean$demo_edu_factor <- factor(data_clean$demo_edu) - -# Create dummy variables (k-1), using the first level as reference -edu_dummies <- model.matrix(~ demo_edu_factor, data = data_clean) -edu_dummies <- edu_dummies[, -1] # Remove intercept column - -# Add dummy variables to dataset -colnames(edu_dummies) <- paste0("edu_", 1:ncol(edu_dummies)) -data_clean <- cbind(data_clean, edu_dummies) - -# Center the age variable for regression analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education dummy variables created:") -print(paste("Number of dummy variables:", ncol(edu_dummies))) -print("Dummy variable names:") -print(colnames(edu_dummies)) -print("Dummy variable summary:") -print(summary(edu_dummies)) -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean (using dummy variables) -edu_formula <- as.formula(paste("eohiDGEN_mean ~", paste(colnames(edu_dummies), collapse = " + "))) -models$edu_eohiDGEN <- lm(edu_formula, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean (using dummy variables) -models$edu_ehi_global <- lm(ehi_global_mean ~ ., data = data_clean[, c("ehi_global_mean", colnames(edu_dummies))]) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - # Q-Q plot 
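# (Aside, an assumed-equivalent alternative rather than code from any snapshot:
# the as.formula(paste(...)) idiom used for the education model above can also
# be written with base R's reformulate(), e.g.
#   edu_formula <- reformulate(colnames(edu_dummies), response = "eohiDGEN_mean")
# which builds the same eohiDGEN_mean ~ edu_1 + edu_2 + ... formula object.)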
- plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", 
paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - - -# Model 1: Sex → EOHI-DGEN Mean - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age (centered) → EOHI-DGEN Mean - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → 
EHI-Global") - -# Model 5: Age (centered) → EHI-Global Mean - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") - -# Model 6: Education → EHI-Global Mean - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - 
ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - diff --git a/.history/eohi1/regressions e1 - assumptions_20251016175325.r b/.history/eohi1/regressions e1 - assumptions_20251016175325.r deleted file mode 100644 index 47b31ac..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016175325.r +++ /dev/null @@ -1,375 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Original education levels:") -print(table(data_clean$demo_edu)) -print("Education levels:") -print(levels(factor(data_clean$demo_edu))) - -# Create dummy variables for education (k-1 dummy variables) -# First, convert to factor to ensure proper level ordering -data_clean$demo_edu_factor <- factor(data_clean$demo_edu) - -# Create dummy variables (k-1), using the first level as reference -edu_dummies <- model.matrix(~ demo_edu_factor, data = 
data_clean) -edu_dummies <- edu_dummies[, -1] # Remove intercept column - -# Add dummy variables to dataset -colnames(edu_dummies) <- paste0("edu_", 1:ncol(edu_dummies)) -data_clean <- cbind(data_clean, edu_dummies) - -# Center the age variable for regression analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education dummy variables created:") -print(paste("Number of dummy variables:", ncol(edu_dummies))) -print("Dummy variable names:") -print(colnames(edu_dummies)) -print("Dummy variable summary:") -print(summary(edu_dummies)) -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean (using dummy variables) -edu_formula <- as.formula(paste("eohiDGEN_mean ~", paste(colnames(edu_dummies), collapse = " + "))) -models$edu_eohiDGEN <- lm(edu_formula, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean (using dummy variables) -edu_formula_global <- as.formula(paste("ehi_global_mean ~", paste(colnames(edu_dummies), collapse = " + "))) -models$edu_ehi_global <- lm(edu_formula_global, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check 
homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", 
format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - - -# Model 1: Sex → EOHI-DGEN Mean - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age (centered) → EOHI-DGEN Mean - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age (centered) → EHI-Global Mean - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") - -# Model 6: Education → EHI-Global Mean - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- 
check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - 
Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - diff --git a/.history/eohi1/regressions e1 - assumptions_20251016175340.r b/.history/eohi1/regressions e1 - assumptions_20251016175340.r deleted file mode 100644 index 47b31ac..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016175340.r +++ /dev/null @@ -1,375 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest; durbinWatsonTest() comes from car - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Original education levels:") -print(table(data_clean$demo_edu)) -print("Education levels:") -print(levels(factor(data_clean$demo_edu))) - -# Create dummy variables for education (k-1 dummy variables) -# First, convert to factor to ensure proper level ordering -data_clean$demo_edu_factor <- factor(data_clean$demo_edu) - -# Create dummy variables (k-1), using the first level as reference -edu_dummies <- model.matrix(~ demo_edu_factor, data = data_clean) -edu_dummies <- edu_dummies[, -1] # Remove intercept column - -# Add dummy variables to dataset -colnames(edu_dummies) <- paste0("edu_", 1:ncol(edu_dummies)) -data_clean <- cbind(data_clean, edu_dummies) - -# Center the age variable for regression analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education dummy variables created:") -print(paste("Number of dummy variables:", ncol(edu_dummies))) -print("Dummy variable names:") -print(colnames(edu_dummies)) -print("Dummy variable summary:") -print(summary(edu_dummies)) -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered 
age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean (using dummy variables) -edu_formula <- as.formula(paste("eohiDGEN_mean ~", paste(colnames(edu_dummies), collapse = " + "))) -models$edu_eohiDGEN <- lm(edu_formula, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean (using dummy variables) -edu_formula_global <- as.formula(paste("ehi_global_mean ~", paste(colnames(edu_dummies), collapse = " + "))) -models$edu_ehi_global <- lm(edu_formula_global, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - 
print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - - -# Model 1: Sex → EOHI-DGEN Mean - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- 
check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age (centered) → EOHI-DGEN Mean - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age (centered) → EHI-Global Mean - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") - -# Model 6: Education → EHI-Global Mean - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → 
EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - 
ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - 
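The six per-model blocks repeated in each snapshot above run the same six calls, with only the model object and its label changing. A minimal sketch of the same flow as a single loop, assuming the models list and the check functions defined in the script are in scope (the model_labels vector below is illustrative and not part of the original script):

model_labels <- c(
  sex_eohiDGEN   = "Sex → EOHI-DGEN",
  age_eohiDGEN   = "Age (centered) → EOHI-DGEN",
  edu_eohiDGEN   = "Education → EOHI-DGEN",
  sex_ehi_global = "Sex → EHI-Global",
  age_ehi_global = "Age (centered) → EHI-Global",
  edu_ehi_global = "Education → EHI-Global"
)

# Run every check once per model; keep the returned lists for later tabulation
check_results <- lapply(names(models), function(nm) {
  list(
    summary      = get_model_summary(models[[nm]], model_labels[[nm]]),
    normality    = check_normality(models[[nm]], model_labels[[nm]]),
    homosced     = check_homoscedasticity(models[[nm]], model_labels[[nm]]),
    independence = check_independence(models[[nm]], model_labels[[nm]]),
    influence    = check_influence(models[[nm]], model_labels[[nm]]),
    linearity    = check_linearity(models[[nm]], model_labels[[nm]])
  )
})
names(check_results) <- names(models)

# Each summary-table column then reduces to one sapply, e.g. for normality:
sapply(check_results, function(r) ifelse(r$normality$shapiro_p < 0.05, "VIOLATED", "OK"))

Built this way, the violation and comparison tables cannot drift out of sync with the individual model objects when a model is added or relabeled.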
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016175653.r b/.history/eohi1/regressions e1 - assumptions_20251016175653.r deleted file mode 100644 index 4472b84..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016175653.r +++ /dev/null @@ -1,383 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest; durbinWatsonTest() comes from car - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data
<- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Original education levels:") -print(table(data_clean$demo_edu)) -print("Education levels:") -print(levels(factor(data_clean$demo_edu))) - -# Create dummy variables for education (k-1 dummy variables) -# First, convert to factor to ensure proper level ordering -data_clean$demo_edu_factor <- factor(data_clean$demo_edu) - -# Create dummy variables (k-1), using the first level as reference -edu_dummies <- model.matrix(~ demo_edu_factor, data = data_clean) -edu_dummies <- edu_dummies[, -1] # Remove intercept column - -# Add dummy variables to dataset -colnames(edu_dummies) <- paste0("edu_", 1:ncol(edu_dummies)) -data_clean <- cbind(data_clean, edu_dummies) - -# Show which education level each dummy variable represents -print("Education level mapping:") -edu_levels_actual <- levels(data_clean$demo_edu_factor) -print(paste("Reference category (edu_0):", edu_levels_actual[1])) -for(i in 1:ncol(edu_dummies)) { - print(paste("edu_", i, " represents:", edu_levels_actual[i+1])) -} - -# Center the age variable for regression analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education dummy variables created:") -print(paste("Number of dummy variables:", ncol(edu_dummies))) -print("Dummy variable names:") -print(colnames(edu_dummies)) -print("Dummy variable summary:") -print(summary(edu_dummies)) -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean (using dummy variables) -edu_formula <- as.formula(paste("eohiDGEN_mean ~", paste(colnames(edu_dummies), collapse = " + "))) -models$edu_eohiDGEN <- lm(edu_formula, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean (using dummy variables) -edu_formula_global <- as.formula(paste("ehi_global_mean ~", paste(colnames(edu_dummies), collapse = " + "))) -models$edu_ehi_global <- lm(edu_formula_global, data = data_clean) - -# 
============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > 
leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - - -# Model 1: Sex → EOHI-DGEN Mean - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age (centered) → EOHI-DGEN Mean - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- 
check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age (centered) → EHI-Global Mean - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") - -# Model 6: Education → EHI-Global Mean - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", 
length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - diff --git a/.history/eohi1/regressions e1 - assumptions_20251016175707.r b/.history/eohi1/regressions e1 - assumptions_20251016175707.r deleted file mode 100644 index 4472b84..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016175707.r +++ /dev/null @@ -1,383 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest; durbinWatsonTest() comes from car - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode 
demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Original education levels:") -print(table(data_clean$demo_edu)) -print("Education levels:") -print(levels(factor(data_clean$demo_edu))) - -# Create dummy variables for education (k-1 dummy variables) -# First, convert to factor to ensure proper level ordering -data_clean$demo_edu_factor <- factor(data_clean$demo_edu) - -# Create dummy variables (k-1), using the first level as reference -edu_dummies <- model.matrix(~ demo_edu_factor, data = data_clean) -edu_dummies <- edu_dummies[, -1] # Remove intercept column - -# Add dummy variables to dataset -colnames(edu_dummies) <- paste0("edu_", 1:ncol(edu_dummies)) -data_clean <- cbind(data_clean, edu_dummies) - -# Show which education level each dummy variable represents -print("Education level mapping:") -edu_levels_actual <- levels(data_clean$demo_edu_factor) -print(paste("Reference category (intercept):", edu_levels_actual[1])) -for(i in 1:ncol(edu_dummies)) { - print(paste0("edu_", i, " represents: ", edu_levels_actual[i+1])) -} - -# Center the age variable for regression analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education dummy variables created:") -print(paste("Number of dummy variables:", ncol(edu_dummies))) -print("Dummy variable names:") -print(colnames(edu_dummies)) -print("Dummy variable summary:") -print(summary(edu_dummies)) -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean (using dummy variables) -edu_formula <- as.formula(paste("eohiDGEN_mean ~", paste(colnames(edu_dummies), collapse = " + "))) -models$edu_eohiDGEN <- lm(edu_formula, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean (using dummy variables) -edu_formula_global <- as.formula(paste("ehi_global_mean ~", paste(colnames(edu_dummies), collapse = " + "))) -models$edu_ehi_global <- lm(edu_formula_global, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", 
model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - 
print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - - -# Model 1: Sex → EOHI-DGEN Mean - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age (centered) → EOHI-DGEN Mean - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → 
EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age (centered) → EHI-Global Mean - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") - -# Model 6: Education → EHI-Global Mean - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - 
ifelse(length(model5_influence$influential_cooks) > 0,
-           paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"),
-    ifelse(length(model6_influence$influential_cooks) > 0,
-           paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO")
-  ),
-  stringsAsFactors = FALSE
-)
-
-print(violation_summary)
-
-# =============================================================================
-# MODEL COMPARISON TABLE
-# =============================================================================
-
-
-# Create model comparison table
-comparison_table <- data.frame(
-  Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN",
-            "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"),
-  R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared,
-                model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared),
-  Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared,
-                    model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared),
-  AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic,
-          model4_summary$aic, model5_summary$aic, model6_summary$aic),
-  BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic,
-          model4_summary$bic, model5_summary$bic, model6_summary$bic),
-  Significant = c(
-    ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO")
-  ),
-  stringsAsFactors = FALSE
-)
-
-print(comparison_table)
-
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016175808.r b/.history/eohi1/regressions e1 - assumptions_20251016175808.r
deleted file mode 100644
index 4472b84..0000000
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016175912.r b/.history/eohi1/regressions e1 - assumptions_20251016175912.r
deleted file mode 100644
index 3c3aacc..0000000
--- a/.history/eohi1/regressions e1 - assumptions_20251016175912.r
+++ /dev/null
@@ -1,390 +0,0 @@
-# Regression Analysis - Assumption Checking
-# IVs: demo_sex, demo_age, demo_edu
-# DVs: eohiDGEN_mean, ehi_global_mean
-
-options(scipen = 999)
-
-library(car)
-library(performance)
-#library(see)
-library(ggplot2)
-library(gridExtra)
-library(dplyr)
-library(lmtest) # For bptest and durbinWatsonTest
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-data <- read.csv("ehi1.csv")
-
-# Check for missing values
-missing_summary <- data %>%
-  select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  summarise_all(~sum(is.na(.)))
-print(missing_summary)
-
-# Remove rows with missing values
-data_clean <- data %>%
-  select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  filter(complete.cases(.))
-
-print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x ")))
-
-# Recode demo_sex as numeric for regression (0 = Female, 1 = Male)
-data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0)
-
-# Check demo_edu levels and recode if needed
-print("Original education levels:")
-print(table(data_clean$demo_edu))
-print("Education levels:")
-print(levels(factor(data_clean$demo_edu)))
-
-# Create dummy variables for education (k-1 dummy variables)
-# First, convert to factor to ensure proper level ordering
-data_clean$demo_edu_factor <- factor(data_clean$demo_edu)
-
-# Create dummy variables (k-1), using the first level as reference
-edu_dummies <- model.matrix(~ demo_edu_factor, data = data_clean)
-edu_dummies <- edu_dummies[, -1] # Remove intercept column
-
-# Add dummy variables to dataset
-colnames(edu_dummies) <- paste0("edu_", 1:ncol(edu_dummies))
-data_clean <- cbind(data_clean, edu_dummies)
-
-# Show which education level each dummy variable represents
-print("Education level mapping:")
-edu_levels_actual <- levels(data_clean$demo_edu_factor) -print(paste("Reference category (edu_0):", edu_levels_actual[1])) -for(i in 1:ncol(edu_dummies)) { - print(paste("edu_", i, " represents:", edu_levels_actual[i+1])) -} - -# Center the age variable for regression analysis -data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education dummy variables created:") -print(paste("Number of dummy variables:", ncol(edu_dummies))) -print("Dummy variable names:") -print(colnames(edu_dummies)) -print("Dummy variable summary:") -print(summary(edu_dummies)) - -# Show distribution (counts) for each dummy variable -print("Dummy variable distributions (counts):") -for(i in 1:ncol(edu_dummies)) { - print(paste("edu_", i, " distribution:")) - print(table(edu_dummies[,i])) -} -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean (using dummy variables) -edu_formula <- as.formula(paste("eohiDGEN_mean ~", paste(colnames(edu_dummies), collapse = " + "))) -models$edu_eohiDGEN <- lm(edu_formula, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean (using dummy variables) -edu_formula_global <- as.formula(paste("ehi_global_mean ~", paste(colnames(edu_dummies), collapse = " + "))) -models$edu_ehi_global <- lm(edu_formula_global, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", 
color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - # Basic model summary - 
summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - - -# Model 1: Sex → EOHI-DGEN Mean - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age (centered) → EOHI-DGEN Mean - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age (centered) → EHI-Global Mean - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age 
(centered) → EHI-Global") - -# Model 6: Education → EHI-Global Mean - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - 
R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - model4_summary$bic, model5_summary$bic, model6_summary$bic), - Significant = c( - ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"), - ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO") - ), - stringsAsFactors = FALSE -) - -print(comparison_table) - diff --git a/.history/eohi1/regressions e1 - assumptions_20251016175924.r b/.history/eohi1/regressions e1 - assumptions_20251016175924.r deleted file mode 100644 index 3c3aacc..0000000 --- a/.history/eohi1/regressions e1 - assumptions_20251016175924.r +++ /dev/null @@ -1,390 +0,0 @@ -# Regression Analysis - Assumption Checking -# IVs: demo_sex, demo_age, demo_edu -# DVs: eohiDGEN_mean, ehi_global_mean - -options(scipen = 999) - -library(car) -library(performance) -#library(see) -library(ggplot2) -library(gridExtra) -library(dplyr) -library(lmtest) # For bptest and durbinWatsonTest - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") -data <- read.csv("ehi1.csv") - -# Check for missing values -missing_summary <- data %>% - select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - summarise_all(~sum(is.na(.))) -print(missing_summary) - -# Remove rows with missing values -data_clean <- data %>% - select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>% - filter(complete.cases(.)) - -print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x "))) - -# Recode demo_sex as numeric for regression (0 = Female, 1 = Male) -data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0) - -# Check demo_edu levels and recode if needed -print("Original education levels:") -print(table(data_clean$demo_edu)) -print("Education levels:") -print(levels(factor(data_clean$demo_edu))) - -# Create dummy variables for education (k-1 dummy variables) -# First, convert to factor to ensure proper level ordering -data_clean$demo_edu_factor <- factor(data_clean$demo_edu) - -# Create dummy variables (k-1), using the first level as reference -edu_dummies <- model.matrix(~ demo_edu_factor, data = data_clean) -edu_dummies <- edu_dummies[, -1] # Remove intercept column - -# Add dummy variables to dataset -colnames(edu_dummies) <- paste0("edu_", 1:ncol(edu_dummies)) -data_clean <- cbind(data_clean, edu_dummies) - -# Show which education level each dummy variable represents -print("Education level mapping:") -edu_levels_actual <- levels(data_clean$demo_edu_factor) -print(paste("Reference category (edu_0):", edu_levels_actual[1])) -for(i in 1:ncol(edu_dummies)) { - print(paste("edu_", i, " represents:", edu_levels_actual[i+1])) -} - -# Center the age variable for regression analysis -data_clean$demo_age_1_centered <- 
scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1] - -# Verify recoding -print("Sex recoding (0=Female, 1=Male):") -print(table(data_clean$demo_sex_numeric)) -print("Education dummy variables created:") -print(paste("Number of dummy variables:", ncol(edu_dummies))) -print("Dummy variable names:") -print(colnames(edu_dummies)) -print("Dummy variable summary:") -print(summary(edu_dummies)) - -# Show distribution (counts) for each dummy variable -print("Dummy variable distributions (counts):") -for(i in 1:ncol(edu_dummies)) { - print(paste("edu_", i, " distribution:")) - print(table(edu_dummies[,i])) -} -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean (using dummy variables) -edu_formula <- as.formula(paste("eohiDGEN_mean ~", paste(colnames(edu_dummies), collapse = " + "))) -models$edu_eohiDGEN <- lm(edu_formula, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean (using dummy variables) -edu_formula_global <- as.formula(paste("ehi_global_mean ~", paste(colnames(edu_dummies), collapse = " + "))) -models$edu_ehi_global <- lm(edu_formula_global, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - # 
Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- 
BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# ============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - - -# Model 1: Sex → EOHI-DGEN Mean - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age (centered) → EOHI-DGEN Mean - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age (centered) → EHI-Global Mean - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") - -# Model 6: Education → EHI-Global Mean - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, 
"Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - 
model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared),
-  AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic,
-          model4_summary$aic, model5_summary$aic, model6_summary$aic),
-  BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic,
-          model4_summary$bic, model5_summary$bic, model6_summary$bic),
-  Significant = c(
-    ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO")
-  ),
-  stringsAsFactors = FALSE
-)
-
-print(comparison_table)
-
diff --git a/.history/eohi1/regressions e1 - assumptions_20251016175945.r b/.history/eohi1/regressions e1 - assumptions_20251016175945.r
deleted file mode 100644
index 3c3aacc..0000000
--- a/.history/eohi1/regressions e1 - assumptions_20251016175945.r
+++ /dev/null
@@ -1,390 +0,0 @@
-# Regression Analysis - Assumption Checking
-# IVs: demo_sex, demo_age, demo_edu
-# DVs: eohiDGEN_mean, ehi_global_mean
-
-options(scipen = 999)
-
-library(car)        # For durbinWatsonTest and crPlots
-library(performance)
-#library(see)
-library(ggplot2)
-library(gridExtra)
-library(dplyr)
-library(lmtest)     # For bptest (durbinWatsonTest comes from car, loaded above)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi1")
-data <- read.csv("ehi1.csv")
-
-# Check for missing values
-missing_summary <- data %>%
-  select(demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  summarise_all(~sum(is.na(.)))
-print(missing_summary)
-
-# Remove rows with missing values
-data_clean <- data %>%
-  select(pID, demo_sex, demo_age_1, demo_edu, eohiDGEN_mean, ehi_global_mean) %>%
-  filter(complete.cases(.))
-
-print(paste("Clean data dimensions:", paste(dim(data_clean), collapse = " x ")))
-
-# Recode demo_sex as numeric for regression (0 = Female, 1 = Male)
-data_clean$demo_sex_numeric <- ifelse(data_clean$demo_sex == "Male", 1, 0)
-
-# Check demo_edu levels and recode if needed
-print("Original education levels:")
-print(table(data_clean$demo_edu))
-print("Education levels:")
-print(levels(factor(data_clean$demo_edu)))
-
-# Create dummy variables for education (k-1 dummy variables)
-# First, convert to factor to ensure proper level ordering
-data_clean$demo_edu_factor <- factor(data_clean$demo_edu)
-
-# Create dummy variables (k-1), using the first level as reference
-edu_dummies <- model.matrix(~ demo_edu_factor, data = data_clean)
-edu_dummies <- edu_dummies[, -1] # Remove intercept column
-
-# Add dummy variables to dataset
-colnames(edu_dummies) <- paste0("edu_", 1:ncol(edu_dummies))
-data_clean <- cbind(data_clean, edu_dummies)
-
-# Show which education level each dummy variable represents
-print("Education level mapping:")
-edu_levels_actual <- levels(data_clean$demo_edu_factor)
-print(paste("Reference category:", edu_levels_actual[1]))
-for(i in 1:ncol(edu_dummies)) {
-  print(paste0("edu_", i, " represents: ", edu_levels_actual[i + 1]))
-}
-
-# Center the age variable for regression analysis
-data_clean$demo_age_1_centered <- scale(data_clean$demo_age_1, center = TRUE, scale = FALSE)[,1]
-
-# Verify recoding
-print("Sex recoding (0=Female, 1=Male):")
-print(table(data_clean$demo_sex_numeric))
-print("Education dummy variables created:")
-print(paste("Number of dummy variables:", ncol(edu_dummies)))
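As an aside, the same k-1 treatment coding can be obtained without constructing the dummy columns by hand; a minimal sketch, reusing the data_clean and demo_edu_factor objects defined above (the model name m_edu_factor is illustrative, not taken from the script):

    # lm() treatment-codes a factor predictor automatically, with the first
    # level as the reference category, so this fit matches the manual dummies.
    m_edu_factor <- lm(eohiDGEN_mean ~ demo_edu_factor, data = data_clean)
    summary(m_edu_factor)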
-print("Dummy variable names:") -print(colnames(edu_dummies)) -print("Dummy variable summary:") -print(summary(edu_dummies)) - -# Show distribution (counts) for each dummy variable -print("Dummy variable distributions (counts):") -for(i in 1:ncol(edu_dummies)) { - print(paste("edu_", i, " distribution:")) - print(table(edu_dummies[,i])) -} -print(paste("Original age mean:", round(mean(data_clean$demo_age_1), 2))) -print(paste("Centered age mean:", round(mean(data_clean$demo_age_1_centered), 2))) - -# ============================================================================= -# REGRESSION MODELS -# ============================================================================= - -# Define the 6 regression models -models <- list() - -# Model 1: demo_sex → eohiDGEN_mean -models$sex_eohiDGEN <- lm(eohiDGEN_mean ~ demo_sex_numeric, data = data_clean) - -# Model 2: demo_age_1 → eohiDGEN_mean (centered) -models$age_eohiDGEN <- lm(eohiDGEN_mean ~ demo_age_1_centered, data = data_clean) - -# Model 3: demo_edu → eohiDGEN_mean (using dummy variables) -edu_formula <- as.formula(paste("eohiDGEN_mean ~", paste(colnames(edu_dummies), collapse = " + "))) -models$edu_eohiDGEN <- lm(edu_formula, data = data_clean) - -# Model 4: demo_sex → ehi_global_mean -models$sex_ehi_global <- lm(ehi_global_mean ~ demo_sex_numeric, data = data_clean) - -# Model 5: demo_age_1 → ehi_global_mean (centered) -models$age_ehi_global <- lm(ehi_global_mean ~ demo_age_1_centered, data = data_clean) - -# Model 6: demo_edu → ehi_global_mean (using dummy variables) -edu_formula_global <- as.formula(paste("ehi_global_mean ~", paste(colnames(edu_dummies), collapse = " + "))) -models$edu_ehi_global <- lm(edu_formula_global, data = data_clean) - -# ============================================================================= -# ASSUMPTION CHECKING FUNCTIONS -# ============================================================================= - -# Function to check linearity assumption -check_linearity <- function(model, model_name) { - # Residuals vs Fitted plot - plot(model, which = 1, main = paste("Linearity:", model_name)) - - # Component + residual plot (partial residual plot) - crPlots(model, main = paste("Component+Residual Plot:", model_name)) - - return(NULL) -} - -# Function to check normality of residuals -check_normality <- function(model, model_name) { - # Q-Q plot - plot(model, which = 2, main = paste("Q-Q Plot:", model_name)) - - # Shapiro-Wilk test - residuals <- residuals(model) - shapiro_test <- shapiro.test(residuals) - print(paste("Shapiro-Wilk test p-value:", format(shapiro_test$p.value, digits = 5))) - - # Kolmogorov-Smirnov test - ks_test <- ks.test(residuals, "pnorm", mean(residuals), sd(residuals)) - print(paste("Kolmogorov-Smirnov test p-value:", format(ks_test$p.value, digits = 5))) - - # Histogram of residuals - hist_plot <- ggplot(data.frame(residuals = residuals), aes(x = residuals)) + - geom_histogram(bins = 30, fill = "lightblue", color = "black") + - ggtitle(paste("Residuals Histogram:", model_name)) + - theme_minimal() - print(hist_plot) - - return(list(shapiro_p = shapiro_test$p.value, ks_p = ks_test$p.value)) -} - -# Function to check homoscedasticity (constant variance) -check_homoscedasticity <- function(model, model_name) { - # Scale-Location plot - plot(model, which = 3, main = paste("Scale-Location Plot:", model_name)) - - # Breusch-Pagan test - bp_test <- bptest(model) - print(paste("Breusch-Pagan test p-value:", format(bp_test$p.value, digits = 5))) - - # White test (if available) - tryCatch({ - 
white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2)) - print(paste("White test p-value:", format(white_test$p.value, digits = 5))) - }, error = function(e) { - print("White test not available for this model") - }) - - return(list(bp_p = bp_test$p.value)) -} - -# Function to check independence (no autocorrelation) -check_independence <- function(model, model_name) { - # Durbin-Watson test - dw_test <- durbinWatsonTest(model) - print(paste("Durbin-Watson statistic:", format(dw_test$dw, digits = 5))) - print(paste("Durbin-Watson p-value:", format(dw_test$p, digits = 5))) - - # Residuals vs Order plot - residuals_vs_order <- ggplot(data.frame( - residuals = residuals(model), - order = seq_along(residuals(model)) - ), aes(x = order, y = residuals)) + - geom_point(color = "black") + - geom_hline(yintercept = 0, linetype = "dashed", color = "red") + - ggtitle(paste("Residuals vs Order:", model_name)) + - theme_minimal() - print(residuals_vs_order) - - return(list(dw_stat = dw_test$dw, dw_p = dw_test$p)) -} - -# Function to check for influential observations -check_influence <- function(model, model_name) { - # Cook's Distance plot - plot(model, which = 4, main = paste("Cook's Distance:", model_name)) - - # Calculate influence measures - cooks_d <- cooks.distance(model) - leverage <- hatvalues(model) - dffits_val <- dffits(model) - - # Identify influential observations - cooks_threshold <- 4/length(cooks_d) # Cook's D threshold - leverage_threshold <- 2 * (length(coef(model))/nobs(model)) # Leverage threshold - dffits_threshold <- 2 * sqrt(length(coef(model))/nobs(model)) # DFFITS threshold - - influential_cooks <- which(cooks_d > cooks_threshold) - influential_leverage <- which(leverage > leverage_threshold) - influential_dffits <- which(abs(dffits_val) > dffits_threshold) - - print(paste("Cook's Distance threshold:", format(cooks_threshold, digits = 5))) - print(paste("Influential observations (Cook's D):", length(influential_cooks))) - print(paste("Leverage threshold:", format(leverage_threshold, digits = 5))) - print(paste("High leverage observations:", length(influential_leverage))) - print(paste("DFFITS threshold:", format(dffits_threshold, digits = 5))) - print(paste("Influential observations (DFFITS):", length(influential_dffits))) - - if (length(influential_cooks) > 0) { - print(paste("Cook's D influential cases:", paste(influential_cooks, collapse = ", "))) - } - if (length(influential_leverage) > 0) { - print(paste("High leverage cases:", paste(influential_leverage, collapse = ", "))) - } - if (length(influential_dffits) > 0) { - print(paste("DFFITS influential cases:", paste(influential_dffits, collapse = ", "))) - } - - return(list(influential_cooks = influential_cooks, - influential_leverage = influential_leverage, - influential_dffits = influential_dffits)) -} - -# Function to get comprehensive model summary -get_model_summary <- function(model, model_name) { - # Basic model summary - summary_model <- summary(model) - print(summary_model) - - # R-squared and adjusted R-squared - print(paste("R-squared:", format(summary_model$r.squared, digits = 5))) - print(paste("Adjusted R-squared:", format(summary_model$adj.r.squared, digits = 5))) - - # AIC and BIC - aic_val <- AIC(model) - bic_val <- BIC(model) - print(paste("AIC:", format(aic_val, digits = 5))) - print(paste("BIC:", format(bic_val, digits = 5))) - - return(list(summary = summary_model, r_squared = summary_model$r.squared, - adj_r_squared = summary_model$adj.r.squared, aic = aic_val, bic = bic_val)) -} - -# 
============================================================================= -# RUN ASSUMPTION CHECKS - MODEL BY MODEL -# ============================================================================= - - -# Model 1: Sex → EOHI-DGEN Mean - -model1_summary <- get_model_summary(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_normality <- check_normality(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_homosced <- check_homoscedasticity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_independence <- check_independence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_influence <- check_influence(models$sex_eohiDGEN, "Sex → EOHI-DGEN") -model1_linearity <- check_linearity(models$sex_eohiDGEN, "Sex → EOHI-DGEN") - -# Model 2: Age (centered) → EOHI-DGEN Mean - -model2_summary <- get_model_summary(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_normality <- check_normality(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_homosced <- check_homoscedasticity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_independence <- check_independence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_influence <- check_influence(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") -model2_linearity <- check_linearity(models$age_eohiDGEN, "Age (centered) → EOHI-DGEN") - -# Model 3: Education → EOHI-DGEN Mean - -model3_summary <- get_model_summary(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_normality <- check_normality(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_homosced <- check_homoscedasticity(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_independence <- check_independence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_influence <- check_influence(models$edu_eohiDGEN, "Education → EOHI-DGEN") -model3_linearity <- check_linearity(models$edu_eohiDGEN, "Education → EOHI-DGEN") - -# Model 4: Sex → EHI-Global Mean - -model4_summary <- get_model_summary(models$sex_ehi_global, "Sex → EHI-Global") -model4_normality <- check_normality(models$sex_ehi_global, "Sex → EHI-Global") -model4_homosced <- check_homoscedasticity(models$sex_ehi_global, "Sex → EHI-Global") -model4_independence <- check_independence(models$sex_ehi_global, "Sex → EHI-Global") -model4_influence <- check_influence(models$sex_ehi_global, "Sex → EHI-Global") -model4_linearity <- check_linearity(models$sex_ehi_global, "Sex → EHI-Global") - -# Model 5: Age (centered) → EHI-Global Mean - -model5_summary <- get_model_summary(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_normality <- check_normality(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_homosced <- check_homoscedasticity(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_independence <- check_independence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_influence <- check_influence(models$age_ehi_global, "Age (centered) → EHI-Global") -model5_linearity <- check_linearity(models$age_ehi_global, "Age (centered) → EHI-Global") - -# Model 6: Education → EHI-Global Mean - -model6_summary <- get_model_summary(models$edu_ehi_global, "Education → EHI-Global") -model6_normality <- check_normality(models$edu_ehi_global, "Education → EHI-Global") -model6_homosced <- check_homoscedasticity(models$edu_ehi_global, "Education → EHI-Global") -model6_independence <- check_independence(models$edu_ehi_global, "Education → EHI-Global") -model6_influence <- check_influence(models$edu_ehi_global, "Education → EHI-Global") -model6_linearity <- check_linearity(models$edu_ehi_global, "Education → 
EHI-Global") - -# ============================================================================= -# SUMMARY TABLE OF ASSUMPTION VIOLATIONS -# ============================================================================= - - -# Create summary table -violation_summary <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - Normality = c( - ifelse(model1_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_normality$shapiro_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_normality$shapiro_p < 0.05, "VIOLATED", "OK") - ), - Homoscedasticity = c( - ifelse(model1_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_homosced$bp_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_homosced$bp_p < 0.05, "VIOLATED", "OK") - ), - Independence = c( - ifelse(model1_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model2_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model3_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model4_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model5_independence$dw_p < 0.05, "VIOLATED", "OK"), - ifelse(model6_independence$dw_p < 0.05, "VIOLATED", "OK") - ), - Influential_Obs = c( - ifelse(length(model1_influence$influential_cooks) > 0, - paste("YES (", length(model1_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model2_influence$influential_cooks) > 0, - paste("YES (", length(model2_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model3_influence$influential_cooks) > 0, - paste("YES (", length(model3_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model4_influence$influential_cooks) > 0, - paste("YES (", length(model4_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model5_influence$influential_cooks) > 0, - paste("YES (", length(model5_influence$influential_cooks), ")", sep = ""), "NO"), - ifelse(length(model6_influence$influential_cooks) > 0, - paste("YES (", length(model6_influence$influential_cooks), ")", sep = ""), "NO") - ), - stringsAsFactors = FALSE -) - -print(violation_summary) - -# ============================================================================= -# MODEL COMPARISON TABLE -# ============================================================================= - - -# Create model comparison table -comparison_table <- data.frame( - Model = c("Sex → EOHI-DGEN", "Age (centered) → EOHI-DGEN", "Education → EOHI-DGEN", - "Sex → EHI-Global", "Age (centered) → EHI-Global", "Education → EHI-Global"), - R_Squared = c(model1_summary$r_squared, model2_summary$r_squared, model3_summary$r_squared, - model4_summary$r_squared, model5_summary$r_squared, model6_summary$r_squared), - Adj_R_Squared = c(model1_summary$adj_r_squared, model2_summary$adj_r_squared, model3_summary$adj_r_squared, - model4_summary$adj_r_squared, model5_summary$adj_r_squared, model6_summary$adj_r_squared), - AIC = c(model1_summary$aic, model2_summary$aic, model3_summary$aic, - model4_summary$aic, model5_summary$aic, model6_summary$aic), - BIC = c(model1_summary$bic, model2_summary$bic, model3_summary$bic, - 
model4_summary$bic, model5_summary$bic, model6_summary$bic),
-  Significant = c(
-    ifelse(model1_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model2_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model3_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model4_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model5_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO"),
-    ifelse(model6_summary$summary$coefficients[2, 4] < 0.05, "YES", "NO")
-  ),
-  stringsAsFactors = FALSE
-)
-
-print(comparison_table)
-
diff --git a/.history/eohi1/reliability_analysis_cronbach_alpha_20250917154720.r b/.history/eohi1/reliability_analysis_cronbach_alpha_20250917154720.r
deleted file mode 100644
index a12e156..0000000
--- a/.history/eohi1/reliability_analysis_cronbach_alpha_20250917154720.r
+++ /dev/null
@@ -1,202 +0,0 @@
-# Cronbach's Alpha Reliability Analysis
-# For 4 domains (preferences, personality, values, life satisfaction) at 2 time points
-# 5 items per domain per time point
-
-# Load required libraries
-library(psych)
-library(dplyr)
-library(tidyr)
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# Define the scale variables for each domain and time point
-# Past time point scales
-past_pref_vars <- c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv",
-                    "NPastDiff_pref_nap", "NPastDiff_pref_travel")
-
-past_pers_vars <- c("NPastDiff_pers_extravert", "NPastDiff_pers_critical", "NPastDiff_pers_dependable",
-                    "NPastDiff_pers_anxious", "NPastDiff_pers_complex")
-
-past_val_vars <- c("NPastDiff_val_obey", "NPastDiff_val_trad", "NPastDiff_val_opinion",
-                   "NPastDiff_val_performance", "NPastDiff_val_justice")
-
-past_life_vars <- c("NPastDiff_life_ideal", "NPastDiff_life_excellent", "NPastDiff_life_satisfied",
-                    "NPastDiff_life_important", "NPastDiff_life_change")
-
-# Future time point scales
-fut_pref_vars <- c("NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv",
-                   "NFutDiff_pref_nap", "NFutDiff_pref_travel")
-
-fut_pers_vars <- c("NFutDiff_pers_extravert", "NFutDiff_pers_critical", "NFutDiff_pers_dependable",
-                   "NFutDiff_pers_anxious", "NFutDiff_pers_complex")
-
-fut_val_vars <- c("NFutDiff_val_obey", "NFutDiff_val_trad", "NFutDiff_val_opinion",
-                  "NFutDiff_val_performance", "NFutDiff_val_justice")
-
-fut_life_vars <- c("NFutDiff_life_ideal", "NFutDiff_life_excellent", "NFutDiff_life_satisfied",
-                   "NFutDiff_life_important", "NFutDiff_life_change")
-
-# Function to calculate Cronbach's alpha and return detailed results
-calc_cronbach_alpha <- function(data, var_names, scale_name) {
-  # Check for missing values
-  scale_data <- data[, var_names]
-  missing_info <- data.frame(
-    Variable = var_names,
-    Missing_Count = colSums(is.na(scale_data)),
-    Missing_Percent = round(colSums(is.na(scale_data)) / nrow(scale_data) * 100, 2)
-  )
-
-  # Remove rows with any missing values for reliability analysis
-  complete_data <- scale_data[complete.cases(scale_data), ]
-
-  cat("\n", strrep("=", 60), "\n")
-  cat("SCALE:", scale_name, "\n")
-  cat(strrep("=", 60), "\n")
-
-  cat("Sample size for reliability analysis:", nrow(complete_data), "\n")
-  cat("Original sample size:", nrow(data), "\n")
-  cat("Cases removed due to missing data:", nrow(data) - nrow(complete_data), "\n\n")
-
-  cat("Missing data summary:\n")
-  print(missing_info)
-
-  if(nrow(complete_data) < 3) {
-    cat("\nWARNING: Insufficient complete cases for reliability analysis\n")
-    return(NULL)
-  }
-
-  # Calculate Cronbach's alpha
-  alpha_result <- alpha(complete_data, check.keys = TRUE)
-
-  cat("\nCronbach's Alpha Results:\n")
-  cat("Raw alpha:", round(alpha_result$total$raw_alpha, 4), "\n")
-  cat("Standardized alpha:", round(alpha_result$total$std.alpha, 4), "\n")
-  cat("Average inter-item correlation:", round(alpha_result$total$average_r, 4), "\n")
-  cat("Number of items:", alpha_result$total$nvar, "\n")
-
-  # Item statistics
-  cat("\nItem Statistics:\n")
-  item_stats <- data.frame(
-    Item = var_names,
-    Alpha_if_deleted = round(alpha_result$alpha.drop$raw_alpha, 4),
-    Item_total_correlation = round(alpha_result$item.stats$r.drop, 4),
-    Mean = round(alpha_result$item.stats$mean, 4),
-    SD = round(alpha_result$item.stats$sd, 4)
-  )
-  print(item_stats)
-
-  # Check assumptions
-  cat("\nAssumption Checks:\n")
-
-  # 1. Check for sufficient sample size (minimum 30 recommended)
-  sample_size_ok <- nrow(complete_data) >= 30
-  cat("Sample size adequate (≥30):", sample_size_ok, "\n")
-
-  # 2. Check for adequate inter-item correlations (should be > 0.30)
-  inter_item_cors <- cor(complete_data)
-  inter_item_cors[lower.tri(inter_item_cors)] <- NA
-  diag(inter_item_cors) <- NA
-  inter_item_cors_flat <- as.vector(inter_item_cors)
-  inter_item_cors_flat <- inter_item_cors_flat[!is.na(inter_item_cors_flat)]
-  adequate_cors <- sum(inter_item_cors_flat > 0.30) / length(inter_item_cors_flat)
-  cat("Proportion of inter-item correlations > 0.30:", round(adequate_cors, 4), "\n")
-
-  # 3. Check for negative correlations (concerning for unidimensionality)
-  negative_cors <- sum(inter_item_cors_flat < 0, na.rm = TRUE)
-  cat("Number of negative inter-item correlations:", negative_cors, "\n")
-
-  # 4. Check item variances (should be roughly similar)
-  item_vars <- apply(complete_data, 2, var)
-  var_ratio <- max(item_vars) / min(item_vars)
-  cat("Ratio of highest to lowest item variance:", round(var_ratio, 4), "\n")
-
-  return(alpha_result)
-}
-
-# Helpers for the summary table: extract a rounded alpha (NA when the scale
-# could not be analysed) and classify it against conventional cutoffs
-alpha_or_na <- function(alpha_result) {
-  if(is.null(alpha_result)) NA else round(alpha_result$total$raw_alpha, 4)
-}
-
-interpret_alpha <- function(alpha_result) {
-  if(is.null(alpha_result)) return("Insufficient data")
-  alpha_val <- alpha_result$total$raw_alpha
-  if(alpha_val >= 0.90) "Excellent" else if(alpha_val >= 0.80) "Good" else if(alpha_val >= 0.70) "Acceptable" else if(alpha_val >= 0.60) "Questionable" else "Poor"
-}
-
-# Calculate Cronbach's alpha for all scales
-cat("CRONBACH'S ALPHA RELIABILITY ANALYSIS")
-cat("\nData: exp1.csv")
-cat("\nTotal sample size:", nrow(data))
-
-# Past time point analyses
-past_pref_alpha <- calc_cronbach_alpha(data, past_pref_vars, "Past Preferences")
-past_pers_alpha <- calc_cronbach_alpha(data, past_pers_vars, "Past Personality")
-past_val_alpha <- calc_cronbach_alpha(data, past_val_vars, "Past Values")
-past_life_alpha <- calc_cronbach_alpha(data, past_life_vars, "Past Life Satisfaction")
-
-# Future time point analyses
-fut_pref_alpha <- calc_cronbach_alpha(data, fut_pref_vars, "Future Preferences")
-fut_pers_alpha <- calc_cronbach_alpha(data, fut_pers_vars, "Future Personality")
-fut_val_alpha <- calc_cronbach_alpha(data, fut_val_vars, "Future Values")
-fut_life_alpha <- calc_cronbach_alpha(data, fut_life_vars, "Future Life Satisfaction")
-
-# Summary table
-cat("\n", strrep("=", 80), "\n")
-cat("SUMMARY OF CRONBACH'S ALPHA COEFFICIENTS")
-cat("\n", strrep("=", 80), "\n")
-
-summary_results <- data.frame(
-  Scale = c("Past Preferences", "Past Personality", "Past Values", "Past Life Satisfaction",
-            "Future Preferences", "Future Personality", "Future Values", "Future Life Satisfaction"),
-  Alpha = c(alpha_or_na(past_pref_alpha), alpha_or_na(past_pers_alpha),
-            alpha_or_na(past_val_alpha), alpha_or_na(past_life_alpha),
-            alpha_or_na(fut_pref_alpha), alpha_or_na(fut_pers_alpha),
-            alpha_or_na(fut_val_alpha), alpha_or_na(fut_life_alpha)),
-  Items = rep(5, 8),
-  Interpretation = c(interpret_alpha(past_pref_alpha), interpret_alpha(past_pers_alpha),
-                     interpret_alpha(past_val_alpha), interpret_alpha(past_life_alpha),
-                     interpret_alpha(fut_pref_alpha), interpret_alpha(fut_pers_alpha),
-                     interpret_alpha(fut_val_alpha), interpret_alpha(fut_life_alpha))
-)
-
-print(summary_results)
-
-# Save results to CSV
-write.csv(summary_results, "cronbach_alpha_summary.csv", row.names = FALSE)
-cat("\nSummary results saved to: cronbach_alpha_summary.csv\n")
-
-cat("\n", strrep("=", 80), "\n")
-cat("INTERPRETATION GUIDE FOR CRONBACH'S ALPHA")
-cat("\n", strrep("=", 80), "\n")
-cat("α ≥ 0.90: Excellent reliability\n")
-cat("α ≥ 0.80: Good reliability\n")
-cat("α ≥ 0.70: Acceptable reliability\n")
-cat("α ≥ 0.60: Questionable reliability\n")
-cat("α < 0.60: Poor reliability\n")
diff --git a/.history/eohi1/reliability_analysis_cronbach_alpha_20250917154729.r b/.history/eohi1/reliability_analysis_cronbach_alpha_20250917154729.r
deleted file mode 100644
index a12e156..0000000
--- a/.history/eohi1/reliability_analysis_cronbach_alpha_20250917154729.r
+++ /dev/null
@@ -1,202 +0,0 @@
-# Cronbach's Alpha Reliability Analysis
-# For 4 domains (preferences, personality, values, life satisfaction) at 2 time points
-# 5 items per domain per time point
-
-# Load required libraries
-library(psych)
-library(dplyr)
-library(tidyr)
-
-# Read the data
-data <- read.csv("exp1.csv")
-
-# 
Define the scale variables for each domain and time point -# Past time point scales -past_pref_vars <- c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", - "NPastDiff_pref_nap", "NPastDiff_pref_travel") - -past_pers_vars <- c("NPastDiff_pers_extravert", "NPastDiff_pers_critical", "NPastDiff_pers_dependable", - "NPastDiff_pers_anxious", "NPastDiff_pers_complex") - -past_val_vars <- c("NPastDiff_val_obey", "NPastDiff_val_trad", "NPastDiff_val_opinion", - "NPastDiff_val_performance", "NPastDiff_val_justice") - -past_life_vars <- c("NPastDiff_life_ideal", "NPastDiff_life_excellent", "NPastDiff_life_satisfied", - "NPastDiff_life_important", "NPastDiff_life_change") - -# Future time point scales -fut_pref_vars <- c("NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", - "NFutDiff_pref_nap", "NFutDiff_pref_travel") - -fut_pers_vars <- c("NFutDiff_pers_extravert", "NFutDiff_pers_critical", "NFutDiff_pers_dependable", - "NFutDiff_pers_anxious", "NFutDiff_pers_complex") - -fut_val_vars <- c("NFutDiff_val_obey", "NFutDiff_val_trad", "NFutDiff_val_opinion", - "NFutDiff_val_performance", "NFutDiff_val_justice") - -fut_life_vars <- c("NFutDiff_life_ideal", "NFutDiff_life_excellent", "NFutDiff_life_satisfied", - "NFutDiff_life_important", "NFutDiff_life_change") - -# Function to calculate Cronbach's alpha and return detailed results -calc_cronbach_alpha <- function(data, var_names, scale_name) { - # Check for missing values - scale_data <- data[, var_names] - missing_info <- data.frame( - Variable = var_names, - Missing_Count = colSums(is.na(scale_data)), - Missing_Percent = round(colSums(is.na(scale_data)) / nrow(scale_data) * 100, 2) - ) - - # Remove rows with any missing values for reliability analysis - complete_data <- scale_data[complete.cases(scale_data), ] - - cat("\n", "="*60, "\n") - cat("SCALE:", scale_name, "\n") - cat("="*60, "\n") - - cat("Sample size for reliability analysis:", nrow(complete_data), "\n") - cat("Original sample size:", nrow(data), "\n") - cat("Cases removed due to missing data:", nrow(data) - nrow(complete_data), "\n\n") - - cat("Missing data summary:\n") - print(missing_info) - - if(nrow(complete_data) < 3) { - cat("\nWARNING: Insufficient complete cases for reliability analysis\n") - return(NULL) - } - - # Calculate Cronbach's alpha - alpha_result <- alpha(complete_data, check.keys = TRUE) - - cat("\nCronbach's Alpha Results:\n") - cat("Raw alpha:", round(alpha_result$total$raw_alpha, 4), "\n") - cat("Standardized alpha:", round(alpha_result$total$std.alpha, 4), "\n") - cat("Average inter-item correlation:", round(alpha_result$total$average_r, 4), "\n") - cat("Number of items:", alpha_result$total$nvar, "\n") - - # Item statistics - cat("\nItem Statistics:\n") - item_stats <- data.frame( - Item = var_names, - Alpha_if_deleted = round(alpha_result$alpha.drop$raw_alpha, 4), - Item_total_correlation = round(alpha_result$item.stats$r.drop, 4), - Mean = round(alpha_result$item.stats$mean, 4), - SD = round(alpha_result$item.stats$sd, 4) - ) - print(item_stats) - - # Check assumptions - cat("\nAssumption Checks:\n") - - # 1. Check for sufficient sample size (minimum 30 recommended) - sample_size_ok <- nrow(complete_data) >= 30 - cat("Sample size adequate (≥30):", sample_size_ok, "\n") - - # 2. 
Check for adequate inter-item correlations (should be > 0.30) - inter_item_cors <- cor(complete_data) - inter_item_cors[lower.tri(inter_item_cors)] <- NA - diag(inter_item_cors) <- NA - inter_item_cors_flat <- as.vector(inter_item_cors) - inter_item_cors_flat <- inter_item_cors_flat[!is.na(inter_item_cors_flat)] - adequate_cors <- sum(inter_item_cors_flat > 0.30) / length(inter_item_cors_flat) - cat("Proportion of inter-item correlations > 0.30:", round(adequate_cors, 4), "\n") - - # 3. Check for negative correlations (concerning for unidimensionality) - negative_cors <- sum(inter_item_cors_flat < 0, na.rm = TRUE) - cat("Number of negative inter-item correlations:", negative_cors, "\n") - - # 4. Check item variances (should be roughly similar) - item_vars <- apply(complete_data, 2, var) - var_ratio <- max(item_vars) / min(item_vars) - cat("Ratio of highest to lowest item variance:", round(var_ratio, 4), "\n") - - return(alpha_result) -} - -# Calculate Cronbach's alpha for all scales -cat("CRONBACH'S ALPHA RELIABILITY ANALYSIS") -cat("\nData: exp1.csv") -cat("\nTotal sample size:", nrow(data)) - -# Past time point analyses -past_pref_alpha <- calc_cronbach_alpha(data, past_pref_vars, "Past Preferences") -past_pers_alpha <- calc_cronbach_alpha(data, past_pers_vars, "Past Personality") -past_val_alpha <- calc_cronbach_alpha(data, past_val_vars, "Past Values") -past_life_alpha <- calc_cronbach_alpha(data, past_life_vars, "Past Life Satisfaction") - -# Future time point analyses -fut_pref_alpha <- calc_cronbach_alpha(data, fut_pref_vars, "Future Preferences") -fut_pers_alpha <- calc_cronbach_alpha(data, fut_pers_vars, "Future Personality") -fut_val_alpha <- calc_cronbach_alpha(data, fut_val_vars, "Future Values") -fut_life_alpha <- calc_cronbach_alpha(data, fut_life_vars, "Future Life Satisfaction") - -# Summary table -cat("\n", "="*80, "\n") -cat("SUMMARY OF CRONBACH'S ALPHA COEFFICIENTS") -cat("\n", "="*80, "\n") - -summary_results <- data.frame( - Scale = c("Past Preferences", "Past Personality", "Past Values", "Past Life Satisfaction", - "Future Preferences", "Future Personality", "Future Values", "Future Life Satisfaction"), - Alpha = c( - if(!is.null(past_pref_alpha)) round(past_pref_alpha$total$raw_alpha, 4) else NA, - if(!is.null(past_pers_alpha)) round(past_pers_alpha$total$raw_alpha, 4) else NA, - if(!is.null(past_val_alpha)) round(past_val_alpha$total$raw_alpha, 4) else NA, - if(!is.null(past_life_alpha)) round(past_life_alpha$total$raw_alpha, 4) else NA, - if(!is.null(fut_pref_alpha)) round(fut_pref_alpha$total$raw_alpha, 4) else NA, - if(!is.null(fut_pers_alpha)) round(fut_pers_alpha$total$raw_alpha, 4) else NA, - if(!is.null(fut_val_alpha)) round(fut_val_alpha$total$raw_alpha, 4) else NA, - if(!is.null(fut_life_alpha)) round(fut_life_alpha$total$raw_alpha, 4) else NA - ), - Items = rep(5, 8), - Interpretation = c( - if(!is.null(past_pref_alpha)) { - alpha_val <- past_pref_alpha$total$raw_alpha - if(alpha_val >= 0.90) "Excellent" else if(alpha_val >= 0.80) "Good" else if(alpha_val >= 0.70) "Acceptable" else if(alpha_val >= 0.60) "Questionable" else "Poor" - } else "Insufficient data", - if(!is.null(past_pers_alpha)) { - alpha_val <- past_pers_alpha$total$raw_alpha - if(alpha_val >= 0.90) "Excellent" else if(alpha_val >= 0.80) "Good" else if(alpha_val >= 0.70) "Acceptable" else if(alpha_val >= 0.60) "Questionable" else "Poor" - } else "Insufficient data", - if(!is.null(past_val_alpha)) { - alpha_val <- past_val_alpha$total$raw_alpha - if(alpha_val >= 0.90) "Excellent" else 
if(alpha_val >= 0.80) "Good" else if(alpha_val >= 0.70) "Acceptable" else if(alpha_val >= 0.60) "Questionable" else "Poor" - } else "Insufficient data", - if(!is.null(past_life_alpha)) { - alpha_val <- past_life_alpha$total$raw_alpha - if(alpha_val >= 0.90) "Excellent" else if(alpha_val >= 0.80) "Good" else if(alpha_val >= 0.70) "Acceptable" else if(alpha_val >= 0.60) "Questionable" else "Poor" - } else "Insufficient data", - if(!is.null(fut_pref_alpha)) { - alpha_val <- fut_pref_alpha$total$raw_alpha - if(alpha_val >= 0.90) "Excellent" else if(alpha_val >= 0.80) "Good" else if(alpha_val >= 0.70) "Acceptable" else if(alpha_val >= 0.60) "Questionable" else "Poor" - } else "Insufficient data", - if(!is.null(fut_pers_alpha)) { - alpha_val <- fut_pers_alpha$total$raw_alpha - if(alpha_val >= 0.90) "Excellent" else if(alpha_val >= 0.80) "Good" else if(alpha_val >= 0.70) "Acceptable" else if(alpha_val >= 0.60) "Questionable" else "Poor" - } else "Insufficient data", - if(!is.null(fut_val_alpha)) { - alpha_val <- fut_val_alpha$total$raw_alpha - if(alpha_val >= 0.90) "Excellent" else if(alpha_val >= 0.80) "Good" else if(alpha_val >= 0.70) "Acceptable" else if(alpha_val >= 0.60) "Questionable" else "Poor" - } else "Insufficient data", - if(!is.null(fut_life_alpha)) { - alpha_val <- fut_life_alpha$total$raw_alpha - if(alpha_val >= 0.90) "Excellent" else if(alpha_val >= 0.80) "Good" else if(alpha_val >= 0.70) "Acceptable" else if(alpha_val >= 0.60) "Questionable" else "Poor" - } else "Insufficient data" - ) -) - -print(summary_results) - -# Save results to CSV -write.csv(summary_results, "cronbach_alpha_summary.csv", row.names = FALSE) -cat("\nSummary results saved to: cronbach_alpha_summary.csv\n") - -cat("\n", "="*80, "\n") -cat("INTERPRETATION GUIDE FOR CRONBACH'S ALPHA") -cat("\n", "="*80, "\n") -cat("α ≥ 0.90: Excellent reliability\n") -cat("α ≥ 0.80: Good reliability\n") -cat("α ≥ 0.70: Acceptable reliability\n") -cat("α ≥ 0.60: Questionable reliability\n") -cat("α < 0.60: Poor reliability\n") diff --git a/.history/eohi1/reliability_analysis_cronbach_alpha_20250918120701.r b/.history/eohi1/reliability_analysis_cronbach_alpha_20250918120701.r deleted file mode 100644 index f3fa96f..0000000 --- a/.history/eohi1/reliability_analysis_cronbach_alpha_20250918120701.r +++ /dev/null @@ -1,204 +0,0 @@ -# Cronbach's Alpha Reliability Analysis -# For 4 domains (preferences, personality, values, life satisfaction) at 2 time points -# 5 items per domain per time point - -# Load required libraries -library(psych) -library(dplyr) -library(tidyr) - -# Read the data -data <- read.csv("exp1.csv") - -# Define the scale variables for each domain and time point -# Past time point scales -past_pref_vars <- c("NPastDiff_pref_read", "NPastDiff_pref_music", "NPastDiff_pref_tv", - "NPastDiff_pref_nap", "NPastDiff_pref_travel") - -past_pers_vars <- c("NPastDiff_pers_extravert", "NPastDiff_pers_critical", "NPastDiff_pers_dependable", - "NPastDiff_pers_anxious", "NPastDiff_pers_complex") - -past_val_vars <- c("NPastDiff_val_obey", "NPastDiff_val_trad", "NPastDiff_val_opinion", - "NPastDiff_val_performance", "NPastDiff_val_justice") - -past_life_vars <- c("NPastDiff_life_ideal", "NPastDiff_life_excellent", "NPastDiff_life_satisfied", - "NPastDiff_life_important", "NPastDiff_life_change") - -# Future time point scales -fut_pref_vars <- c("NFutDiff_pref_read", "NFutDiff_pref_music", "NFutDiff_pref_tv", - "NFutDiff_pref_nap", "NFutDiff_pref_travel") - -fut_pers_vars <- c("NFutDiff_pers_extravert", "NFutDiff_pers_critical", 
"NFutDiff_pers_dependable", - "NFutDiff_pers_anxious", "NFutDiff_pers_complex") - -fut_val_vars <- c("NFutDiff_val_obey", "NFutDiff_val_trad", "NFutDiff_val_opinion", - "NFutDiff_val_performance", "NFutDiff_val_justice") - -fut_life_vars <- c("NFutDiff_life_ideal", "NFutDiff_life_excellent", "NFutDiff_life_satisfied", - "NFutDiff_life_important", "NFutDiff_life_change") - -# Function to calculate Cronbach's alpha and return detailed results -calc_cronbach_alpha <- function(data, var_names, scale_name) { - # Check for missing values - scale_data <- data[, var_names] - missing_info <- data.frame( - Variable = var_names, - Missing_Count = colSums(is.na(scale_data)), - Missing_Percent = round(colSums(is.na(scale_data)) / nrow(scale_data) * 100, 2) - ) - - # Remove rows with any missing values for reliability analysis - complete_data <- scale_data[complete.cases(scale_data), ] - - cat("\n", "="*60, "\n") - cat("SCALE:", scale_name, "\n") - cat("="*60, "\n") - - cat("Sample size for reliability analysis:", nrow(complete_data), "\n") - cat("Original sample size:", nrow(data), "\n") - cat("Cases removed due to missing data:", nrow(data) - nrow(complete_data), "\n\n") - - cat("Missing data summary:\n") - print(missing_info) - - if(nrow(complete_data) < 3) { - cat("\nWARNING: Insufficient complete cases for reliability analysis\n") - return(NULL) - } - - # Calculate Cronbach's alpha - alpha_result <- alpha(complete_data, check.keys = TRUE) - - cat("\nCronbach's Alpha Results:\n") - cat("Raw alpha:", round(alpha_result$total$raw_alpha, 4), "\n") - cat("Standardized alpha:", round(alpha_result$total$std.alpha, 4), "\n") - cat("Average inter-item correlation:", round(alpha_result$total$average_r, 4), "\n") - cat("Number of items:", alpha_result$total$nvar, "\n") - - # Item statistics - cat("\nItem Statistics:\n") - item_stats <- data.frame( - Item = var_names, - Alpha_if_deleted = round(alpha_result$alpha.drop$raw_alpha, 4), - Item_total_correlation = round(alpha_result$item.stats$r.drop, 4), - Mean = round(alpha_result$item.stats$mean, 4), - SD = round(alpha_result$item.stats$sd, 4) - ) - print(item_stats) - - # Check assumptions - cat("\nAssumption Checks:\n") - - # 1. Check for sufficient sample size (minimum 30 recommended) - sample_size_ok <- nrow(complete_data) >= 30 - cat("Sample size adequate (≥30):", sample_size_ok, "\n") - - # 2. Check for adequate inter-item correlations (should be > 0.30) - inter_item_cors <- cor(complete_data) - inter_item_cors[lower.tri(inter_item_cors)] <- NA - diag(inter_item_cors) <- NA - inter_item_cors_flat <- as.vector(inter_item_cors) - inter_item_cors_flat <- inter_item_cors_flat[!is.na(inter_item_cors_flat)] - adequate_cors <- sum(inter_item_cors_flat > 0.30) / length(inter_item_cors_flat) - cat("Proportion of inter-item correlations > 0.30:", round(adequate_cors, 4), "\n") - - # 3. Check for negative correlations (concerning for unidimensionality) - negative_cors <- sum(inter_item_cors_flat < 0, na.rm = TRUE) - cat("Number of negative inter-item correlations:", negative_cors, "\n") - - # 4. 
Check item variances (should be roughly similar) - item_vars <- apply(complete_data, 2, var) - var_ratio <- max(item_vars) / min(item_vars) - cat("Ratio of highest to lowest item variance:", round(var_ratio, 4), "\n") - - return(alpha_result) -} - -# Calculate Cronbach's alpha for all scales -cat("CRONBACH'S ALPHA RELIABILITY ANALYSIS") -cat("\nData: exp1.csv") -cat("\nTotal sample size:", nrow(data)) - -# Past time point analyses -past_pref_alpha <- calc_cronbach_alpha(data, past_pref_vars, "Past Preferences") -past_pers_alpha <- calc_cronbach_alpha(data, past_pers_vars, "Past Personality") -past_val_alpha <- calc_cronbach_alpha(data, past_val_vars, "Past Values") -past_life_alpha <- calc_cronbach_alpha(data, past_life_vars, "Past Life Satisfaction") - -# Future time point analyses -fut_pref_alpha <- calc_cronbach_alpha(data, fut_pref_vars, "Future Preferences") -fut_pers_alpha <- calc_cronbach_alpha(data, fut_pers_vars, "Future Personality") -fut_val_alpha <- calc_cronbach_alpha(data, fut_val_vars, "Future Values") -fut_life_alpha <- calc_cronbach_alpha(data, fut_life_vars, "Future Life Satisfaction") - -# Summary table -cat("\n", "="*80, "\n") -cat("SUMMARY OF CRONBACH'S ALPHA COEFFICIENTS") -cat("\n", "="*80, "\n") - -summary_results <- data.frame( - Scale = c("Past Preferences", "Past Personality", "Past Values", "Past Life Satisfaction", - "Future Preferences", "Future Personality", "Future Values", "Future Life Satisfaction"), - Alpha = c( - if(!is.null(past_pref_alpha)) round(past_pref_alpha$total$raw_alpha, 4) else NA, - if(!is.null(past_pers_alpha)) round(past_pers_alpha$total$raw_alpha, 4) else NA, - if(!is.null(past_val_alpha)) round(past_val_alpha$total$raw_alpha, 4) else NA, - if(!is.null(past_life_alpha)) round(past_life_alpha$total$raw_alpha, 4) else NA, - if(!is.null(fut_pref_alpha)) round(fut_pref_alpha$total$raw_alpha, 4) else NA, - if(!is.null(fut_pers_alpha)) round(fut_pers_alpha$total$raw_alpha, 4) else NA, - if(!is.null(fut_val_alpha)) round(fut_val_alpha$total$raw_alpha, 4) else NA, - if(!is.null(fut_life_alpha)) round(fut_life_alpha$total$raw_alpha, 4) else NA - ), - Items = rep(5, 8), - Interpretation = c( - if(!is.null(past_pref_alpha)) { - alpha_val <- past_pref_alpha$total$raw_alpha - if(alpha_val >= 0.90) "Excellent" else if(alpha_val >= 0.80) "Good" else if(alpha_val >= 0.70) "Acceptable" else if(alpha_val >= 0.60) "Questionable" else "Poor" - } else "Insufficient data", - if(!is.null(past_pers_alpha)) { - alpha_val <- past_pers_alpha$total$raw_alpha - if(alpha_val >= 0.90) "Excellent" else if(alpha_val >= 0.80) "Good" else if(alpha_val >= 0.70) "Acceptable" else if(alpha_val >= 0.60) "Questionable" else "Poor" - } else "Insufficient data", - if(!is.null(past_val_alpha)) { - alpha_val <- past_val_alpha$total$raw_alpha - if(alpha_val >= 0.90) "Excellent" else if(alpha_val >= 0.80) "Good" else if(alpha_val >= 0.70) "Acceptable" else if(alpha_val >= 0.60) "Questionable" else "Poor" - } else "Insufficient data", - if(!is.null(past_life_alpha)) { - alpha_val <- past_life_alpha$total$raw_alpha - if(alpha_val >= 0.90) "Excellent" else if(alpha_val >= 0.80) "Good" else if(alpha_val >= 0.70) "Acceptable" else if(alpha_val >= 0.60) "Questionable" else "Poor" - } else "Insufficient data", - if(!is.null(fut_pref_alpha)) { - alpha_val <- fut_pref_alpha$total$raw_alpha - if(alpha_val >= 0.90) "Excellent" else if(alpha_val >= 0.80) "Good" else if(alpha_val >= 0.70) "Acceptable" else if(alpha_val >= 0.60) "Questionable" else "Poor" - } else "Insufficient data", - 
if(!is.null(fut_pers_alpha)) { - alpha_val <- fut_pers_alpha$total$raw_alpha - if(alpha_val >= 0.90) "Excellent" else if(alpha_val >= 0.80) "Good" else if(alpha_val >= 0.70) "Acceptable" else if(alpha_val >= 0.60) "Questionable" else "Poor" - } else "Insufficient data", - if(!is.null(fut_val_alpha)) { - alpha_val <- fut_val_alpha$total$raw_alpha - if(alpha_val >= 0.90) "Excellent" else if(alpha_val >= 0.80) "Good" else if(alpha_val >= 0.70) "Acceptable" else if(alpha_val >= 0.60) "Questionable" else "Poor" - } else "Insufficient data", - if(!is.null(fut_life_alpha)) { - alpha_val <- fut_life_alpha$total$raw_alpha - if(alpha_val >= 0.90) "Excellent" else if(alpha_val >= 0.80) "Good" else if(alpha_val >= 0.70) "Acceptable" else if(alpha_val >= 0.60) "Questionable" else "Poor" - } else "Insufficient data" - ) -) - -print(summary_results) - -# Save results to CSV -write.csv(summary_results, "cronbach_alpha_summary.csv", row.names = FALSE) -cat("\nSummary results saved to: cronbach_alpha_summary.csv\n") - -cat("\n", "="*80, "\n") -cat("INTERPRETATION GUIDE FOR CRONBACH'S ALPHA") -cat("\n", "="*80, "\n") -cat("α ≥ 0.90: Excellent reliability\n") -cat("α ≥ 0.80: Good reliability\n") -cat("α ≥ 0.70: Acceptable reliability\n") -cat("α ≥ 0.60: Questionable reliability\n") -cat("α < 0.60: Poor reliability\n") - - diff --git a/.history/eohi1/test_knit_20251004194422.rmd b/.history/eohi1/test_knit_20251004194422.rmd deleted file mode 100644 index 3f9d533..0000000 --- a/.history/eohi1/test_knit_20251004194422.rmd +++ /dev/null @@ -1,22 +0,0 @@ ---- -title: "Test Knit" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -# Test - -This is a test to see if knitting works. - -```{r test} -library(tidyverse) -print("Libraries loaded successfully") -``` - -```{r test-data} -data <- read.csv("exp1.csv") -print(paste("Data loaded:", nrow(data), "rows")) -``` diff --git a/.history/eohi1/test_knit_20251004194431.rmd b/.history/eohi1/test_knit_20251004194431.rmd deleted file mode 100644 index 3f9d533..0000000 --- a/.history/eohi1/test_knit_20251004194431.rmd +++ /dev/null @@ -1,22 +0,0 @@ ---- -title: "Test Knit" -output: html_document ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -# Test - -This is a test to see if knitting works. - -```{r test} -library(tidyverse) -print("Libraries loaded successfully") -``` - -```{r test-data} -data <- read.csv("exp1.csv") -print(paste("Data loaded:", nrow(data), "rows")) -``` diff --git a/.history/eohi1/test_knit_20251004194642.rmd b/.history/eohi1/test_knit_20251004194642.rmd deleted file mode 100644 index 0519ecb..0000000 --- a/.history/eohi1/test_knit_20251004194642.rmd +++ /dev/null @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/.history/eohi2/README_Variable_Creation_20251001133606.txt b/.history/eohi2/README_Variable_Creation_20251001133606.txt deleted file mode 100644 index 4029a3e..0000000 --- a/.history/eohi2/README_Variable_Creation_20251001133606.txt +++ /dev/null @@ -1,425 +0,0 @@ -================================================================================ -EOHI2 DATA PROCESSING PIPELINE - VARIABLE CREATION DOCUMENTATION -================================================================================ - -This README documents the complete data processing pipeline for eohi2.csv. -All processing scripts should be run in the order listed below. 
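A minimal driver sketch for the pipeline, assuming the six dataP scripts and eohi2.csv sit in the current working directory (the pattern relies only on the script names listed in this README):

    # Source the processing scripts in numerical order (01 through 06);
    # sort() on the file names yields the required execution order.
    for (s in sort(list.files(pattern = "^dataP 0[1-6]"))) {
      source(s)
    }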
- -Source File: eohi2.csv -Processing Scripts: dataP 01 through dataP 06 - -================================================================================ -SCRIPT 01: dataP 01 - recode and combine past & future vars.r -================================================================================ - -PURPOSE: - Combines responses from two survey versions (01 and 02) and recodes Likert - scale text responses to numeric values for past and future time periods. - -VARIABLES CREATED: 60 total (15 items × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefItem_1 through 01fut10ValItem_5 (60 columns) - - Set B: 02past5PrefItem_1 through 02fut10ValItem_5 (60 columns) - -TARGET VARIABLES: - Past 5 Years (15 variables): - - past_5_pref_read, past_5_pref_music, past_5_pref_TV, past_5_pref_nap, - past_5_pref_travel - - past_5_pers_extravert, past_5_pers_critical, past_5_pers_dependable, - past_5_pers_anxious, past_5_pers_complex - - past_5_val_obey, past_5_val_trad, past_5_val_opinion, - past_5_val_performance, past_5_val_justice - - Past 10 Years (15 variables): - - past_10_pref_read, past_10_pref_music, past_10_pref_TV, past_10_pref_nap, - past_10_pref_travel - - past_10_pers_extravert, past_10_pers_critical, past_10_pers_dependable, - past_10_pers_anxious, past_10_pers_complex - - past_10_val_obey, past_10_val_trad, past_10_val_opinion, - past_10_val_performance, past_10_val_justice - - Future 5 Years (15 variables): - - fut_5_pref_read, fut_5_pref_music, fut_5_pref_TV, fut_5_pref_nap, - fut_5_pref_travel - - fut_5_pers_extravert, fut_5_pers_critical, fut_5_pers_dependable, - fut_5_pers_anxious, fut_5_pers_complex - - fut_5_val_obey, fut_5_val_trad, fut_5_val_opinion, - fut_5_val_performance, fut_5_val_justice - - Future 10 Years (15 variables): - - fut_10_pref_read, fut_10_pref_music, fut_10_pref_TV, fut_10_pref_nap, - fut_10_pref_travel - - fut_10_pers_extravert, fut_10_pers_critical, fut_10_pers_dependable, - fut_10_pers_anxious, fut_10_pers_complex - - fut_10_val_obey, fut_10_val_trad, fut_10_val_opinion, - fut_10_val_performance, fut_10_val_justice - -TRANSFORMATION LOGIC: - Step 1: Combine responses from Set A (01) and Set B (02) - - If Set A has a value, use Set A - - If Set A is empty, use Set B - - Step 2: Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -ITEM DOMAINS: - - Preferences (pref): Reading, Music, TV, Nap, Travel - - Personality (pers): Extravert, Critical, Dependable, Anxious, Complex - - Values (val): Obey, Tradition, Opinion, Performance, Justice - - -================================================================================ -SCRIPT 02: dataP 02 - recode present VARS.r -================================================================================ - -PURPOSE: - Recodes present-time Likert scale text responses to numeric values. 
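The recoding described here (and in Script 01) amounts to a single lookup table; a sketch of the logic, with the helper name recode_likert being illustrative rather than taken from the script:

    # Text responses map to -3..3; anything unmatched (including empty
    # strings) falls through to NA automatically.
    likert_map <- c("Strongly Disagree" = -3, "Disagree" = -2,
                    "Somewhat Disagree" = -1, "Neither Agree nor Disagree" = 0,
                    "Somewhat Agree" = 1, "Agree" = 2, "Strongly Agree" = 3)
    recode_likert <- function(x) unname(likert_map[trimws(as.character(x))])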
- -VARIABLES CREATED: 15 total - -SOURCE COLUMNS: - - prePrefItem_1 through prePrefItem_5 (5 columns) - - prePersItem_1 through prePersItem_5 (5 columns) - - preValItem_1 through preValItem_5 (5 columns) - -TARGET VARIABLES: - Present Time (15 variables): - - present_pref_read, present_pref_music, present_pref_tv, present_pref_nap, - present_pref_travel - - present_pers_extravert, present_pers_critical, present_pers_dependable, - present_pers_anxious, present_pers_complex - - present_val_obey, present_val_trad, present_val_opinion, - present_val_performance, present_val_justice - -TRANSFORMATION LOGIC: - Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -SPECIAL NOTE: - Present time uses "present_pref_tv" (lowercase) while past/future use - "past_5_pref_TV" (uppercase). This is intentional and preserved from the - original data structure. - - -================================================================================ -SCRIPT 03: dataP 03 - recode DGEN vars.r -================================================================================ - -PURPOSE: - Combines DGEN (domain general) responses from two survey versions (01 and 02). - These are single-item measures for each domain/time combination. - NO RECODING - just copies numeric values as-is. - -VARIABLES CREATED: 12 total (3 domains × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefDGEN_1, 01past5PersDGEN_1, 01past5ValDGEN_1, etc. - - Set B: 02past5PrefDGEN_1, 02past5PersDGEN_1, 02past5ValDGEN_1, etc. - -TARGET VARIABLES: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TRANSFORMATION LOGIC: - - If Set A (01) has a value, use Set A - - If Set A is empty, use Set B (02) - - NO RECODING: Values are copied directly as numeric - -SPECIAL NOTES: - - Future columns in raw data use "_8" suffix for Pref/Pers items - - Future Val columns use "ValuesDGEN" spelling in Set A, "ValDGEN" in Set B - - -================================================================================ -SCRIPT 04: dataP 04 - DGEN means.r -================================================================================ - -PURPOSE: - Calculates mean DGEN scores by averaging the three domain scores (Preferences, - Personality, Values) for each time period. - -VARIABLES CREATED: 4 total (1 per time period) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - - DGEN_past_5_mean - - DGEN_past_10_mean - - DGEN_fut_5_mean - - DGEN_fut_10_mean - -TRANSFORMATION LOGIC: - Each mean = (Pref + Pers + Val) / 3 - - NA values are excluded from calculation (na.rm = TRUE) - - -================================================================================ -SCRIPT 05: dataP 05 - recode scales VARS.r -================================================================================ - -PURPOSE: - Processes two cognitive scales: - 1. AOT (Actively Open-minded Thinking): 8-item scale with reverse coding - 2. 
================================================================================
SCRIPT 05: dataP 05 - recode scales VARS.r
================================================================================

PURPOSE:
  Processes two cognitive scales:
  1. AOT (Actively Open-minded Thinking): 8-item scale with reverse coding
  2. CRT (Cognitive Reflection Test): 3-item test with correct/intuitive
     scoring

VARIABLES CREATED: 3 total

SOURCE COLUMNS:
  AOT Scale:
  - aot_1, aot_2, aot_3, aot_4, aot_5, aot_6, aot_7, aot_8

  CRT Test:
  - crt_1, crt_2, crt_3

TARGET VARIABLES:
  - aot_total (mean of 8 items with reverse coding)
  - crt_correct (proportion of correct answers)
  - crt_int (proportion of intuitive/incorrect answers)

TRANSFORMATION LOGIC (see the sketch after this section):

  AOT Scale (aot_total):
  1. Items 4, 5, 6, and 7 are reverse coded by multiplying by -1 (valid
     because the scale is centered on 0, running from -3 to 3)
  2. Calculate the mean of all 8 items (with reverse coding applied)
  3. The original source values are NOT modified in the dataframe
  4. NA values are excluded from the calculation (na.rm = TRUE)

  CRT Correct (crt_correct):
    Correct answers:
    - crt_1: "5 cents"
    - crt_2: "5 minutes"
    - crt_3: "47 days"
    Calculation: (number of correct answers) / (number of non-missing answers)

  CRT Intuitive (crt_int):
    Intuitive (common incorrect) answers:
    - crt_1: "10 cents"
    - crt_2: "100 minutes"
    - crt_3: "24 days"
    Calculation: (number of intuitive answers) / (number of non-missing
    answers)

SPECIAL NOTES:
  - CRT scoring is case-insensitive and trims whitespace
  - Both CRT scores are proportions (0.00 to 1.00)
  - Empty/missing CRT responses are excluded from the denominator
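The scoring can be sketched as follows. The answer keys and reverse-coded
items follow this README; df, the working copy aot, and the helper
score_crt() are illustrative assumptions, not the script's own code.

  aot <- df[paste0("aot_", 1:8)]          # work on a copy; df stays untouched
  rev_items <- paste0("aot_", 4:7)
  aot[rev_items] <- -1 * aot[rev_items]   # reverse code items 4-7
  df$aot_total <- rowMeans(aot, na.rm = TRUE)

  crt_key       <- c(crt_1 = "5 cents",  crt_2 = "5 minutes",   crt_3 = "47 days")
  crt_intuitive <- c(crt_1 = "10 cents", crt_2 = "100 minutes", crt_3 = "24 days")

  score_crt <- function(data, key) {
    # Normalize case and whitespace, then drop blanks from the denominator
    ans <- sapply(names(key), function(col) trimws(tolower(data[[col]])))
    ans[ans == ""] <- NA_character_
    hits <- sweep(ans, 2, tolower(key), FUN = "==")
    rowSums(hits, na.rm = TRUE) / rowSums(!is.na(ans))  # all-missing row -> NaN
  }

  df$crt_correct <- score_crt(df, crt_key)
  df$crt_int     <- score_crt(df, crt_intuitive)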
================================================================================
SCRIPT 06: dataP 06 - time interval differences.r
================================================================================

PURPOSE:
  Calculates absolute differences between time intervals to measure perceived
  change across time periods for all 15 items.

VARIABLES CREATED: 90 total (6 difference types × 15 items)

SOURCE COLUMNS:
  - present_pref_read through present_val_justice (15 columns)
  - past_5_pref_read through past_5_val_justice (15 columns)
  - past_10_pref_read through past_10_val_justice (15 columns)
  - fut_5_pref_read through fut_5_val_justice (15 columns)
  - fut_10_pref_read through fut_10_val_justice (15 columns)

TARGET VARIABLES (by difference type):

  NPast_5 (Present vs Past 5 years) - 15 variables:
    Formula: |present - past_5|
    - NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, NPast_5_pref_nap,
      NPast_5_pref_travel
    - NPast_5_pers_extravert, NPast_5_pers_critical, NPast_5_pers_dependable,
      NPast_5_pers_anxious, NPast_5_pers_complex
    - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion,
      NPast_5_val_performance, NPast_5_val_justice

  NPast_10 (Present vs Past 10 years) - 15 variables:
    Formula: |present - past_10|
    - NPast_10_pref_read, NPast_10_pref_music, NPast_10_pref_TV,
      NPast_10_pref_nap, NPast_10_pref_travel
    - NPast_10_pers_extravert, NPast_10_pers_critical, NPast_10_pers_dependable,
      NPast_10_pers_anxious, NPast_10_pers_complex
    - NPast_10_val_obey, NPast_10_val_trad, NPast_10_val_opinion,
      NPast_10_val_performance, NPast_10_val_justice

  NFut_5 (Present vs Future 5 years) - 15 variables:
    Formula: |present - fut_5|
    - NFut_5_pref_read, NFut_5_pref_music, NFut_5_pref_TV, NFut_5_pref_nap,
      NFut_5_pref_travel
    - NFut_5_pers_extravert, NFut_5_pers_critical, NFut_5_pers_dependable,
      NFut_5_pers_anxious, NFut_5_pers_complex
    - NFut_5_val_obey, NFut_5_val_trad, NFut_5_val_opinion,
      NFut_5_val_performance, NFut_5_val_justice

  NFut_10 (Present vs Future 10 years) - 15 variables:
    Formula: |present - fut_10|
    - NFut_10_pref_read, NFut_10_pref_music, NFut_10_pref_TV, NFut_10_pref_nap,
      NFut_10_pref_travel
    - NFut_10_pers_extravert, NFut_10_pers_critical, NFut_10_pers_dependable,
      NFut_10_pers_anxious, NFut_10_pers_complex
    - NFut_10_val_obey, NFut_10_val_trad, NFut_10_val_opinion,
      NFut_10_val_performance, NFut_10_val_justice

  5.10past (Past 5 vs Past 10 years) - 15 variables:
    Formula: |past_5 - past_10|
    - 5.10past_pref_read, 5.10past_pref_music, 5.10past_pref_TV,
      5.10past_pref_nap, 5.10past_pref_travel
    - 5.10past_pers_extravert, 5.10past_pers_critical, 5.10past_pers_dependable,
      5.10past_pers_anxious, 5.10past_pers_complex
    - 5.10past_val_obey, 5.10past_val_trad, 5.10past_val_opinion,
      5.10past_val_performance, 5.10past_val_justice

  5.10fut (Future 5 vs Future 10 years) - 15 variables:
    Formula: |fut_5 - fut_10|
    - 5.10fut_pref_read, 5.10fut_pref_music, 5.10fut_pref_TV, 5.10fut_pref_nap,
      5.10fut_pref_travel
    - 5.10fut_pers_extravert, 5.10fut_pers_critical, 5.10fut_pers_dependable,
      5.10fut_pers_anxious, 5.10fut_pers_complex
    - 5.10fut_val_obey, 5.10fut_val_trad, 5.10fut_val_opinion,
      5.10fut_val_performance, 5.10fut_val_justice

TRANSFORMATION LOGIC:
  All calculations use absolute differences (see the sketch after this
  section):
  - NPast_5:  |present_[item] - past_5_[item]|
  - NPast_10: |present_[item] - past_10_[item]|
  - NFut_5:   |present_[item] - fut_5_[item]|
  - NFut_10:  |present_[item] - fut_10_[item]|
  - 5.10past: |past_5_[item] - past_10_[item]|
  - 5.10fut:  |fut_5_[item] - fut_10_[item]|

  Result: non-negative values representing the magnitude of change
  Missing values in either source column result in NA

SPECIAL NOTES:
  - Present time uses "pref_tv" (lowercase) while past/future use "pref_TV"
    (uppercase), so the script handles this naming inconsistency
  - All values are absolute differences (non-negative)
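One of the six difference families as a sketch (NPast_5, present vs. past 5
years); the other five follow the same pattern with different column
prefixes. The loop is illustrative, assuming a dataframe df; the sub() call
papers over the pref_TV / pref_tv case mismatch noted above.

  items <- c("pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel",
             "pers_extravert", "pers_critical", "pers_dependable",
             "pers_anxious", "pers_complex",
             "val_obey", "val_trad", "val_opinion",
             "val_performance", "val_justice")

  for (item in items) {
    present_col <- paste0("present_", sub("pref_TV", "pref_tv", item,
                                          fixed = TRUE))
    # abs() of a difference involving NA stays NA, matching the rule above
    df[[paste0("NPast_5_", item)]] <-
      abs(df[[present_col]] - df[[paste0("past_5_", item)]])
  }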
================================================================================
SUMMARY OF ALL CREATED VARIABLES
================================================================================

Total Variables Created: 184

By Script:
  - Script 01: 60 variables (past/future recoded items)
  - Script 02: 15 variables (present recoded items)
  - Script 03: 12 variables (DGEN domain scores)
  - Script 04:  4 variables (DGEN means)
  - Script 05:  3 variables (AOT & CRT scales)
  - Script 06: 90 variables (time interval differences)

By Category:
  - Time Period Items (75 total):
    * Present:   15 items
    * Past 5:    15 items
    * Past 10:   15 items
    * Future 5:  15 items
    * Future 10: 15 items

  - DGEN Variables (16 total):
    * Domain scores: 12 (3 domains × 4 time periods)
    * Mean scores:    4 (1 per time period)

  - Cognitive Scales (3 total):
    * AOT total
    * CRT correct
    * CRT intuitive

  - Time Differences (90 total):
    * NPast_5:  15 differences
    * NPast_10: 15 differences
    * NFut_5:   15 differences
    * NFut_10:  15 differences
    * 5.10past: 15 differences
    * 5.10fut:  15 differences


================================================================================
DATA PROCESSING NOTES
================================================================================

1. PROCESSING ORDER:
   Scripts MUST be run in numerical order (01 → 06), as later scripts depend
   on variables created by earlier scripts.

2. SURVEY VERSION HANDLING:
   - Two survey versions (01 and 02) were used
   - Scripts 01 and 03 combine these versions
   - Preference is given to version 01 when both exist

3. MISSING DATA:
   - Empty cells and NA values are preserved throughout processing
   - Calculations use na.rm = TRUE to exclude missing values from means
   - Difference calculations result in NA if either source value is missing

4. QUALITY ASSURANCE:
   - Each script includes QA checks that verify randomly selected rows
   - Manual calculation checks confirm the transformations are applied
     correctly
   - Column existence checks prevent errors from missing source data

5. FILE SAVING:
   - Most scripts save directly to eohi2.csv
   - Scripts 04 and 06 have commented-out write commands for review
   - Each script overwrites existing target columns if present

6. SPECIAL NAMING CONVENTIONS:
   - The "pref_tv" vs "pref_TV" inconsistency is maintained from the source
     data
   - DGEN variables use underscores (DGEN_past_5_Pref)
   - Difference variables use descriptive prefixes (NPast_5_, 5.10past_)


================================================================================
ITEM REFERENCE GUIDE
================================================================================

15 Core Items (used across all time periods):

PREFERENCES (5 items):
  1. pref_read - Reading preferences
  2. pref_music - Music preferences
  3. pref_TV/tv - TV watching preferences (note the case variation)
  4. pref_nap - Napping preferences
  5. pref_travel - Travel preferences

PERSONALITY (5 items):
  6. pers_extravert - Extraverted personality
  7. pers_critical - Critical thinking personality
  8. pers_dependable - Dependable personality
  9. pers_anxious - Anxious personality
  10. pers_complex - Complex personality

VALUES (5 items):
  11. val_obey - Value of obedience
  12. val_trad - Value of tradition
  13. val_opinion - Value of expressing opinions
  14. val_performance - Value of performance
  15. val_justice - Value of justice


================================================================================
END OF DOCUMENTATION
================================================================================
Last Updated: October 1, 2025

diff --git a/.history/eohi2/README_Variable_Creation_20251001133614.txt b/.history/eohi2/README_Variable_Creation_20251001133614.txt
deleted file mode 100644
index 4029a3e..0000000
--- a/.history/eohi2/README_Variable_Creation_20251001133614.txt
+++ /dev/null
@@ -1,425 +0,0 @@
diff --git a/.history/eohi2/README_Variable_Creation_20251001133615.txt b/.history/eohi2/README_Variable_Creation_20251001133615.txt
deleted file mode 100644
index 4029a3e..0000000
--- a/.history/eohi2/README_Variable_Creation_20251001133615.txt
+++ /dev/null
@@ -1,425 +0,0 @@
diff --git a/.history/eohi2/README_Variable_Creation_20251001133634.txt b/.history/eohi2/README_Variable_Creation_20251001133634.txt
deleted file mode 100644
index 4029a3e..0000000
--- a/.history/eohi2/README_Variable_Creation_20251001133634.txt
+++ /dev/null
@@ -1,425 +0,0 @@
diff --git a/.history/eohi2/README_Variable_Creation_20251001154337.txt b/.history/eohi2/README_Variable_Creation_20251001154337.txt
deleted file mode 100644
index 81ff966..0000000
--- a/.history/eohi2/README_Variable_Creation_20251001154337.txt
+++ /dev/null
@@ -1,425 +0,0 @@
================================================================================
EOHI2 DATA PROCESSING PIPELINE - VARIABLE CREATION DOCUMENTATION
================================================================================

This README documents the complete data processing pipeline for eohi2.csv.
All processing scripts should be run in the order listed below.

Source File: eohi2.csv
Processing Scripts: dataP 01 through dataP 07
- -VARIABLES CREATED: 60 total (15 items × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefItem_1 through 01fut10ValItem_5 (60 columns) - - Set B: 02past5PrefItem_1 through 02fut10ValItem_5 (60 columns) - -TARGET VARIABLES: - Past 5 Years (15 variables): - - past_5_pref_read, past_5_pref_music, past_5_pref_TV, past_5_pref_nap, - past_5_pref_travel - - past_5_pers_extravert, past_5_pers_critical, past_5_pers_dependable, - past_5_pers_anxious, past_5_pers_complex - - past_5_val_obey, past_5_val_trad, past_5_val_opinion, - past_5_val_performance, past_5_val_justice - - Past 10 Years (15 variables): - - past_10_pref_read, past_10_pref_music, past_10_pref_TV, past_10_pref_nap, - past_10_pref_travel - - past_10_pers_extravert, past_10_pers_critical, past_10_pers_dependable, - past_10_pers_anxious, past_10_pers_complex - - past_10_val_obey, past_10_val_trad, past_10_val_opinion, - past_10_val_performance, past_10_val_justice - - Future 5 Years (15 variables): - - fut_5_pref_read, fut_5_pref_music, fut_5_pref_TV, fut_5_pref_nap, - fut_5_pref_travel - - fut_5_pers_extravert, fut_5_pers_critical, fut_5_pers_dependable, - fut_5_pers_anxious, fut_5_pers_complex - - fut_5_val_obey, fut_5_val_trad, fut_5_val_opinion, - fut_5_val_performance, fut_5_val_justice - - Future 10 Years (15 variables): - - fut_10_pref_read, fut_10_pref_music, fut_10_pref_TV, fut_10_pref_nap, - fut_10_pref_travel - - fut_10_pers_extravert, fut_10_pers_critical, fut_10_pers_dependable, - fut_10_pers_anxious, fut_10_pers_complex - - fut_10_val_obey, fut_10_val_trad, fut_10_val_opinion, - fut_10_val_performance, fut_10_val_justice - -TRANSFORMATION LOGIC: - Step 1: Combine responses from Set A (01) and Set B (02) - - If Set A has a value, use Set A - - If Set A is empty, use Set B - - Step 2: Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -ITEM DOMAINS: - - Preferences (pref): Reading, Music, TV, Nap, Travel - - Personality (pers): Extravert, Critical, Dependable, Anxious, Complex - - Values (val): Obey, Tradition, Opinion, Performance, Justice - - -================================================================================ -SCRIPT 02: dataP 02 - recode present VARS.r -================================================================================ - -PURPOSE: - Recodes present-time Likert scale text responses to numeric values. - -VARIABLES CREATED: 15 total - -SOURCE COLUMNS: - - prePrefItem_1 through prePrefItem_5 (5 columns) - - prePersItem_1 through prePersItem_5 (5 columns) - - preValItem_1 through preValItem_5 (5 columns) - -TARGET VARIABLES: - Present Time (15 variables): - - present_pref_read, present_pref_music, present_pref_tv, present_pref_nap, - present_pref_travel - - present_pers_extravert, present_pers_critical, present_pers_dependable, - present_pers_anxious, present_pers_complex - - present_val_obey, present_val_trad, present_val_opinion, - present_val_performance, present_val_justice - -TRANSFORMATION LOGIC: - Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -SPECIAL NOTE: - Present time uses "present_pref_tv" (lowercase) while past/future use - "past_5_pref_TV" (uppercase). 
This is intentional and preserved from the - original data structure. - - -================================================================================ -SCRIPT 03: dataP 03 - recode DGEN vars.r -================================================================================ - -PURPOSE: - Combines DGEN (domain general) responses from two survey versions (01 and 02). - These are single-item measures for each domain/time combination. - NO RECODING - just copies numeric values as-is. - -VARIABLES CREATED: 12 total (3 domains × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefDGEN_1, 01past5PersDGEN_1, 01past5ValDGEN_1, etc. - - Set B: 02past5PrefDGEN_1, 02past5PersDGEN_1, 02past5ValDGEN_1, etc. - -TARGET VARIABLES: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TRANSFORMATION LOGIC: - - If Set A (01) has a value, use Set A - - If Set A is empty, use Set B (02) - - NO RECODING: Values are copied directly as numeric - -SPECIAL NOTES: - - Future columns in raw data use "_8" suffix for Pref/Pers items - - Future Val columns use "ValuesDGEN" spelling in Set A, "ValDGEN" in Set B - - -================================================================================ -SCRIPT 04: dataP 04 - DGEN means.r -================================================================================ - -PURPOSE: - Calculates mean DGEN scores by averaging the three domain scores (Preferences, - Personality, Values) for each time period. - -VARIABLES CREATED: 4 total (1 per time period) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - - DGEN_past_5_mean - - DGEN_past_10_mean - - DGEN_fut_5_mean - - DGEN_fut_10_mean - -TRANSFORMATION LOGIC: - Each mean = (Pref + Pers + Val) / 3 - - NA values are excluded from calculation (na.rm = TRUE) - - -================================================================================ -SCRIPT 05: dataP 05 - recode scales VARS.r -================================================================================ - -PURPOSE: - Processes two cognitive scales: - 1. AOT (Actively Open-minded Thinking): 8-item scale with reverse coding - 2. CRT (Cognitive Reflection Test): 3-item test with correct/intuitive scoring - -VARIABLES CREATED: 3 total - -SOURCE COLUMNS: - AOT Scale: - - aot_1, aot_2, aot_3, aot_4, aot_5, aot_6, aot_7, aot_8 - - CRT Test: - - crt_1, crt_2, crt_3 - -TARGET VARIABLES: - - aot_total (mean of 8 items with reverse coding) - - crt_correct (proportion of correct answers) - - crt_int (proportion of intuitive/incorrect answers) - -TRANSFORMATION LOGIC: - - AOT Scale (aot_total): - 1. Items 4, 5, 6, 7 are reverse coded by multiplying by -1 - 2. Calculate mean of all 8 items (with reverse coding applied) - 3. Original source values are NOT modified in the dataframe - 4. 
NA values excluded from calculation (na.rm = TRUE) - - CRT Correct (crt_correct): - Correct answers: - - crt_1: "5 cents" - - crt_2: "5 minutes" - - crt_3: "47 days" - Calculation: (Number of correct answers) / (Number of non-missing answers) - - CRT Intuitive (crt_int): - Intuitive (common incorrect) answers: - - crt_1: "10 cents" - - crt_2: "100 minutes" - - crt_3: "24 days" - Calculation: (Number of intuitive answers) / (Number of non-missing answers) - -SPECIAL NOTES: - - CRT scoring is case-insensitive and trims whitespace - - Both CRT scores are proportions (0.00 to 1.00) - - Empty/missing CRT responses are excluded from denominator - - -================================================================================ -SCRIPT 06: dataP 06 - time interval differences.r -================================================================================ - -PURPOSE: - Calculates absolute differences between time intervals to measure perceived - change across time periods for all 15 items. - -VARIABLES CREATED: 90 total (6 difference types × 15 items) - -SOURCE COLUMNS: - - present_pref_read through present_val_justice (15 columns) - - past_5_pref_read through past_5_val_justice (15 columns) - - past_10_pref_read through past_10_val_justice (15 columns) - - fut_5_pref_read through fut_5_val_justice (15 columns) - - fut_10_pref_read through fut_10_val_justice (15 columns) - -TARGET VARIABLES (by difference type): - - NPast_5 (Present vs Past 5 years) - 15 variables: - Formula: |present - past_5| - - NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, NPast_5_pref_nap, - NPast_5_pref_travel - - NPast_5_pers_extravert, NPast_5_pers_critical, NPast_5_pers_dependable, - NPast_5_pers_anxious, NPast_5_pers_complex - - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice - - NPast_10 (Present vs Past 10 years) - 15 variables: - Formula: |present - past_10| - - NPast_10_pref_read, NPast_10_pref_music, NPast_10_pref_TV, - NPast_10_pref_nap, NPast_10_pref_travel - - NPast_10_pers_extravert, NPast_10_pers_critical, NPast_10_pers_dependable, - NPast_10_pers_anxious, NPast_10_pers_complex - - NPast_10_val_obey, NPast_10_val_trad, NPast_10_val_opinion, - NPast_10_val_performance, NPast_10_val_justice - - NFut_5 (Present vs Future 5 years) - 15 variables: - Formula: |present - fut_5| - - NFut_5_pref_read, NFut_5_pref_music, NFut_5_pref_TV, NFut_5_pref_nap, - NFut_5_pref_travel - - NFut_5_pers_extravert, NFut_5_pers_critical, NFut_5_pers_dependable, - NFut_5_pers_anxious, NFut_5_pers_complex - - NFut_5_val_obey, NFut_5_val_trad, NFut_5_val_opinion, - NFut_5_val_performance, NFut_5_val_justice - - NFut_10 (Present vs Future 10 years) - 15 variables: - Formula: |present - fut_10| - - NFut_10_pref_read, NFut_10_pref_music, NFut_10_pref_TV, NFut_10_pref_nap, - NFut_10_pref_travel - - NFut_10_pers_extravert, NFut_10_pers_critical, NFut_10_pers_dependable, - NFut_10_pers_anxious, NFut_10_pers_complex - - NFut_10_val_obey, NFut_10_val_trad, NFut_10_val_opinion, - NFut_10_val_performance, NFut_10_val_justice - - 5.10past (Past 5 vs Past 10 years) - 15 variables: - Formula: |past_5 - past_10| - - 5.10past_pref_read, 5.10past_pref_music, 5.10past_pref_TV, - 5.10past_pref_nap, 5.10past_pref_travel - - 5.10past_pers_extravert, 5.10past_pers_critical, 5.10past_pers_dependable, - 5.10past_pers_anxious, 5.10past_pers_complex - - 5.10past_val_obey, 5.10past_val_trad, 5.10past_val_opinion, - 5.10past_val_performance, 5.10past_val_justice - - 5.10fut (Future 5 vs Future 
10 years) - 15 variables: - Formula: |fut_5 - fut_10| - - 5.10fut_pref_read, 5.10fut_pref_music, 5.10fut_pref_TV, 5.10fut_pref_nap, - 5.10fut_pref_travel - - 5.10fut_pers_extravert, 5.10fut_pers_critical, 5.10fut_pers_dependable, - 5.10fut_pers_anxious, 5.10fut_pers_complex - - 5.10fut_val_obey, 5.10fut_val_trad, 5.10fut_val_opinion, - 5.10fut_val_performance, 5.10fut_val_justice - -TRANSFORMATION LOGIC: - All calculations use absolute differences: - - NPast_5: |present_[item] - past_5_[item]| - - NPast_10: |present_[item] - past_10_[item]| - - NFut_5: |present_[item] - fut_5_[item]| - - NFut_10: |present_[item] - fut_10_[item]| - - 5.10past: |past_5_[item] - past_10_[item]| - - 5.10fut: |fut_5_[item] - fut_10_[item]| - - Result: Always positive values representing magnitude of change - Missing values in either source column result in NA - -SPECIAL NOTES: - - Present time uses "pref_tv" (lowercase) while past/future use "pref_TV" - (uppercase), so script handles this naming inconsistency - - All values are absolute differences (non-negative) - - -================================================================================ -SUMMARY OF ALL CREATED VARIABLES -================================================================================ - -Total Variables Created: 184 - -By Script: - - Script 01: 60 variables (past/future recoded items) - - Script 02: 15 variables (present recoded items) - - Script 03: 12 variables (DGEN domain scores) - - Script 04: 4 variables (DGEN means) - - Script 05: 3 variables (AOT & CRT scales) - - Script 06: 90 variables (time interval differences) - -By Category: - - Time Period Items (75 total): - * Present: 15 items - * Past 5: 15 items - * Past 10: 15 items - * Future 5: 15 items - * Future 10: 15 items - - - DGEN Variables (16 total): - * Domain scores: 12 (3 domains × 4 time periods) - * Mean scores: 4 (1 per time period) - - - Cognitive Scales (3 total): - * AOT total - * CRT correct - * CRT intuitive - - - Time Differences (90 total): - * NPast_5: 15 differences - * NPast_10: 15 differences - * NFut_5: 15 differences - * NFut_10: 15 differences - * 5.10past: 15 differences - * 5.10fut: 15 differences - - -================================================================================ -DATA PROCESSING NOTES -================================================================================ - -1. PROCESSING ORDER: - Scripts MUST be run in numerical order (01 → 06) as later scripts depend - on variables created by earlier scripts. - -2. SURVEY VERSION HANDLING: - - Two survey versions (01 and 02) were used - - Scripts 01 and 03 combine these versions - - Preference given to version 01 when both exist - -3. MISSING DATA: - - Empty cells and NA values are preserved throughout processing - - Calculations use na.rm=TRUE to exclude missing values from means - - Difference calculations result in NA if either source value is missing - -4. QUALITY ASSURANCE: - - Each script includes QA checks with random row verification - - Manual calculation checks confirm proper transformations - - Column existence checks prevent errors from missing source data - -5. FILE SAVING: - - Most scripts save directly to eohi2.csv - - Scripts 04 and 06 have commented-out write commands for review - - Each script overwrites existing target columns if present - -6. 
SPECIAL NAMING CONVENTIONS: - - "pref_tv" vs "pref_TV" inconsistency maintained from source data - - DGEN variables use underscores (DGEN_past_5_Pref) - - Difference variables use descriptive prefixes (NPast_5_, 5.10past_) - - -================================================================================ -ITEM REFERENCE GUIDE -================================================================================ - -15 Core Items (Used across all time periods): - -PREFERENCES (5 items): - 1. pref_read - Reading preferences - 2. pref_music - Music preferences - 3. pref_TV/tv - TV watching preferences (note case variation) - 4. pref_nap - Napping preferences - 5. pref_travel - Travel preferences - -PERSONALITY (5 items): - 6. pers_extravert - Extraverted personality - 7. pers_critical - Critical thinking personality - 8. pers_dependable - Dependable personality - 9. pers_anxious - Anxious personality - 10. pers_complex - Complex personality - -VALUES (5 items): - 11. val_obey - Value of obedience - 12. val_trad - Value of tradition - 13. val_opinion - Value of expressing opinions - 14. val_performance - Value of performance - 15. val_justice - Value of justice - - -================================================================================ -END OF DOCUMENTATION -================================================================================ -Last Updated: October 1, 2025 - diff --git a/.history/eohi2/README_Variable_Creation_20251001154405.txt b/.history/eohi2/README_Variable_Creation_20251001154405.txt deleted file mode 100644 index 097b4d1..0000000 --- a/.history/eohi2/README_Variable_Creation_20251001154405.txt +++ /dev/null @@ -1,503 +0,0 @@ -================================================================================ -EOHI2 DATA PROCESSING PIPELINE - VARIABLE CREATION DOCUMENTATION -================================================================================ - -This README documents the complete data processing pipeline for eohi2.csv. -All processing scripts should be run in the order listed below. - -Source File: eohi2.csv -Processing Scripts: dataP 01 through dataP 07 - -================================================================================ -SCRIPT 01: dataP 01 - recode and combine past & future vars.r -================================================================================ - -PURPOSE: - Combines responses from two survey versions (01 and 02) and recodes Likert - scale text responses to numeric values for past and future time periods. 
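Since this step reads 120 source columns across two survey versions, a defensive existence
check (the kind of check noted under QUALITY ASSURANCE later in this README) is worth running
first. A minimal sketch, assuming the _1 through _5 numbering implied by the SOURCE COLUMNS
list below:

    # Stop early if any expected source column is absent; this sketch
    # covers only the first preference block of each survey version.
    eohi2 <- read.csv("eohi2.csv", check.names = FALSE)
    expected <- c(paste0("01past5PrefItem_", 1:5),
                  paste0("02past5PrefItem_", 1:5))
    missing_cols <- setdiff(expected, names(eohi2))
    if (length(missing_cols) > 0) {
      stop("Missing source columns: ", paste(missing_cols, collapse = ", "))
    }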
- -VARIABLES CREATED: 60 total (15 items × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefItem_1 through 01fut10ValItem_5 (60 columns) - - Set B: 02past5PrefItem_1 through 02fut10ValItem_5 (60 columns) - -TARGET VARIABLES: - Past 5 Years (15 variables): - - past_5_pref_read, past_5_pref_music, past_5_pref_TV, past_5_pref_nap, - past_5_pref_travel - - past_5_pers_extravert, past_5_pers_critical, past_5_pers_dependable, - past_5_pers_anxious, past_5_pers_complex - - past_5_val_obey, past_5_val_trad, past_5_val_opinion, - past_5_val_performance, past_5_val_justice - - Past 10 Years (15 variables): - - past_10_pref_read, past_10_pref_music, past_10_pref_TV, past_10_pref_nap, - past_10_pref_travel - - past_10_pers_extravert, past_10_pers_critical, past_10_pers_dependable, - past_10_pers_anxious, past_10_pers_complex - - past_10_val_obey, past_10_val_trad, past_10_val_opinion, - past_10_val_performance, past_10_val_justice - - Future 5 Years (15 variables): - - fut_5_pref_read, fut_5_pref_music, fut_5_pref_TV, fut_5_pref_nap, - fut_5_pref_travel - - fut_5_pers_extravert, fut_5_pers_critical, fut_5_pers_dependable, - fut_5_pers_anxious, fut_5_pers_complex - - fut_5_val_obey, fut_5_val_trad, fut_5_val_opinion, - fut_5_val_performance, fut_5_val_justice - - Future 10 Years (15 variables): - - fut_10_pref_read, fut_10_pref_music, fut_10_pref_TV, fut_10_pref_nap, - fut_10_pref_travel - - fut_10_pers_extravert, fut_10_pers_critical, fut_10_pers_dependable, - fut_10_pers_anxious, fut_10_pers_complex - - fut_10_val_obey, fut_10_val_trad, fut_10_val_opinion, - fut_10_val_performance, fut_10_val_justice - -TRANSFORMATION LOGIC: - Step 1: Combine responses from Set A (01) and Set B (02) - - If Set A has a value, use Set A - - If Set A is empty, use Set B - - Step 2: Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -ITEM DOMAINS: - - Preferences (pref): Reading, Music, TV, Nap, Travel - - Personality (pers): Extravert, Critical, Dependable, Anxious, Complex - - Values (val): Obey, Tradition, Opinion, Performance, Justice - - -================================================================================ -SCRIPT 02: dataP 02 - recode present VARS.r -================================================================================ - -PURPOSE: - Recodes present-time Likert scale text responses to numeric values. - -VARIABLES CREATED: 15 total - -SOURCE COLUMNS: - - prePrefItem_1 through prePrefItem_5 (5 columns) - - prePersItem_1 through prePersItem_5 (5 columns) - - preValItem_1 through preValItem_5 (5 columns) - -TARGET VARIABLES: - Present Time (15 variables): - - present_pref_read, present_pref_music, present_pref_tv, present_pref_nap, - present_pref_travel - - present_pers_extravert, present_pers_critical, present_pers_dependable, - present_pers_anxious, present_pers_complex - - present_val_obey, present_val_trad, present_val_opinion, - present_val_performance, present_val_justice - -TRANSFORMATION LOGIC: - Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -SPECIAL NOTE: - Present time uses "present_pref_tv" (lowercase) while past/future use - "past_5_pref_TV" (uppercase). 
This is intentional and preserved from the - original data structure. - - -================================================================================ -SCRIPT 03: dataP 03 - recode DGEN vars.r -================================================================================ - -PURPOSE: - Combines DGEN (domain general) responses from two survey versions (01 and 02). - These are single-item measures for each domain/time combination. - NO RECODING - just copies numeric values as-is. - -VARIABLES CREATED: 12 total (3 domains × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefDGEN_1, 01past5PersDGEN_1, 01past5ValDGEN_1, etc. - - Set B: 02past5PrefDGEN_1, 02past5PersDGEN_1, 02past5ValDGEN_1, etc. - -TARGET VARIABLES: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TRANSFORMATION LOGIC: - - If Set A (01) has a value, use Set A - - If Set A is empty, use Set B (02) - - NO RECODING: Values are copied directly as numeric - -SPECIAL NOTES: - - Future columns in raw data use "_8" suffix for Pref/Pers items - - Future Val columns use "ValuesDGEN" spelling in Set A, "ValDGEN" in Set B - - -================================================================================ -SCRIPT 04: dataP 04 - DGEN means.r -================================================================================ - -PURPOSE: - Calculates mean DGEN scores by averaging the three domain scores (Preferences, - Personality, Values) for each time period. - -VARIABLES CREATED: 4 total (1 per time period) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - - DGEN_past_5_mean - - DGEN_past_10_mean - - DGEN_fut_5_mean - - DGEN_fut_10_mean - -TRANSFORMATION LOGIC: - Each mean = (Pref + Pers + Val) / 3 - - NA values are excluded from calculation (na.rm = TRUE) - - -================================================================================ -SCRIPT 05: dataP 05 - recode scales VARS.r -================================================================================ - -PURPOSE: - Processes two cognitive scales: - 1. AOT (Actively Open-minded Thinking): 8-item scale with reverse coding - 2. CRT (Cognitive Reflection Test): 3-item test with correct/intuitive scoring - -VARIABLES CREATED: 3 total - -SOURCE COLUMNS: - AOT Scale: - - aot_1, aot_2, aot_3, aot_4, aot_5, aot_6, aot_7, aot_8 - - CRT Test: - - crt_1, crt_2, crt_3 - -TARGET VARIABLES: - - aot_total (mean of 8 items with reverse coding) - - crt_correct (proportion of correct answers) - - crt_int (proportion of intuitive/incorrect answers) - -TRANSFORMATION LOGIC: - - AOT Scale (aot_total): - 1. Items 4, 5, 6, 7 are reverse coded by multiplying by -1 - 2. Calculate mean of all 8 items (with reverse coding applied) - 3. Original source values are NOT modified in the dataframe - 4. 
NA values excluded from calculation (na.rm = TRUE) - - CRT Correct (crt_correct): - Correct answers: - - crt_1: "5 cents" - - crt_2: "5 minutes" - - crt_3: "47 days" - Calculation: (Number of correct answers) / (Number of non-missing answers) - - CRT Intuitive (crt_int): - Intuitive (common incorrect) answers: - - crt_1: "10 cents" - - crt_2: "100 minutes" - - crt_3: "24 days" - Calculation: (Number of intuitive answers) / (Number of non-missing answers) - -SPECIAL NOTES: - - CRT scoring is case-insensitive and trims whitespace - - Both CRT scores are proportions (0.00 to 1.00) - - Empty/missing CRT responses are excluded from denominator - - -================================================================================ -SCRIPT 06: dataP 06 - time interval differences.r -================================================================================ - -PURPOSE: - Calculates absolute differences between time intervals to measure perceived - change across time periods for all 15 items. - -VARIABLES CREATED: 90 total (6 difference types × 15 items) - -SOURCE COLUMNS: - - present_pref_read through present_val_justice (15 columns) - - past_5_pref_read through past_5_val_justice (15 columns) - - past_10_pref_read through past_10_val_justice (15 columns) - - fut_5_pref_read through fut_5_val_justice (15 columns) - - fut_10_pref_read through fut_10_val_justice (15 columns) - -TARGET VARIABLES (by difference type): - - NPast_5 (Present vs Past 5 years) - 15 variables: - Formula: |present - past_5| - - NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, NPast_5_pref_nap, - NPast_5_pref_travel - - NPast_5_pers_extravert, NPast_5_pers_critical, NPast_5_pers_dependable, - NPast_5_pers_anxious, NPast_5_pers_complex - - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice - - NPast_10 (Present vs Past 10 years) - 15 variables: - Formula: |present - past_10| - - NPast_10_pref_read, NPast_10_pref_music, NPast_10_pref_TV, - NPast_10_pref_nap, NPast_10_pref_travel - - NPast_10_pers_extravert, NPast_10_pers_critical, NPast_10_pers_dependable, - NPast_10_pers_anxious, NPast_10_pers_complex - - NPast_10_val_obey, NPast_10_val_trad, NPast_10_val_opinion, - NPast_10_val_performance, NPast_10_val_justice - - NFut_5 (Present vs Future 5 years) - 15 variables: - Formula: |present - fut_5| - - NFut_5_pref_read, NFut_5_pref_music, NFut_5_pref_TV, NFut_5_pref_nap, - NFut_5_pref_travel - - NFut_5_pers_extravert, NFut_5_pers_critical, NFut_5_pers_dependable, - NFut_5_pers_anxious, NFut_5_pers_complex - - NFut_5_val_obey, NFut_5_val_trad, NFut_5_val_opinion, - NFut_5_val_performance, NFut_5_val_justice - - NFut_10 (Present vs Future 10 years) - 15 variables: - Formula: |present - fut_10| - - NFut_10_pref_read, NFut_10_pref_music, NFut_10_pref_TV, NFut_10_pref_nap, - NFut_10_pref_travel - - NFut_10_pers_extravert, NFut_10_pers_critical, NFut_10_pers_dependable, - NFut_10_pers_anxious, NFut_10_pers_complex - - NFut_10_val_obey, NFut_10_val_trad, NFut_10_val_opinion, - NFut_10_val_performance, NFut_10_val_justice - - 5.10past (Past 5 vs Past 10 years) - 15 variables: - Formula: |past_5 - past_10| - - 5.10past_pref_read, 5.10past_pref_music, 5.10past_pref_TV, - 5.10past_pref_nap, 5.10past_pref_travel - - 5.10past_pers_extravert, 5.10past_pers_critical, 5.10past_pers_dependable, - 5.10past_pers_anxious, 5.10past_pers_complex - - 5.10past_val_obey, 5.10past_val_trad, 5.10past_val_opinion, - 5.10past_val_performance, 5.10past_val_justice - - 5.10fut (Future 5 vs Future 
10 years) - 15 variables: - Formula: |fut_5 - fut_10| - - 5.10fut_pref_read, 5.10fut_pref_music, 5.10fut_pref_TV, 5.10fut_pref_nap, - 5.10fut_pref_travel - - 5.10fut_pers_extravert, 5.10fut_pers_critical, 5.10fut_pers_dependable, - 5.10fut_pers_anxious, 5.10fut_pers_complex - - 5.10fut_val_obey, 5.10fut_val_trad, 5.10fut_val_opinion, - 5.10fut_val_performance, 5.10fut_val_justice - -TRANSFORMATION LOGIC: - All calculations use absolute differences: - - NPast_5: |present_[item] - past_5_[item]| - - NPast_10: |present_[item] - past_10_[item]| - - NFut_5: |present_[item] - fut_5_[item]| - - NFut_10: |present_[item] - fut_10_[item]| - - 5.10past: |past_5_[item] - past_10_[item]| - - 5.10fut: |fut_5_[item] - fut_10_[item]| - - Result: Always positive values representing magnitude of change - Missing values in either source column result in NA - -SPECIAL NOTES: - - Present time uses "pref_tv" (lowercase) while past/future use "pref_TV" - (uppercase), so script handles this naming inconsistency - - All values are absolute differences (non-negative) - - -================================================================================ -SCRIPT 07: dataP 07 - domain means.r -================================================================================ - -PURPOSE: - Calculates domain-level means by averaging the 5 items within each domain - (Preferences, Personality, Values) for each of the 6 time interval difference - types. - -VARIABLES CREATED: 18 total (6 time intervals × 3 domains) - -SOURCE COLUMNS: - - NPast_5_pref_read through NPast_5_val_justice (15 columns) - - NPast_10_pref_read through NPast_10_val_justice (15 columns) - - NFut_5_pref_read through NFut_5_val_justice (15 columns) - - NFut_10_pref_read through NFut_10_val_justice (15 columns) - - 5.10past_pref_read through 5.10past_val_justice (15 columns) - - 5.10fut_pref_read through 5.10fut_val_justice (15 columns) - Total: 90 difference columns (created in Script 06) - -TARGET VARIABLES: - NPast_5 Domain Means (3 variables): - - NPast_5_pref_MEAN (mean of 5 preference items) - - NPast_5_pers_MEAN (mean of 5 personality items) - - NPast_5_val_MEAN (mean of 5 values items) - - NPast_10 Domain Means (3 variables): - - NPast_10_pref_MEAN - - NPast_10_pers_MEAN - - NPast_10_val_MEAN - - NFut_5 Domain Means (3 variables): - - NFut_5_pref_MEAN - - NFut_5_pers_MEAN - - NFut_5_val_MEAN - - NFut_10 Domain Means (3 variables): - - NFut_10_pref_MEAN - - NFut_10_pers_MEAN - - NFut_10_val_MEAN - - 5.10past Domain Means (3 variables): - - 5.10past_pref_MEAN - - 5.10past_pers_MEAN - - 5.10past_val_MEAN - - 5.10fut Domain Means (3 variables): - - 5.10fut_pref_MEAN - - 5.10fut_pers_MEAN - - 5.10fut_val_MEAN - -TRANSFORMATION LOGIC: - Each domain mean = average of 5 items within that domain - - Example for NPast_5_pref_MEAN: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel) - - Example for NFut_10_pers_MEAN: - = mean(NFut_10_pers_extravert, NFut_10_pers_critical, - NFut_10_pers_dependable, NFut_10_pers_anxious, - NFut_10_pers_complex) - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF DOMAIN MEANS: - - Provides higher-level summary of perceived change by domain - - Reduces item-level noise by aggregating across related items - - Enables domain-level comparisons across time intervals - - Parallel to Script 04 (DGEN means) but for difference scores instead of - raw DGEN ratings - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - Creates domain-level aggregates 
of absolute difference scores - - All means are averages of non-negative values (absolute differences) - - -================================================================================ -SUMMARY OF ALL CREATED VARIABLES -================================================================================ - -Total Variables Created: 202 - -By Script: - - Script 01: 60 variables (past/future recoded items) - - Script 02: 15 variables (present recoded items) - - Script 03: 12 variables (DGEN domain scores) - - Script 04: 4 variables (DGEN means) - - Script 05: 3 variables (AOT & CRT scales) - - Script 06: 90 variables (time interval differences) - - Script 07: 18 variables (domain means for differences) - -By Category: - - Time Period Items (75 total): - * Present: 15 items - * Past 5: 15 items - * Past 10: 15 items - * Future 5: 15 items - * Future 10: 15 items - - - DGEN Variables (16 total): - * Domain scores: 12 (3 domains × 4 time periods) - * Mean scores: 4 (1 per time period) - - - Cognitive Scales (3 total): - * AOT total - * CRT correct - * CRT intuitive - - - Time Differences (90 total): - * NPast_5: 15 differences - * NPast_10: 15 differences - * NFut_5: 15 differences - * NFut_10: 15 differences - * 5.10past: 15 differences - * 5.10fut: 15 differences - - - Difference Domain Means (18 total): - * 3 domain means × 6 time intervals - - -================================================================================ -DATA PROCESSING NOTES -================================================================================ - -1. PROCESSING ORDER: - Scripts MUST be run in numerical order (01 → 07) as later scripts depend - on variables created by earlier scripts. - -2. SURVEY VERSION HANDLING: - - Two survey versions (01 and 02) were used - - Scripts 01 and 03 combine these versions - - Preference given to version 01 when both exist - -3. MISSING DATA: - - Empty cells and NA values are preserved throughout processing - - Calculations use na.rm=TRUE to exclude missing values from means - - Difference calculations result in NA if either source value is missing - -4. QUALITY ASSURANCE: - - Each script includes QA checks with random row verification - - Manual calculation checks confirm proper transformations - - Column existence checks prevent errors from missing source data - -5. FILE SAVING: - - Most scripts save directly to eohi2.csv - - Scripts 04 and 06 have commented-out write commands for review - - Each script overwrites existing target columns if present - -6. SPECIAL NAMING CONVENTIONS: - - "pref_tv" vs "pref_TV" inconsistency maintained from source data - - DGEN variables use underscores (DGEN_past_5_Pref) - - Difference variables use descriptive prefixes (NPast_5_, 5.10past_) - - -================================================================================ -ITEM REFERENCE GUIDE -================================================================================ - -15 Core Items (Used across all time periods): - -PREFERENCES (5 items): - 1. pref_read - Reading preferences - 2. pref_music - Music preferences - 3. pref_TV/tv - TV watching preferences (note case variation) - 4. pref_nap - Napping preferences - 5. pref_travel - Travel preferences - -PERSONALITY (5 items): - 6. pers_extravert - Extraverted personality - 7. pers_critical - Critical thinking personality - 8. pers_dependable - Dependable personality - 9. pers_anxious - Anxious personality - 10. pers_complex - Complex personality - -VALUES (5 items): - 11. val_obey - Value of obedience - 12. val_trad - Value of tradition - 13. val_opinion - Value of expressing opinions - 14. val_performance - Value of performance - 15. 
val_justice - Value of justice - - -================================================================================ -END OF DOCUMENTATION -================================================================================ -Last Updated: October 1, 2025 - diff --git a/.history/eohi2/README_Variable_Creation_20251001154412.txt b/.history/eohi2/README_Variable_Creation_20251001154412.txt deleted file mode 100644 index 6499977..0000000 --- a/.history/eohi2/README_Variable_Creation_20251001154412.txt +++ /dev/null @@ -1,504 +0,0 @@ -================================================================================ -EOHI2 DATA PROCESSING PIPELINE - VARIABLE CREATION DOCUMENTATION -================================================================================ - -This README documents the complete data processing pipeline for eohi2.csv. -All processing scripts should be run in the order listed below. - -Source File: eohi2.csv -Processing Scripts: dataP 01 through dataP 07 - -================================================================================ -SCRIPT 01: dataP 01 - recode and combine past & future vars.r -================================================================================ - -PURPOSE: - Combines responses from two survey versions (01 and 02) and recodes Likert - scale text responses to numeric values for past and future time periods. - -VARIABLES CREATED: 60 total (15 items × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefItem_1 through 01fut10ValItem_5 (60 columns) - - Set B: 02past5PrefItem_1 through 02fut10ValItem_5 (60 columns) - -TARGET VARIABLES: - Past 5 Years (15 variables): - - past_5_pref_read, past_5_pref_music, past_5_pref_TV, past_5_pref_nap, - past_5_pref_travel - - past_5_pers_extravert, past_5_pers_critical, past_5_pers_dependable, - past_5_pers_anxious, past_5_pers_complex - - past_5_val_obey, past_5_val_trad, past_5_val_opinion, - past_5_val_performance, past_5_val_justice - - Past 10 Years (15 variables): - - past_10_pref_read, past_10_pref_music, past_10_pref_TV, past_10_pref_nap, - past_10_pref_travel - - past_10_pers_extravert, past_10_pers_critical, past_10_pers_dependable, - past_10_pers_anxious, past_10_pers_complex - - past_10_val_obey, past_10_val_trad, past_10_val_opinion, - past_10_val_performance, past_10_val_justice - - Future 5 Years (15 variables): - - fut_5_pref_read, fut_5_pref_music, fut_5_pref_TV, fut_5_pref_nap, - fut_5_pref_travel - - fut_5_pers_extravert, fut_5_pers_critical, fut_5_pers_dependable, - fut_5_pers_anxious, fut_5_pers_complex - - fut_5_val_obey, fut_5_val_trad, fut_5_val_opinion, - fut_5_val_performance, fut_5_val_justice - - Future 10 Years (15 variables): - - fut_10_pref_read, fut_10_pref_music, fut_10_pref_TV, fut_10_pref_nap, - fut_10_pref_travel - - fut_10_pers_extravert, fut_10_pers_critical, fut_10_pers_dependable, - fut_10_pers_anxious, fut_10_pers_complex - - fut_10_val_obey, fut_10_val_trad, fut_10_val_opinion, - fut_10_val_performance, fut_10_val_justice - -TRANSFORMATION LOGIC: - Step 1: Combine responses from Set A (01) and Set B (02) - - If Set A has a value, use Set A - - If Set A is empty, use Set B - - Step 2: Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -ITEM DOMAINS: - - Preferences (pref): Reading, Music, TV, Nap, Travel - - Personality (pers): Extravert, Critical, Dependable, Anxious, Complex - - Values (val): 
Obey, Tradition, Opinion, Performance, Justice - - -================================================================================ -SCRIPT 02: dataP 02 - recode present VARS.r -================================================================================ - -PURPOSE: - Recodes present-time Likert scale text responses to numeric values. - -VARIABLES CREATED: 15 total - -SOURCE COLUMNS: - - prePrefItem_1 through prePrefItem_5 (5 columns) - - prePersItem_1 through prePersItem_5 (5 columns) - - preValItem_1 through preValItem_5 (5 columns) - -TARGET VARIABLES: - Present Time (15 variables): - - present_pref_read, present_pref_music, present_pref_tv, present_pref_nap, - present_pref_travel - - present_pers_extravert, present_pers_critical, present_pers_dependable, - present_pers_anxious, present_pers_complex - - present_val_obey, present_val_trad, present_val_opinion, - present_val_performance, present_val_justice - -TRANSFORMATION LOGIC: - Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -SPECIAL NOTE: - Present time uses "present_pref_tv" (lowercase) while past/future use - "past_5_pref_TV" (uppercase). This is intentional and preserved from the - original data structure. - - -================================================================================ -SCRIPT 03: dataP 03 - recode DGEN vars.r -================================================================================ - -PURPOSE: - Combines DGEN (domain general) responses from two survey versions (01 and 02). - These are single-item measures for each domain/time combination. - NO RECODING - just copies numeric values as-is. - -VARIABLES CREATED: 12 total (3 domains × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefDGEN_1, 01past5PersDGEN_1, 01past5ValDGEN_1, etc. - - Set B: 02past5PrefDGEN_1, 02past5PersDGEN_1, 02past5ValDGEN_1, etc. - -TARGET VARIABLES: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TRANSFORMATION LOGIC: - - If Set A (01) has a value, use Set A - - If Set A is empty, use Set B (02) - - NO RECODING: Values are copied directly as numeric - -SPECIAL NOTES: - - Future columns in raw data use "_8" suffix for Pref/Pers items - - Future Val columns use "ValuesDGEN" spelling in Set A, "ValDGEN" in Set B - - -================================================================================ -SCRIPT 04: dataP 04 - DGEN means.r -================================================================================ - -PURPOSE: - Calculates mean DGEN scores by averaging the three domain scores (Preferences, - Personality, Values) for each time period. 
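A minimal sketch of one of these means (names from the lists below; the actual script may
compute all four in one pass):

    # Average the three domain scores for one time period; na.rm = TRUE
    # matches the NA handling described below. Note that a row missing all
    # three domains yields NaN rather than NA under base R rowMeans.
    eohi2 <- read.csv("eohi2.csv", check.names = FALSE)
    dgen_past_5 <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val")
    eohi2$DGEN_past_5_mean <- rowMeans(eohi2[dgen_past_5], na.rm = TRUE)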
- -VARIABLES CREATED: 4 total (1 per time period) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - - DGEN_past_5_mean - - DGEN_past_10_mean - - DGEN_fut_5_mean - - DGEN_fut_10_mean - -TRANSFORMATION LOGIC: - Each mean = (Pref + Pers + Val) / 3 - - NA values are excluded from calculation (na.rm = TRUE) - - -================================================================================ -SCRIPT 05: dataP 05 - recode scales VARS.r -================================================================================ - -PURPOSE: - Processes two cognitive scales: - 1. AOT (Actively Open-minded Thinking): 8-item scale with reverse coding - 2. CRT (Cognitive Reflection Test): 3-item test with correct/intuitive scoring - -VARIABLES CREATED: 3 total - -SOURCE COLUMNS: - AOT Scale: - - aot_1, aot_2, aot_3, aot_4, aot_5, aot_6, aot_7, aot_8 - - CRT Test: - - crt_1, crt_2, crt_3 - -TARGET VARIABLES: - - aot_total (mean of 8 items with reverse coding) - - crt_correct (proportion of correct answers) - - crt_int (proportion of intuitive/incorrect answers) - -TRANSFORMATION LOGIC: - - AOT Scale (aot_total): - 1. Items 4, 5, 6, 7 are reverse coded by multiplying by -1 - 2. Calculate mean of all 8 items (with reverse coding applied) - 3. Original source values are NOT modified in the dataframe - 4. NA values excluded from calculation (na.rm = TRUE) - - CRT Correct (crt_correct): - Correct answers: - - crt_1: "5 cents" - - crt_2: "5 minutes" - - crt_3: "47 days" - Calculation: (Number of correct answers) / (Number of non-missing answers) - - CRT Intuitive (crt_int): - Intuitive (common incorrect) answers: - - crt_1: "10 cents" - - crt_2: "100 minutes" - - crt_3: "24 days" - Calculation: (Number of intuitive answers) / (Number of non-missing answers) - -SPECIAL NOTES: - - CRT scoring is case-insensitive and trims whitespace - - Both CRT scores are proportions (0.00 to 1.00) - - Empty/missing CRT responses are excluded from denominator - - -================================================================================ -SCRIPT 06: dataP 06 - time interval differences.r -================================================================================ - -PURPOSE: - Calculates absolute differences between time intervals to measure perceived - change across time periods for all 15 items. 
- -VARIABLES CREATED: 90 total (6 difference types × 15 items) - -SOURCE COLUMNS: - - present_pref_read through present_val_justice (15 columns) - - past_5_pref_read through past_5_val_justice (15 columns) - - past_10_pref_read through past_10_val_justice (15 columns) - - fut_5_pref_read through fut_5_val_justice (15 columns) - - fut_10_pref_read through fut_10_val_justice (15 columns) - -TARGET VARIABLES (by difference type): - - NPast_5 (Present vs Past 5 years) - 15 variables: - Formula: |present - past_5| - - NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, NPast_5_pref_nap, - NPast_5_pref_travel - - NPast_5_pers_extravert, NPast_5_pers_critical, NPast_5_pers_dependable, - NPast_5_pers_anxious, NPast_5_pers_complex - - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice - - NPast_10 (Present vs Past 10 years) - 15 variables: - Formula: |present - past_10| - - NPast_10_pref_read, NPast_10_pref_music, NPast_10_pref_TV, - NPast_10_pref_nap, NPast_10_pref_travel - - NPast_10_pers_extravert, NPast_10_pers_critical, NPast_10_pers_dependable, - NPast_10_pers_anxious, NPast_10_pers_complex - - NPast_10_val_obey, NPast_10_val_trad, NPast_10_val_opinion, - NPast_10_val_performance, NPast_10_val_justice - - NFut_5 (Present vs Future 5 years) - 15 variables: - Formula: |present - fut_5| - - NFut_5_pref_read, NFut_5_pref_music, NFut_5_pref_TV, NFut_5_pref_nap, - NFut_5_pref_travel - - NFut_5_pers_extravert, NFut_5_pers_critical, NFut_5_pers_dependable, - NFut_5_pers_anxious, NFut_5_pers_complex - - NFut_5_val_obey, NFut_5_val_trad, NFut_5_val_opinion, - NFut_5_val_performance, NFut_5_val_justice - - NFut_10 (Present vs Future 10 years) - 15 variables: - Formula: |present - fut_10| - - NFut_10_pref_read, NFut_10_pref_music, NFut_10_pref_TV, NFut_10_pref_nap, - NFut_10_pref_travel - - NFut_10_pers_extravert, NFut_10_pers_critical, NFut_10_pers_dependable, - NFut_10_pers_anxious, NFut_10_pers_complex - - NFut_10_val_obey, NFut_10_val_trad, NFut_10_val_opinion, - NFut_10_val_performance, NFut_10_val_justice - - 5.10past (Past 5 vs Past 10 years) - 15 variables: - Formula: |past_5 - past_10| - - 5.10past_pref_read, 5.10past_pref_music, 5.10past_pref_TV, - 5.10past_pref_nap, 5.10past_pref_travel - - 5.10past_pers_extravert, 5.10past_pers_critical, 5.10past_pers_dependable, - 5.10past_pers_anxious, 5.10past_pers_complex - - 5.10past_val_obey, 5.10past_val_trad, 5.10past_val_opinion, - 5.10past_val_performance, 5.10past_val_justice - - 5.10fut (Future 5 vs Future 10 years) - 15 variables: - Formula: |fut_5 - fut_10| - - 5.10fut_pref_read, 5.10fut_pref_music, 5.10fut_pref_TV, 5.10fut_pref_nap, - 5.10fut_pref_travel - - 5.10fut_pers_extravert, 5.10fut_pers_critical, 5.10fut_pers_dependable, - 5.10fut_pers_anxious, 5.10fut_pers_complex - - 5.10fut_val_obey, 5.10fut_val_trad, 5.10fut_val_opinion, - 5.10fut_val_performance, 5.10fut_val_justice - -TRANSFORMATION LOGIC: - All calculations use absolute differences: - - NPast_5: |present_[item] - past_5_[item]| - - NPast_10: |present_[item] - past_10_[item]| - - NFut_5: |present_[item] - fut_5_[item]| - - NFut_10: |present_[item] - fut_10_[item]| - - 5.10past: |past_5_[item] - past_10_[item]| - - 5.10fut: |fut_5_[item] - fut_10_[item]| - - Result: Always positive values representing magnitude of change - Missing values in either source column result in NA - -SPECIAL NOTES: - - Present time uses "pref_tv" (lowercase) while past/future use "pref_TV" - (uppercase), so script handles this naming inconsistency - 
- All values are absolute differences (non-negative) - - -================================================================================ -SCRIPT 07: dataP 07 - domain means.r -================================================================================ - -PURPOSE: - Calculates domain-level means by averaging the 5 items within each domain - (Preferences, Personality, Values) for each of the 6 time interval difference - types. - -VARIABLES CREATED: 18 total (6 time intervals × 3 domains) - -SOURCE COLUMNS: - - NPast_5_pref_read through NPast_5_val_justice (15 columns) - - NPast_10_pref_read through NPast_10_val_justice (15 columns) - - NFut_5_pref_read through NFut_5_val_justice (15 columns) - - NFut_10_pref_read through NFut_10_val_justice (15 columns) - - 5.10past_pref_read through 5.10past_val_justice (15 columns) - - 5.10fut_pref_read through 5.10fut_val_justice (15 columns) - Total: 90 difference columns (created in Script 06) - -TARGET VARIABLES: - NPast_5 Domain Means (3 variables): - - NPast_5_pref_MEAN (mean of 5 preference items) - - NPast_5_pers_MEAN (mean of 5 personality items) - - NPast_5_val_MEAN (mean of 5 values items) - - NPast_10 Domain Means (3 variables): - - NPast_10_pref_MEAN - - NPast_10_pers_MEAN - - NPast_10_val_MEAN - - NFut_5 Domain Means (3 variables): - - NFut_5_pref_MEAN - - NFut_5_pers_MEAN - - NFut_5_val_MEAN - - NFut_10 Domain Means (3 variables): - - NFut_10_pref_MEAN - - NFut_10_pers_MEAN - - NFut_10_val_MEAN - - 5.10past Domain Means (3 variables): - - 5.10past_pref_MEAN - - 5.10past_pers_MEAN - - 5.10past_val_MEAN - - 5.10fut Domain Means (3 variables): - - 5.10fut_pref_MEAN - - 5.10fut_pers_MEAN - - 5.10fut_val_MEAN - -TRANSFORMATION LOGIC: - Each domain mean = average of 5 items within that domain - - Example for NPast_5_pref_MEAN: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel) - - Example for NFut_10_pers_MEAN: - = mean(NFut_10_pers_extravert, NFut_10_pers_critical, - NFut_10_pers_dependable, NFut_10_pers_anxious, - NFut_10_pers_complex) - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF DOMAIN MEANS: - - Provides higher-level summary of perceived change by domain - - Reduces item-level noise by aggregating across related items - - Enables domain-level comparisons across time intervals - - Parallel to Script 04 (DGEN means) but for difference scores instead of - raw DGEN ratings - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - Creates domain-level aggregates of absolute difference scores - - All means are averages of non-negative values (absolute differences) - - -================================================================================ -SUMMARY OF ALL CREATED VARIABLES -================================================================================ - -Total Variables Created: 202 - -By Script: - - Script 01: 60 variables (past/future recoded items) - - Script 02: 15 variables (present recoded items) - - Script 03: 12 variables (DGEN domain scores) - - Script 04: 4 variables (DGEN means) - - Script 05: 3 variables (AOT & CRT scales) - - Script 06: 90 variables (time interval differences) - - Script 07: 18 variables (domain means for differences) - -By Category: - - Time Period Items (75 total): - * Present: 15 items - * Past 5: 15 items - * Past 10: 15 items - * Future 5: 15 items - * Future 10: 15 items - - - DGEN Variables (16 total): - * Domain scores: 12 (3 domains × 4 time periods) - * Mean scores: 4 (1 per time period) - - - 
Cognitive Scales (3 total): - * AOT total - * CRT correct - * CRT intuitive - - - Time Differences (90 total): - * NPast_5: 15 differences - * NPast_10: 15 differences - * NFut_5: 15 differences - * NFut_10: 15 differences - * 5.10past: 15 differences - * 5.10fut: 15 differences - - - Difference Domain Means (18 total): - * 3 domain means × 6 time intervals - - -================================================================================ -DATA PROCESSING NOTES -================================================================================ - -1. PROCESSING ORDER: - Scripts MUST be run in numerical order (01 → 07) as later scripts depend - on variables created by earlier scripts. - -2. SURVEY VERSION HANDLING: - - Two survey versions (01 and 02) were used - - Scripts 01 and 03 combine these versions - - Preference given to version 01 when both exist - -3. MISSING DATA: - - Empty cells and NA values are preserved throughout processing - - Calculations use na.rm=TRUE to exclude missing values from means - - Difference calculations result in NA if either source value is missing - -4. QUALITY ASSURANCE: - - Each script includes QA checks with random row verification - - Manual calculation checks confirm proper transformations - - Column existence checks prevent errors from missing source data - -5. FILE SAVING: - - Most scripts save directly to eohi2.csv - - Scripts 04 and 06 have commented-out write commands for review - - Each script overwrites existing target columns if present - -6. SPECIAL NAMING CONVENTIONS: - - "pref_tv" vs "pref_TV" inconsistency maintained from source data - - DGEN variables use underscores (DGEN_past_5_Pref) - - Difference variables use descriptive prefixes (NPast_5_, 5.10past_) - - -================================================================================ -ITEM REFERENCE GUIDE -================================================================================ - -15 Core Items (Used across all time periods): - -PREFERENCES (5 items): - 1. pref_read - Reading preferences - 2. pref_music - Music preferences - 3. pref_TV/tv - TV watching preferences (note case variation) - 4. pref_nap - Napping preferences - 5. pref_travel - Travel preferences - -PERSONALITY (5 items): - 6. pers_extravert - Extraverted personality - 7. pers_critical - Critical thinking personality - 8. pers_dependable - Dependable personality - 9. pers_anxious - Anxious personality - 10. pers_complex - Complex personality - -VALUES (5 items): - 11. val_obey - Value of obedience - 12. val_trad - Value of tradition - 13. val_opinion - Value of expressing opinions - 14. val_performance - Value of performance - 15. val_justice - Value of justice - - -================================================================================ -END OF DOCUMENTATION -================================================================================ -Last Updated: October 1, 2025 - diff --git a/.history/eohi2/README_Variable_Creation_20251001154419.txt b/.history/eohi2/README_Variable_Creation_20251001154419.txt deleted file mode 100644 index 236fe07..0000000 --- a/.history/eohi2/README_Variable_Creation_20251001154419.txt +++ /dev/null @@ -1,512 +0,0 @@ -================================================================================ -EOHI2 DATA PROCESSING PIPELINE - VARIABLE CREATION DOCUMENTATION -================================================================================ - -This README documents the complete data processing pipeline for eohi2.csv. -All processing scripts should be run in the order listed below. 
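For example, a hypothetical driver enforcing that order (file names as they appear in the
section headings; the working directory is an assumption):

    # Run the pipeline in numerical order; each script depends on columns
    # created by the scripts before it.
    scripts <- c("dataP 01 - recode and combine past & future vars.r",
                 "dataP 02 - recode present VARS.r",
                 "dataP 03 - recode DGEN vars.r",
                 "dataP 04 - DGEN means.r",
                 "dataP 05 - recode scales VARS.r",
                 "dataP 06 - time interval differences.r",
                 "dataP 07 - domain means.r")
    for (s in scripts) source(s)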
- -Source File: eohi2.csv -Processing Scripts: dataP 01 through dataP 07 - -================================================================================ -SCRIPT 01: dataP 01 - recode and combine past & future vars.r -================================================================================ - -PURPOSE: - Combines responses from two survey versions (01 and 02) and recodes Likert - scale text responses to numeric values for past and future time periods. - -VARIABLES CREATED: 60 total (15 items × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefItem_1 through 01fut10ValItem_5 (60 columns) - - Set B: 02past5PrefItem_1 through 02fut10ValItem_5 (60 columns) - -TARGET VARIABLES: - Past 5 Years (15 variables): - - past_5_pref_read, past_5_pref_music, past_5_pref_TV, past_5_pref_nap, - past_5_pref_travel - - past_5_pers_extravert, past_5_pers_critical, past_5_pers_dependable, - past_5_pers_anxious, past_5_pers_complex - - past_5_val_obey, past_5_val_trad, past_5_val_opinion, - past_5_val_performance, past_5_val_justice - - Past 10 Years (15 variables): - - past_10_pref_read, past_10_pref_music, past_10_pref_TV, past_10_pref_nap, - past_10_pref_travel - - past_10_pers_extravert, past_10_pers_critical, past_10_pers_dependable, - past_10_pers_anxious, past_10_pers_complex - - past_10_val_obey, past_10_val_trad, past_10_val_opinion, - past_10_val_performance, past_10_val_justice - - Future 5 Years (15 variables): - - fut_5_pref_read, fut_5_pref_music, fut_5_pref_TV, fut_5_pref_nap, - fut_5_pref_travel - - fut_5_pers_extravert, fut_5_pers_critical, fut_5_pers_dependable, - fut_5_pers_anxious, fut_5_pers_complex - - fut_5_val_obey, fut_5_val_trad, fut_5_val_opinion, - fut_5_val_performance, fut_5_val_justice - - Future 10 Years (15 variables): - - fut_10_pref_read, fut_10_pref_music, fut_10_pref_TV, fut_10_pref_nap, - fut_10_pref_travel - - fut_10_pers_extravert, fut_10_pers_critical, fut_10_pers_dependable, - fut_10_pers_anxious, fut_10_pers_complex - - fut_10_val_obey, fut_10_val_trad, fut_10_val_opinion, - fut_10_val_performance, fut_10_val_justice - -TRANSFORMATION LOGIC: - Step 1: Combine responses from Set A (01) and Set B (02) - - If Set A has a value, use Set A - - If Set A is empty, use Set B - - Step 2: Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -ITEM DOMAINS: - - Preferences (pref): Reading, Music, TV, Nap, Travel - - Personality (pers): Extravert, Critical, Dependable, Anxious, Complex - - Values (val): Obey, Tradition, Opinion, Performance, Justice - - -================================================================================ -SCRIPT 02: dataP 02 - recode present VARS.r -================================================================================ - -PURPOSE: - Recodes present-time Likert scale text responses to numeric values. 
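A minimal sketch of this recode using a named lookup vector (the mapping is tabulated below;
pairing prePrefItem_1 with the reading item is an assumption based on the target list):

    # Unmatched text, empty cells, and NA all index to NA in the lookup.
    likert <- c("Strongly Disagree" = -3, "Disagree" = -2,
                "Somewhat Disagree" = -1, "Neither Agree nor Disagree" = 0,
                "Somewhat Agree" = 1, "Agree" = 2, "Strongly Agree" = 3)
    eohi2 <- read.csv("eohi2.csv", check.names = FALSE)
    eohi2$present_pref_read <- unname(likert[trimws(eohi2$prePrefItem_1)])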
- -VARIABLES CREATED: 15 total - -SOURCE COLUMNS: - - prePrefItem_1 through prePrefItem_5 (5 columns) - - prePersItem_1 through prePersItem_5 (5 columns) - - preValItem_1 through preValItem_5 (5 columns) - -TARGET VARIABLES: - Present Time (15 variables): - - present_pref_read, present_pref_music, present_pref_tv, present_pref_nap, - present_pref_travel - - present_pers_extravert, present_pers_critical, present_pers_dependable, - present_pers_anxious, present_pers_complex - - present_val_obey, present_val_trad, present_val_opinion, - present_val_performance, present_val_justice - -TRANSFORMATION LOGIC: - Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -SPECIAL NOTE: - Present time uses "present_pref_tv" (lowercase) while past/future use - "past_5_pref_TV" (uppercase). This is intentional and preserved from the - original data structure. - - -================================================================================ -SCRIPT 03: dataP 03 - recode DGEN vars.r -================================================================================ - -PURPOSE: - Combines DGEN (domain general) responses from two survey versions (01 and 02). - These are single-item measures for each domain/time combination. - NO RECODING - just copies numeric values as-is. - -VARIABLES CREATED: 12 total (3 domains × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefDGEN_1, 01past5PersDGEN_1, 01past5ValDGEN_1, etc. - - Set B: 02past5PrefDGEN_1, 02past5PersDGEN_1, 02past5ValDGEN_1, etc. - -TARGET VARIABLES: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TRANSFORMATION LOGIC: - - If Set A (01) has a value, use Set A - - If Set A is empty, use Set B (02) - - NO RECODING: Values are copied directly as numeric - -SPECIAL NOTES: - - Future columns in raw data use "_8" suffix for Pref/Pers items - - Future Val columns use "ValuesDGEN" spelling in Set A, "ValDGEN" in Set B - - -================================================================================ -SCRIPT 04: dataP 04 - DGEN means.r -================================================================================ - -PURPOSE: - Calculates mean DGEN scores by averaging the three domain scores (Preferences, - Personality, Values) for each time period. - -VARIABLES CREATED: 4 total (1 per time period) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - - DGEN_past_5_mean - - DGEN_past_10_mean - - DGEN_fut_5_mean - - DGEN_fut_10_mean - -TRANSFORMATION LOGIC: - Each mean = (Pref + Pers + Val) / 3 - - NA values are excluded from calculation (na.rm = TRUE) - - -================================================================================ -SCRIPT 05: dataP 05 - recode scales VARS.r -================================================================================ - -PURPOSE: - Processes two cognitive scales: - 1. AOT (Actively Open-minded Thinking): 8-item scale with reverse coding - 2. 


================================================================================
SCRIPT 05: dataP 05 - recode scales VARS.r
================================================================================

PURPOSE:
  Processes two cognitive scales:
  1. AOT (Actively Open-minded Thinking): 8-item scale with reverse coding
  2. CRT (Cognitive Reflection Test): 3-item test with correct/intuitive
     scoring

VARIABLES CREATED: 3 total

SOURCE COLUMNS:
  AOT Scale:
  - aot_1, aot_2, aot_3, aot_4, aot_5, aot_6, aot_7, aot_8

  CRT Test:
  - crt_1, crt_2, crt_3

TARGET VARIABLES:
  - aot_total (mean of 8 items with reverse coding)
  - crt_correct (proportion of correct answers)
  - crt_int (proportion of intuitive/incorrect answers)

TRANSFORMATION LOGIC:

  AOT Scale (aot_total):
  1. Items 4, 5, 6, 7 are reverse coded by multiplying by -1
  2. Calculate mean of all 8 items (with reverse coding applied)
  3. Original source values are NOT modified in the dataframe
  4. NA values excluded from calculation (na.rm = TRUE)

  CRT Correct (crt_correct):
    Correct answers:
    - crt_1: "5 cents"
    - crt_2: "5 minutes"
    - crt_3: "47 days"
    Calculation: (Number of correct answers) / (Number of non-missing answers)

  CRT Intuitive (crt_int):
    Intuitive (common incorrect) answers:
    - crt_1: "10 cents"
    - crt_2: "100 minutes"
    - crt_3: "24 days"
    Calculation: (Number of intuitive answers) / (Number of non-missing
    answers)

SPECIAL NOTES:
  - CRT scoring is case-insensitive and trims whitespace
  - Both CRT scores are proportions (0.00 to 1.00)
  - Empty/missing CRT responses are excluded from the denominator
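
ILLUSTRATIVE EXAMPLE (R):
  A minimal sketch of both scoring rules above. The helper norm and the
  intermediate names are invented here; the answer keys are taken from this
  README.

    # AOT: reverse code items 4-7 on a copy, leaving the source columns intact
    aot <- df[, paste0("aot_", 1:8)]
    rev_items <- paste0("aot_", 4:7)
    aot[rev_items] <- aot[rev_items] * -1
    df$aot_total <- rowMeans(aot, na.rm = TRUE)

    # CRT: case-insensitive, whitespace-trimmed matching
    norm <- function(x) trimws(tolower(x))
    crt <- data.frame(lapply(df[, paste0("crt_", 1:3)], norm))
    answered <- rowSums(!is.na(crt) & crt != "")
    correct   <- (crt[[1]] %in% "5 cents")  + (crt[[2]] %in% "5 minutes") +
                 (crt[[3]] %in% "47 days")
    intuitive <- (crt[[1]] %in% "10 cents") + (crt[[2]] %in% "100 minutes") +
                 (crt[[3]] %in% "24 days")
    df$crt_correct <- correct   / answered   # NaN when nothing was answered
    df$crt_int     <- intuitive / answered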


================================================================================
SCRIPT 06: dataP 06 - time interval differences.r
================================================================================

PURPOSE:
  Calculates absolute differences between time intervals to measure perceived
  change across time periods for all 15 items.

VARIABLES CREATED: 90 total (6 difference types × 15 items)

SOURCE COLUMNS:
  - present_pref_read through present_val_justice (15 columns)
  - past_5_pref_read through past_5_val_justice (15 columns)
  - past_10_pref_read through past_10_val_justice (15 columns)
  - fut_5_pref_read through fut_5_val_justice (15 columns)
  - fut_10_pref_read through fut_10_val_justice (15 columns)

TARGET VARIABLES (by difference type):

  NPast_5 (Present vs Past 5 years) - 15 variables:
    Formula: |present - past_5|
    - NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, NPast_5_pref_nap,
      NPast_5_pref_travel
    - NPast_5_pers_extravert, NPast_5_pers_critical, NPast_5_pers_dependable,
      NPast_5_pers_anxious, NPast_5_pers_complex
    - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion,
      NPast_5_val_performance, NPast_5_val_justice

  NPast_10 (Present vs Past 10 years) - 15 variables:
    Formula: |present - past_10|
    - NPast_10_pref_read, NPast_10_pref_music, NPast_10_pref_TV,
      NPast_10_pref_nap, NPast_10_pref_travel
    - NPast_10_pers_extravert, NPast_10_pers_critical, NPast_10_pers_dependable,
      NPast_10_pers_anxious, NPast_10_pers_complex
    - NPast_10_val_obey, NPast_10_val_trad, NPast_10_val_opinion,
      NPast_10_val_performance, NPast_10_val_justice

  NFut_5 (Present vs Future 5 years) - 15 variables:
    Formula: |present - fut_5|
    - NFut_5_pref_read, NFut_5_pref_music, NFut_5_pref_TV, NFut_5_pref_nap,
      NFut_5_pref_travel
    - NFut_5_pers_extravert, NFut_5_pers_critical, NFut_5_pers_dependable,
      NFut_5_pers_anxious, NFut_5_pers_complex
    - NFut_5_val_obey, NFut_5_val_trad, NFut_5_val_opinion,
      NFut_5_val_performance, NFut_5_val_justice

  NFut_10 (Present vs Future 10 years) - 15 variables:
    Formula: |present - fut_10|
    - NFut_10_pref_read, NFut_10_pref_music, NFut_10_pref_TV, NFut_10_pref_nap,
      NFut_10_pref_travel
    - NFut_10_pers_extravert, NFut_10_pers_critical, NFut_10_pers_dependable,
      NFut_10_pers_anxious, NFut_10_pers_complex
    - NFut_10_val_obey, NFut_10_val_trad, NFut_10_val_opinion,
      NFut_10_val_performance, NFut_10_val_justice

  5.10past (Past 5 vs Past 10 years) - 15 variables:
    Formula: |past_5 - past_10|
    - 5.10past_pref_read, 5.10past_pref_music, 5.10past_pref_TV,
      5.10past_pref_nap, 5.10past_pref_travel
    - 5.10past_pers_extravert, 5.10past_pers_critical, 5.10past_pers_dependable,
      5.10past_pers_anxious, 5.10past_pers_complex
    - 5.10past_val_obey, 5.10past_val_trad, 5.10past_val_opinion,
      5.10past_val_performance, 5.10past_val_justice

  5.10fut (Future 5 vs Future 10 years) - 15 variables:
    Formula: |fut_5 - fut_10|
    - 5.10fut_pref_read, 5.10fut_pref_music, 5.10fut_pref_TV, 5.10fut_pref_nap,
      5.10fut_pref_travel
    - 5.10fut_pers_extravert, 5.10fut_pers_critical, 5.10fut_pers_dependable,
      5.10fut_pers_anxious, 5.10fut_pers_complex
    - 5.10fut_val_obey, 5.10fut_val_trad, 5.10fut_val_opinion,
      5.10fut_val_performance, 5.10fut_val_justice

TRANSFORMATION LOGIC:
  All calculations use absolute differences:
  - NPast_5:  |present_[item] - past_5_[item]|
  - NPast_10: |present_[item] - past_10_[item]|
  - NFut_5:   |present_[item] - fut_5_[item]|
  - NFut_10:  |present_[item] - fut_10_[item]|
  - 5.10past: |past_5_[item] - past_10_[item]|
  - 5.10fut:  |fut_5_[item] - fut_10_[item]|

  Result: always non-negative values representing the magnitude of change
  Missing values in either source column result in NA

SPECIAL NOTES:
  - Present time uses "pref_tv" (lowercase) while past/future use "pref_TV"
    (uppercase), so the script handles this naming inconsistency
  - All values are absolute differences (non-negative)
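
ILLUSTRATIVE EXAMPLE (R):
  A minimal sketch of one difference family (NPast_5), including the
  pref_tv / pref_TV case mismatch noted above; the items vector simply
  spells out the 15 item names from this README.

    items <- c("pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel",
               "pers_extravert", "pers_critical", "pers_dependable",
               "pers_anxious", "pers_complex",
               "val_obey", "val_trad", "val_opinion", "val_performance",
               "val_justice")

    for (item in items) {
      # Present-time columns use lowercase "tv"
      present_col <- paste0("present_", sub("pref_TV", "pref_tv", item))
      df[[paste0("NPast_5_", item)]] <-
        abs(df[[present_col]] - df[[paste0("past_5_", item)]])
    }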


================================================================================
SCRIPT 07: dataP 07 - domain means.r
================================================================================

PURPOSE:
  Calculates domain-level means by averaging the 5 items within each domain
  (Preferences, Personality, Values) for each of the 6 time interval
  difference types.

VARIABLES CREATED: 18 total (6 time intervals × 3 domains)

SOURCE COLUMNS:
  - NPast_5_pref_read through NPast_5_val_justice (15 columns)
  - NPast_10_pref_read through NPast_10_val_justice (15 columns)
  - NFut_5_pref_read through NFut_5_val_justice (15 columns)
  - NFut_10_pref_read through NFut_10_val_justice (15 columns)
  - 5.10past_pref_read through 5.10past_val_justice (15 columns)
  - 5.10fut_pref_read through 5.10fut_val_justice (15 columns)
  Total: 90 difference columns (created in Script 06)

TARGET VARIABLES:
  NPast_5 Domain Means (3 variables):
  - NPast_5_pref_MEAN (mean of 5 preference items)
  - NPast_5_pers_MEAN (mean of 5 personality items)
  - NPast_5_val_MEAN (mean of 5 values items)

  NPast_10 Domain Means (3 variables):
  - NPast_10_pref_MEAN
  - NPast_10_pers_MEAN
  - NPast_10_val_MEAN

  NFut_5 Domain Means (3 variables):
  - NFut_5_pref_MEAN
  - NFut_5_pers_MEAN
  - NFut_5_val_MEAN

  NFut_10 Domain Means (3 variables):
  - NFut_10_pref_MEAN
  - NFut_10_pers_MEAN
  - NFut_10_val_MEAN

  5.10past Domain Means (3 variables):
  - 5.10past_pref_MEAN
  - 5.10past_pers_MEAN
  - 5.10past_val_MEAN

  5.10fut Domain Means (3 variables):
  - 5.10fut_pref_MEAN
  - 5.10fut_pers_MEAN
  - 5.10fut_val_MEAN

TRANSFORMATION LOGIC:
  Each domain mean = average of 5 items within that domain

  Example for NPast_5_pref_MEAN:
    = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV,
           NPast_5_pref_nap, NPast_5_pref_travel)

  Example for NFut_10_pers_MEAN:
    = mean(NFut_10_pers_extravert, NFut_10_pers_critical,
           NFut_10_pers_dependable, NFut_10_pers_anxious,
           NFut_10_pers_complex)

  NA values excluded from calculation (na.rm = TRUE)

PURPOSE OF DOMAIN MEANS:
  - Provides higher-level summary of perceived change by domain
  - Reduces item-level noise by aggregating across related items
  - Enables domain-level comparisons across time intervals
  - Parallel to Script 04 (DGEN means) but for difference scores instead of
    raw DGEN ratings

SPECIAL NOTES:
  - This script depends on Script 06 being run first
  - Creates domain-level aggregates of absolute difference scores
  - All means are averages of non-negative values (absolute differences)
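
ILLUSTRATIVE EXAMPLE (R):
  A minimal sketch that builds all 18 domain means in one pass; the
  intervals vector and domains list are spelled out here to match the
  naming scheme above.

    intervals <- c("NPast_5", "NPast_10", "NFut_5", "NFut_10",
                   "5.10past", "5.10fut")
    domains <- list(
      pref = c("pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel"),
      pers = c("pers_extravert", "pers_critical", "pers_dependable",
               "pers_anxious", "pers_complex"),
      val  = c("val_obey", "val_trad", "val_opinion", "val_performance",
               "val_justice")
    )

    for (iv in intervals) {
      for (dom in names(domains)) {
        cols <- paste0(iv, "_", domains[[dom]])
        df[[paste0(iv, "_", dom, "_MEAN")]] <- rowMeans(df[, cols], na.rm = TRUE)
      }
    }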


================================================================================
SUMMARY OF ALL CREATED VARIABLES
================================================================================

Total Variables Created: 202

By Script:
  - Script 01: 60 variables (past/future recoded items)
  - Script 02: 15 variables (present recoded items)
  - Script 03: 12 variables (DGEN domain scores)
  - Script 04:  4 variables (DGEN means)
  - Script 05:  3 variables (AOT & CRT scales)
  - Script 06: 90 variables (time interval differences)
  - Script 07: 18 variables (domain means for differences)

By Category:
  - Time Period Items (75 total):
    * Present: 15 items
    * Past 5: 15 items
    * Past 10: 15 items
    * Future 5: 15 items
    * Future 10: 15 items

  - DGEN Variables (16 total):
    * Domain scores: 12 (3 domains × 4 time periods)
    * Mean scores: 4 (1 per time period)

  - Cognitive Scales (3 total):
    * AOT total
    * CRT correct
    * CRT intuitive

  - Time Differences (90 total):
    * NPast_5: 15 differences
    * NPast_10: 15 differences
    * NFut_5: 15 differences
    * NFut_10: 15 differences
    * 5.10past: 15 differences
    * 5.10fut: 15 differences

  - Domain Means for Differences (18 total):
    * NPast_5: 3 domain means
    * NPast_10: 3 domain means
    * NFut_5: 3 domain means
    * NFut_10: 3 domain means
    * 5.10past: 3 domain means
    * 5.10fut: 3 domain means


================================================================================
DATA PROCESSING NOTES
================================================================================

1. PROCESSING ORDER:
   Scripts MUST be run in numerical order (01 → 07) as later scripts depend
   on variables created by earlier scripts.

2. SURVEY VERSION HANDLING:
   - Two survey versions (01 and 02) were used
   - Scripts 01 and 03 combine these versions
   - Preference given to version 01 when both exist

3. MISSING DATA:
   - Empty cells and NA values are preserved throughout processing
   - Calculations use na.rm = TRUE to exclude missing values from means
   - Difference calculations result in NA if either source value is missing

4. QUALITY ASSURANCE:
   - Each script includes QA checks with random row verification
   - Manual calculation checks confirm proper transformations
   - Column existence checks prevent errors from missing source data

5. FILE SAVING:
   - Most scripts save directly to eohi2.csv
   - Scripts 04, 06, and 07 have commented-out write commands for review
   - Each script overwrites existing target columns if present

6. SPECIAL NAMING CONVENTIONS:
   - "pref_tv" vs "pref_TV" inconsistency maintained from source data
   - DGEN variables use underscores (DGEN_past_5_Pref)
   - Difference variables use descriptive prefixes (NPast_5_, 5.10past_)


================================================================================
ITEM REFERENCE GUIDE
================================================================================

15 Core Items (Used across all time periods):

PREFERENCES (5 items):
  1. pref_read - Reading preferences
  2. pref_music - Music preferences
  3. pref_TV/tv - TV watching preferences (note case variation)
  4. pref_nap - Napping preferences
  5. pref_travel - Travel preferences

PERSONALITY (5 items):
  6. pers_extravert - Extraverted personality
  7. pers_critical - Critical thinking personality
  8. pers_dependable - Dependable personality
  9. pers_anxious - Anxious personality
  10. pers_complex - Complex personality

VALUES (5 items):
  11. val_obey - Value of obedience
  12. val_trad - Value of tradition
  13. val_opinion - Value of expressing opinions
  14. val_performance - Value of performance
  15. val_justice - Value of justice


================================================================================
END OF DOCUMENTATION
================================================================================
Last Updated: October 1, 2025
- -Source File: eohi2.csv -Processing Scripts: dataP 01 through dataP 07 - -================================================================================ -SCRIPT 01: dataP 01 - recode and combine past & future vars.r -================================================================================ - -PURPOSE: - Combines responses from two survey versions (01 and 02) and recodes Likert - scale text responses to numeric values for past and future time periods. - -VARIABLES CREATED: 60 total (15 items × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefItem_1 through 01fut10ValItem_5 (60 columns) - - Set B: 02past5PrefItem_1 through 02fut10ValItem_5 (60 columns) - -TARGET VARIABLES: - Past 5 Years (15 variables): - - past_5_pref_read, past_5_pref_music, past_5_pref_TV, past_5_pref_nap, - past_5_pref_travel - - past_5_pers_extravert, past_5_pers_critical, past_5_pers_dependable, - past_5_pers_anxious, past_5_pers_complex - - past_5_val_obey, past_5_val_trad, past_5_val_opinion, - past_5_val_performance, past_5_val_justice - - Past 10 Years (15 variables): - - past_10_pref_read, past_10_pref_music, past_10_pref_TV, past_10_pref_nap, - past_10_pref_travel - - past_10_pers_extravert, past_10_pers_critical, past_10_pers_dependable, - past_10_pers_anxious, past_10_pers_complex - - past_10_val_obey, past_10_val_trad, past_10_val_opinion, - past_10_val_performance, past_10_val_justice - - Future 5 Years (15 variables): - - fut_5_pref_read, fut_5_pref_music, fut_5_pref_TV, fut_5_pref_nap, - fut_5_pref_travel - - fut_5_pers_extravert, fut_5_pers_critical, fut_5_pers_dependable, - fut_5_pers_anxious, fut_5_pers_complex - - fut_5_val_obey, fut_5_val_trad, fut_5_val_opinion, - fut_5_val_performance, fut_5_val_justice - - Future 10 Years (15 variables): - - fut_10_pref_read, fut_10_pref_music, fut_10_pref_TV, fut_10_pref_nap, - fut_10_pref_travel - - fut_10_pers_extravert, fut_10_pers_critical, fut_10_pers_dependable, - fut_10_pers_anxious, fut_10_pers_complex - - fut_10_val_obey, fut_10_val_trad, fut_10_val_opinion, - fut_10_val_performance, fut_10_val_justice - -TRANSFORMATION LOGIC: - Step 1: Combine responses from Set A (01) and Set B (02) - - If Set A has a value, use Set A - - If Set A is empty, use Set B - - Step 2: Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -ITEM DOMAINS: - - Preferences (pref): Reading, Music, TV, Nap, Travel - - Personality (pers): Extravert, Critical, Dependable, Anxious, Complex - - Values (val): Obey, Tradition, Opinion, Performance, Justice - - -================================================================================ -SCRIPT 02: dataP 02 - recode present VARS.r -================================================================================ - -PURPOSE: - Recodes present-time Likert scale text responses to numeric values. 
- -VARIABLES CREATED: 15 total - -SOURCE COLUMNS: - - prePrefItem_1 through prePrefItem_5 (5 columns) - - prePersItem_1 through prePersItem_5 (5 columns) - - preValItem_1 through preValItem_5 (5 columns) - -TARGET VARIABLES: - Present Time (15 variables): - - present_pref_read, present_pref_music, present_pref_tv, present_pref_nap, - present_pref_travel - - present_pers_extravert, present_pers_critical, present_pers_dependable, - present_pers_anxious, present_pers_complex - - present_val_obey, present_val_trad, present_val_opinion, - present_val_performance, present_val_justice - -TRANSFORMATION LOGIC: - Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -SPECIAL NOTE: - Present time uses "present_pref_tv" (lowercase) while past/future use - "past_5_pref_TV" (uppercase). This is intentional and preserved from the - original data structure. - - -================================================================================ -SCRIPT 03: dataP 03 - recode DGEN vars.r -================================================================================ - -PURPOSE: - Combines DGEN (domain general) responses from two survey versions (01 and 02). - These are single-item measures for each domain/time combination. - NO RECODING - just copies numeric values as-is. - -VARIABLES CREATED: 12 total (3 domains × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefDGEN_1, 01past5PersDGEN_1, 01past5ValDGEN_1, etc. - - Set B: 02past5PrefDGEN_1, 02past5PersDGEN_1, 02past5ValDGEN_1, etc. - -TARGET VARIABLES: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TRANSFORMATION LOGIC: - - If Set A (01) has a value, use Set A - - If Set A is empty, use Set B (02) - - NO RECODING: Values are copied directly as numeric - -SPECIAL NOTES: - - Future columns in raw data use "_8" suffix for Pref/Pers items - - Future Val columns use "ValuesDGEN" spelling in Set A, "ValDGEN" in Set B - - -================================================================================ -SCRIPT 04: dataP 04 - DGEN means.r -================================================================================ - -PURPOSE: - Calculates mean DGEN scores by averaging the three domain scores (Preferences, - Personality, Values) for each time period. - -VARIABLES CREATED: 4 total (1 per time period) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - - DGEN_past_5_mean - - DGEN_past_10_mean - - DGEN_fut_5_mean - - DGEN_fut_10_mean - -TRANSFORMATION LOGIC: - Each mean = (Pref + Pers + Val) / 3 - - NA values are excluded from calculation (na.rm = TRUE) - - -================================================================================ -SCRIPT 05: dataP 05 - recode scales VARS.r -================================================================================ - -PURPOSE: - Processes two cognitive scales: - 1. AOT (Actively Open-minded Thinking): 8-item scale with reverse coding - 2. 
CRT (Cognitive Reflection Test): 3-item test with correct/intuitive scoring - -VARIABLES CREATED: 3 total - -SOURCE COLUMNS: - AOT Scale: - - aot_1, aot_2, aot_3, aot_4, aot_5, aot_6, aot_7, aot_8 - - CRT Test: - - crt_1, crt_2, crt_3 - -TARGET VARIABLES: - - aot_total (mean of 8 items with reverse coding) - - crt_correct (proportion of correct answers) - - crt_int (proportion of intuitive/incorrect answers) - -TRANSFORMATION LOGIC: - - AOT Scale (aot_total): - 1. Items 4, 5, 6, 7 are reverse coded by multiplying by -1 - 2. Calculate mean of all 8 items (with reverse coding applied) - 3. Original source values are NOT modified in the dataframe - 4. NA values excluded from calculation (na.rm = TRUE) - - CRT Correct (crt_correct): - Correct answers: - - crt_1: "5 cents" - - crt_2: "5 minutes" - - crt_3: "47 days" - Calculation: (Number of correct answers) / (Number of non-missing answers) - - CRT Intuitive (crt_int): - Intuitive (common incorrect) answers: - - crt_1: "10 cents" - - crt_2: "100 minutes" - - crt_3: "24 days" - Calculation: (Number of intuitive answers) / (Number of non-missing answers) - -SPECIAL NOTES: - - CRT scoring is case-insensitive and trims whitespace - - Both CRT scores are proportions (0.00 to 1.00) - - Empty/missing CRT responses are excluded from denominator - - -================================================================================ -SCRIPT 06: dataP 06 - time interval differences.r -================================================================================ - -PURPOSE: - Calculates absolute differences between time intervals to measure perceived - change across time periods for all 15 items. - -VARIABLES CREATED: 90 total (6 difference types × 15 items) - -SOURCE COLUMNS: - - present_pref_read through present_val_justice (15 columns) - - past_5_pref_read through past_5_val_justice (15 columns) - - past_10_pref_read through past_10_val_justice (15 columns) - - fut_5_pref_read through fut_5_val_justice (15 columns) - - fut_10_pref_read through fut_10_val_justice (15 columns) - -TARGET VARIABLES (by difference type): - - NPast_5 (Present vs Past 5 years) - 15 variables: - Formula: |present - past_5| - - NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, NPast_5_pref_nap, - NPast_5_pref_travel - - NPast_5_pers_extravert, NPast_5_pers_critical, NPast_5_pers_dependable, - NPast_5_pers_anxious, NPast_5_pers_complex - - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice - - NPast_10 (Present vs Past 10 years) - 15 variables: - Formula: |present - past_10| - - NPast_10_pref_read, NPast_10_pref_music, NPast_10_pref_TV, - NPast_10_pref_nap, NPast_10_pref_travel - - NPast_10_pers_extravert, NPast_10_pers_critical, NPast_10_pers_dependable, - NPast_10_pers_anxious, NPast_10_pers_complex - - NPast_10_val_obey, NPast_10_val_trad, NPast_10_val_opinion, - NPast_10_val_performance, NPast_10_val_justice - - NFut_5 (Present vs Future 5 years) - 15 variables: - Formula: |present - fut_5| - - NFut_5_pref_read, NFut_5_pref_music, NFut_5_pref_TV, NFut_5_pref_nap, - NFut_5_pref_travel - - NFut_5_pers_extravert, NFut_5_pers_critical, NFut_5_pers_dependable, - NFut_5_pers_anxious, NFut_5_pers_complex - - NFut_5_val_obey, NFut_5_val_trad, NFut_5_val_opinion, - NFut_5_val_performance, NFut_5_val_justice - - NFut_10 (Present vs Future 10 years) - 15 variables: - Formula: |present - fut_10| - - NFut_10_pref_read, NFut_10_pref_music, NFut_10_pref_TV, NFut_10_pref_nap, - NFut_10_pref_travel - - NFut_10_pers_extravert, 
================================================================================
SCRIPT 06: dataP 06 - time interval differences.r
================================================================================

PURPOSE:
  Calculates absolute differences between time intervals to measure perceived
  change across time periods for all 15 items.

VARIABLES CREATED: 90 total (6 difference types × 15 items)

SOURCE COLUMNS:
  - present_pref_read through present_val_justice (15 columns)
  - past_5_pref_read through past_5_val_justice (15 columns)
  - past_10_pref_read through past_10_val_justice (15 columns)
  - fut_5_pref_read through fut_5_val_justice (15 columns)
  - fut_10_pref_read through fut_10_val_justice (15 columns)

TARGET VARIABLES (by difference type):
  NPast_5 (Present vs Past 5 years) - 15 variables:
    Formula: |present - past_5|
    - NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV,
      NPast_5_pref_nap, NPast_5_pref_travel
    - NPast_5_pers_extravert, NPast_5_pers_critical, NPast_5_pers_dependable,
      NPast_5_pers_anxious, NPast_5_pers_complex
    - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion,
      NPast_5_val_performance, NPast_5_val_justice

  NPast_10 (Present vs Past 10 years) - 15 variables:
    Formula: |present - past_10|
    - NPast_10_pref_read, NPast_10_pref_music, NPast_10_pref_TV,
      NPast_10_pref_nap, NPast_10_pref_travel
    - NPast_10_pers_extravert, NPast_10_pers_critical,
      NPast_10_pers_dependable, NPast_10_pers_anxious, NPast_10_pers_complex
    - NPast_10_val_obey, NPast_10_val_trad, NPast_10_val_opinion,
      NPast_10_val_performance, NPast_10_val_justice

  NFut_5 (Present vs Future 5 years) - 15 variables:
    Formula: |present - fut_5|
    - NFut_5_pref_read, NFut_5_pref_music, NFut_5_pref_TV, NFut_5_pref_nap,
      NFut_5_pref_travel
    - NFut_5_pers_extravert, NFut_5_pers_critical, NFut_5_pers_dependable,
      NFut_5_pers_anxious, NFut_5_pers_complex
    - NFut_5_val_obey, NFut_5_val_trad, NFut_5_val_opinion,
      NFut_5_val_performance, NFut_5_val_justice

  NFut_10 (Present vs Future 10 years) - 15 variables:
    Formula: |present - fut_10|
    - NFut_10_pref_read, NFut_10_pref_music, NFut_10_pref_TV, NFut_10_pref_nap,
      NFut_10_pref_travel
    - NFut_10_pers_extravert, NFut_10_pers_critical, NFut_10_pers_dependable,
      NFut_10_pers_anxious, NFut_10_pers_complex
    - NFut_10_val_obey, NFut_10_val_trad, NFut_10_val_opinion,
      NFut_10_val_performance, NFut_10_val_justice

  5.10past (Past 5 vs Past 10 years) - 15 variables:
    Formula: |past_5 - past_10|
    - 5.10past_pref_read, 5.10past_pref_music, 5.10past_pref_TV,
      5.10past_pref_nap, 5.10past_pref_travel
    - 5.10past_pers_extravert, 5.10past_pers_critical,
      5.10past_pers_dependable, 5.10past_pers_anxious, 5.10past_pers_complex
    - 5.10past_val_obey, 5.10past_val_trad, 5.10past_val_opinion,
      5.10past_val_performance, 5.10past_val_justice

  5.10fut (Future 5 vs Future 10 years) - 15 variables:
    Formula: |fut_5 - fut_10|
    - 5.10fut_pref_read, 5.10fut_pref_music, 5.10fut_pref_TV, 5.10fut_pref_nap,
      5.10fut_pref_travel
    - 5.10fut_pers_extravert, 5.10fut_pers_critical, 5.10fut_pers_dependable,
      5.10fut_pers_anxious, 5.10fut_pers_complex
    - 5.10fut_val_obey, 5.10fut_val_trad, 5.10fut_val_opinion,
      5.10fut_val_performance, 5.10fut_val_justice

TRANSFORMATION LOGIC (see the R sketch below):
  All calculations use absolute differences:
  - NPast_5:  |present_[item] - past_5_[item]|
  - NPast_10: |present_[item] - past_10_[item]|
  - NFut_5:   |present_[item] - fut_5_[item]|
  - NFut_10:  |present_[item] - fut_10_[item]|
  - 5.10past: |past_5_[item] - past_10_[item]|
  - 5.10fut:  |fut_5_[item] - fut_10_[item]|

  Result: non-negative values representing the magnitude of change
  Missing values in either source column result in NA

SPECIAL NOTES:
  - Present time uses "pref_tv" (lowercase) while past/future use "pref_TV"
    (uppercase), so the script handles this naming inconsistency
  - All values are absolute differences (non-negative)
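The 90 columns reduce to one template. A sketch, assuming a data frame "dat"
with the columns listed above (the helper and loop names are mine, not the
script's):

  items <- c("pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel",
             "pers_extravert", "pers_critical", "pers_dependable",
             "pers_anxious", "pers_complex",
             "val_obey", "val_trad", "val_opinion", "val_performance",
             "val_justice")

  # present-time columns use lowercase "tv"; all other periods use "TV"
  present_name <- function(item)
    paste0("present_", sub("pref_TV", "pref_tv", item))

  for (item in items) {
    dat[[paste0("NPast_5_", item)]] <-
      abs(dat[[present_name(item)]] - dat[[paste0("past_5_", item)]])
    dat[[paste0("5.10past_", item)]] <-
      abs(dat[[paste0("past_5_", item)]] - dat[[paste0("past_10_", item)]])
    # NPast_10, NFut_5, NFut_10, and 5.10fut follow the same template
  }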
================================================================================
SCRIPT 07: dataP 07 - domain means.r
================================================================================

PURPOSE:
  Calculates domain-level means by averaging the 5 items within each domain
  (Preferences, Personality, Values) for each of the 6 time interval
  difference types.

VARIABLES CREATED: 18 total (6 time intervals × 3 domains)

SOURCE COLUMNS:
  - NPast_5_pref_read through NPast_5_val_justice (15 columns)
  - NPast_10_pref_read through NPast_10_val_justice (15 columns)
  - NFut_5_pref_read through NFut_5_val_justice (15 columns)
  - NFut_10_pref_read through NFut_10_val_justice (15 columns)
  - 5.10past_pref_read through 5.10past_val_justice (15 columns)
  - 5.10fut_pref_read through 5.10fut_val_justice (15 columns)
  Total: 90 difference columns (created in Script 06)

TARGET VARIABLES:
  NPast_5 Domain Means (3 variables):
  - NPast_5_pref_MEAN (mean of 5 preference items)
  - NPast_5_pers_MEAN (mean of 5 personality items)
  - NPast_5_val_MEAN (mean of 5 values items)

  NPast_10 Domain Means (3 variables):
  - NPast_10_pref_MEAN
  - NPast_10_pers_MEAN
  - NPast_10_val_MEAN

  NFut_5 Domain Means (3 variables):
  - NFut_5_pref_MEAN
  - NFut_5_pers_MEAN
  - NFut_5_val_MEAN

  NFut_10 Domain Means (3 variables):
  - NFut_10_pref_MEAN
  - NFut_10_pers_MEAN
  - NFut_10_val_MEAN

  5.10past Domain Means (3 variables):
  - 5.10past_pref_MEAN
  - 5.10past_pers_MEAN
  - 5.10past_val_MEAN

  5.10fut Domain Means (3 variables):
  - 5.10fut_pref_MEAN
  - 5.10fut_pers_MEAN
  - 5.10fut_val_MEAN

TRANSFORMATION LOGIC (see the R sketch below):
  Each domain mean = average of the 5 items within that domain

  Example for NPast_5_pref_MEAN:
    = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV,
           NPast_5_pref_nap, NPast_5_pref_travel)

  Example for NFut_10_pers_MEAN:
    = mean(NFut_10_pers_extravert, NFut_10_pers_critical,
           NFut_10_pers_dependable, NFut_10_pers_anxious,
           NFut_10_pers_complex)

  NA values are excluded from the calculation (na.rm = TRUE)

PURPOSE OF DOMAIN MEANS:
  - Provides a higher-level summary of perceived change by domain
  - Reduces item-level noise by aggregating across related items
  - Enables domain-level comparisons across time intervals
  - Parallel to Script 04 (DGEN means) but for difference scores instead of
    raw DGEN ratings

SPECIAL NOTES:
  - This script depends on Script 06 being run first
  - Creates domain-level aggregates of absolute difference scores
  - All means are averages of non-negative values (absolute differences)
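All 18 means follow one pattern; a sketch under the same assumptions as the
Script 06 example above ("dat" and the loop names are mine):

  intervals <- c("NPast_5", "NPast_10", "NFut_5", "NFut_10",
                 "5.10past", "5.10fut")
  domains <- list(pref = c("read", "music", "TV", "nap", "travel"),
                  pers = c("extravert", "critical", "dependable",
                           "anxious", "complex"),
                  val  = c("obey", "trad", "opinion", "performance",
                           "justice"))

  for (iv in intervals) {
    for (dom in names(domains)) {
      cols <- paste0(iv, "_", dom, "_", domains[[dom]])
      dat[[paste0(iv, "_", dom, "_MEAN")]] <-
        rowMeans(dat[, cols], na.rm = TRUE)
    }
  }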
================================================================================
SUMMARY OF ALL CREATED VARIABLES
================================================================================

Total Variables Created: 202

By Script:
  - Script 01: 60 variables (past/future recoded items)
  - Script 02: 15 variables (present recoded items)
  - Script 03: 12 variables (DGEN domain scores)
  - Script 04:  4 variables (DGEN means)
  - Script 05:  3 variables (AOT & CRT scales)
  - Script 06: 90 variables (time interval differences)
  - Script 07: 18 variables (domain means for differences)

By Category:
  Time Period Items (75 total):
  * Present: 15 items
  * Past 5: 15 items
  * Past 10: 15 items
  * Future 5: 15 items
  * Future 10: 15 items

  DGEN Variables (16 total):
  * Domain scores: 12 (3 domains × 4 time periods)
  * Mean scores: 4 (1 per time period)

  Cognitive Scales (3 total):
  * AOT total
  * CRT correct
  * CRT intuitive

  Time Differences (90 total):
  * NPast_5: 15 differences
  * NPast_10: 15 differences
  * NFut_5: 15 differences
  * NFut_10: 15 differences
  * 5.10past: 15 differences
  * 5.10fut: 15 differences

  Domain Means for Differences (18 total):
  * NPast_5: 3 domain means
  * NPast_10: 3 domain means
  * NFut_5: 3 domain means
  * NFut_10: 3 domain means
  * 5.10past: 3 domain means
  * 5.10fut: 3 domain means
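The per-script and per-category breakdowns agree; checking the arithmetic in
plain R:

  sum(c(60, 15, 12, 4, 3, 90, 18))   # 202, Scripts 01-07
  sum(c(75, 16, 3, 90, 18))          # 202 by category as well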
================================================================================
DATA PROCESSING NOTES
================================================================================

1. PROCESSING ORDER:
   Scripts MUST be run in numerical order (01 → 07) because later scripts
   depend on variables created by earlier scripts (see the sketch below).

2. SURVEY VERSION HANDLING:
   - Two survey versions (01 and 02) were used
   - Scripts 01 and 03 combine these versions
   - Preference is given to version 01 when both exist

3. MISSING DATA:
   - Empty cells and NA values are preserved throughout processing
   - Calculations use na.rm = TRUE to exclude missing values from means
   - Difference calculations result in NA if either source value is missing

4. QUALITY ASSURANCE:
   - Each script includes QA checks with random-row verification
   - Manual calculation checks confirm proper transformations
   - Column existence checks prevent errors from missing source data

5. FILE SAVING:
   - Most scripts save directly to eohi2.csv
   - Scripts 04, 06, and 07 have commented-out write commands for review
   - Each script overwrites existing target columns if present

6. SPECIAL NAMING CONVENTIONS:
   - The "pref_tv" vs "pref_TV" inconsistency is maintained from the source
     data
   - DGEN variables use underscores (DGEN_past_5_Pref)
   - Difference variables use descriptive prefixes (NPast_5_, 5.10past_)
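One simple way to enforce the run order in note 1 (a sketch; the file names
are taken from the script titles above, and the working directory is assumed
to contain them):

  scripts <- c("dataP 01 - recode and combine past & future vars.r",
               "dataP 02 - recode present VARS.r",
               "dataP 03 - recode DGEN vars.r",
               "dataP 04 - DGEN means.r",
               "dataP 05 - recode scales VARS.r",
               "dataP 06 - time interval differences.r",
               "dataP 07 - domain means.r")
  for (f in scripts) source(f)   # later scripts read columns created earlier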
================================================================================
ITEM REFERENCE GUIDE
================================================================================

15 Core Items (used across all time periods):

PREFERENCES (5 items):
  1. pref_read - Reading preferences
  2. pref_music - Music preferences
  3. pref_TV/tv - TV watching preferences (note case variation)
  4. pref_nap - Napping preferences
  5. pref_travel - Travel preferences

PERSONALITY (5 items):
  6. pers_extravert - Extraverted personality
  7. pers_critical - Critical thinking personality
  8. pers_dependable - Dependable personality
  9. pers_anxious - Anxious personality
  10. pers_complex - Complex personality

VALUES (5 items):
  11. val_obey - Value of obedience
  12. val_trad - Value of tradition
  13. val_opinion - Value of expressing opinions
  14. val_performance - Value of performance
  15. val_justice - Value of justice


================================================================================
END OF DOCUMENTATION
================================================================================
Last Updated: October 1, 2025
diff --git a/.history/eohi2/README_Variable_Creation_20251001155104.txt b/.history/eohi2/README_Variable_Creation_20251001155104.txt
deleted file mode 100644
index 1cc72e7..0000000
--- a/.history/eohi2/README_Variable_Creation_20251001155104.txt
+++ /dev/null
@@ -1,512 +0,0 @@
diff --git a/.history/eohi2/README_Variable_Creation_20251001155126.txt b/.history/eohi2/README_Variable_Creation_20251001155126.txt
deleted file mode 100644
index 1cc72e7..0000000
--- a/.history/eohi2/README_Variable_Creation_20251001155126.txt
+++ /dev/null
@@ -1,512 +0,0 @@
diff --git a/.history/eohi2/README_Variable_Creation_20251008114335.txt b/.history/eohi2/README_Variable_Creation_20251008114335.txt
deleted file mode 100644
index e6fee3a..0000000
--- a/.history/eohi2/README_Variable_Creation_20251008114335.txt
+++ /dev/null
@@ -1,650 +0,0 @@
- -VARIABLES CREATED: 18 total (6 time intervals × 3 domains) - -SOURCE COLUMNS: - - NPast_5_pref_read through NPast_5_val_justice (15 columns) - - NPast_10_pref_read through NPast_10_val_justice (15 columns) - - NFut_5_pref_read through NFut_5_val_justice (15 columns) - - NFut_10_pref_read through NFut_10_val_justice (15 columns) - - 5.10past_pref_read through 5.10past_val_justice (15 columns) - - 5.10fut_pref_read through 5.10fut_val_justice (15 columns) - Total: 90 difference columns (created in Script 06) - -TARGET VARIABLES: - NPast_5 Domain Means (3 variables): - - NPast_5_pref_MEAN (mean of 5 preference items) - - NPast_5_pers_MEAN (mean of 5 personality items) - - NPast_5_val_MEAN (mean of 5 values items) - - NPast_10 Domain Means (3 variables): - - NPast_10_pref_MEAN - - NPast_10_pers_MEAN - - NPast_10_val_MEAN - - NFut_5 Domain Means (3 variables): - - NFut_5_pref_MEAN - - NFut_5_pers_MEAN - - NFut_5_val_MEAN - - NFut_10 Domain Means (3 variables): - - NFut_10_pref_MEAN - - NFut_10_pers_MEAN - - NFut_10_val_MEAN - - 5.10past Domain Means (3 variables): - - 5.10past_pref_MEAN - - 5.10past_pers_MEAN - - 5.10past_val_MEAN - - 5.10fut Domain Means (3 variables): - - 5.10fut_pref_MEAN - - 5.10fut_pers_MEAN - - 5.10fut_val_MEAN - -TRANSFORMATION LOGIC: - Each domain mean = average of 5 items within that domain - - Example for NPast_5_pref_MEAN: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel) - - Example for NFut_10_pers_MEAN: - = mean(NFut_10_pers_extravert, NFut_10_pers_critical, - NFut_10_pers_dependable, NFut_10_pers_anxious, - NFut_10_pers_complex) - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF DOMAIN MEANS: - - Provides higher-level summary of perceived change by domain - - Reduces item-level noise by aggregating across related items - - Enables domain-level comparisons across time intervals - - Parallel to Script 04 (DGEN means) but for difference scores instead of - raw DGEN ratings - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - Creates domain-level aggregates of absolute difference scores - - All means are averages of non-negative values (absolute differences) - - -================================================================================ -SCRIPT 08: dataP 08 - DGEN 510 vars.r -================================================================================ - -PURPOSE: - Calculates absolute differences between 5-year and 10-year DGEN ratings for - both Past and Future time directions. These variables measure the perceived - difference in domain-general change between the two time intervals. 
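The computation itself is a small loop; a hedged sketch follows (again `df` is an assumption, and this is not the script's exact code):

  # |DGEN_5 - DGEN_10| for each domain within each time direction.
  for (dir in c("past", "fut")) {
    for (dom in c("Pref", "Pers", "Val")) {
      five <- df[[paste0("DGEN_", dir, "_5_", dom)]]
      ten  <- df[[paste0("DGEN_", dir, "_10_", dom)]]
      # Subtraction propagates NA, so a missing 5- or 10-year rating gives NA.
      df[[paste0("X5_10DGEN_", dir, "_", tolower(dom))]] <- abs(five - ten)
    }
  }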
- -VARIABLES CREATED: 6 total (3 domains × 2 time directions) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - Total: 12 DGEN columns (created in Script 03) - -TARGET VARIABLES: - Past Direction (3 variables): - - X5_10DGEN_past_pref (|DGEN_past_5_Pref - DGEN_past_10_Pref|) - - X5_10DGEN_past_pers (|DGEN_past_5_Pers - DGEN_past_10_Pers|) - - X5_10DGEN_past_val (|DGEN_past_5_Val - DGEN_past_10_Val|) - - Future Direction (3 variables): - - X5_10DGEN_fut_pref (|DGEN_fut_5_Pref - DGEN_fut_10_Pref|) - - X5_10DGEN_fut_pers (|DGEN_fut_5_Pers - DGEN_fut_10_Pers|) - - X5_10DGEN_fut_val (|DGEN_fut_5_Val - DGEN_fut_10_Val|) - -TRANSFORMATION LOGIC: - Formula: |DGEN_5 - DGEN_10| - - All calculations use absolute differences: - - Past Preferences: |DGEN_past_5_Pref - DGEN_past_10_Pref| - - Past Personality: |DGEN_past_5_Pers - DGEN_past_10_Pers| - - Past Values: |DGEN_past_5_Val - DGEN_past_10_Val| - - Future Preferences: |DGEN_fut_5_Pref - DGEN_fut_10_Pref| - - Future Personality: |DGEN_fut_5_Pers - DGEN_fut_10_Pers| - - Future Values: |DGEN_fut_5_Val - DGEN_fut_10_Val| - - Result: Always positive values representing magnitude of difference - Missing values in either source column result in NA - -SPECIAL NOTES: - - Variable names use "X" prefix because R automatically adds it to column - names starting with numbers (5_10 becomes X5_10) - - This script depends on Script 03 being run first - - Measures interval effects within time direction (past vs future) - - Parallel to Script 06's 5.10past and 5.10fut variables but for DGEN scores - - -================================================================================ -SCRIPT 09: dataP 09 - interval x direction means.r -================================================================================ - -PURPOSE: - Calculates comprehensive mean scores by averaging item-level differences - across intervals and directions. Creates both narrow-scope means (single - time interval) and broad-scope global means (combining multiple intervals). 
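One narrow-scope and one global-scope mean can be sketched in a few lines of R. Illustrative only: `df` is assumed to hold the 90 Script 06 columns, and the filter guards against accidentally averaging in Script 07's aggregates:

  np5  <- grep("^NPast_5_",  names(df), value = TRUE)
  np5  <- np5[!grepl("MEAN|mean", np5)]    # keep only the 15 item columns
  np10 <- grep("^NPast_10_", names(df), value = TRUE)
  np10 <- np10[!grepl("MEAN|mean", np10)]

  df$NPast_5_mean      <- rowMeans(df[np5], na.rm = TRUE)           # 15 items
  df$NPast_global_mean <- rowMeans(df[c(np5, np10)], na.rm = TRUE)  # 30 items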
-
-VARIABLES CREATED: 11 total (6 narrow-scope + 5 global-scope)
-
-SOURCE COLUMNS:
-  All 90 difference variables created in Script 06:
-  - NPast_5_[domain]_[item] (15 variables)
-  - NPast_10_[domain]_[item] (15 variables)
-  - NFut_5_[domain]_[item] (15 variables)
-  - NFut_10_[domain]_[item] (15 variables)
-  - X5.10past_[domain]_[item] (15 variables)
-  - X5.10fut_[domain]_[item] (15 variables)
-
-TARGET VARIABLES:
-
-  Narrow-Scope Means (15 source items each):
-  - NPast_5_mean (mean across all 15 NPast_5 items)
-  - NPast_10_mean (mean across all 15 NPast_10 items)
-  - NFut_5_mean (mean across all 15 NFut_5 items)
-  - NFut_10_mean (mean across all 15 NFut_10 items)
-  - X5.10past_mean (mean across all 15 X5.10past items)
-  - X5.10fut_mean (mean across all 15 X5.10fut items)
-
-  Global-Scope Means (30 source items each):
-  - NPast_global_mean (NPast_5 + NPast_10: all past intervals)
-  - NFut_global_mean (NFut_5 + NFut_10: all future intervals)
-  - X5.10_global_mean (X5.10past + X5.10fut: all 5-vs-10 intervals)
-  - N5_global_mean (NPast_5 + NFut_5: all 5-year intervals)
-  - N10_global_mean (NPast_10 + NFut_10: all 10-year intervals)
-
-TRANSFORMATION LOGIC:
-
-  Narrow-Scope Means (15 items each):
-    Each mean averages all 15 difference items within one time interval
-
-    Example for NPast_5_mean:
-    = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV,
-           NPast_5_pref_nap, NPast_5_pref_travel,
-           NPast_5_pers_extravert, NPast_5_pers_critical,
-           NPast_5_pers_dependable, NPast_5_pers_anxious,
-           NPast_5_pers_complex,
-           NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion,
-           NPast_5_val_performance, NPast_5_val_justice)
-
-  Global-Scope Means (30 items each):
-    Each mean averages 30 difference items across two related intervals
-
-    Example for NPast_global_mean:
-    = mean(all 15 NPast_5 items + all 15 NPast_10 items)
-    Represents overall perceived change from present to any past timepoint
-
-    Example for N5_global_mean:
-    = mean(all 15 NPast_5 items + all 15 NFut_5 items)
-    Represents overall perceived change at the 5-year interval regardless
-    of direction
-
-  NA values excluded from calculation (na.rm = TRUE)
-
-PURPOSE OF INTERVAL × DIRECTION MEANS:
-  - Narrow-scope means: Single-interval summaries across all domains and items
-  - Global-scope means: Cross-interval summaries for testing:
-    * Direction effects (past vs future)
-    * Interval effects (5-year vs 10-year)
-    * Combined temporal distance effects
-  - Enables comprehensive analysis of temporal self-perception patterns
-  - Reduces item-level and domain-level noise through broad aggregation
-
-QUALITY ASSURANCE:
-  - Script includes automated QA checks for the first 5 rows
-  - Manually recalculates each mean and verifies against stored values
-  - Prints TRUE/FALSE match status for each variable
-  - Ensures calculation accuracy before further analysis
-
-SPECIAL NOTES:
-  - This script depends on Script 06 being run first
-  - All means are averages of absolute difference scores (non-negative)
-  - Global means provide the broadest temporal self-perception summaries
-  - Naming convention uses "global" for 30-item means and no suffix for
-    15-item means
-
-
-================================================================================
-SUMMARY OF ALL CREATED VARIABLES
-================================================================================
-
-Total Variables Created: 219
-
-By Script:
-  - Script 01: 60 variables (past/future recoded items)
-  - Script 02: 15 variables (present recoded items)
-  - Script 03: 12 variables (DGEN domain scores)
-  - Script 04: 4 variables (DGEN means)
-  - Script 05: 3 variables (AOT & CRT scales)
-  - Script 06: 90 variables (time interval differences)
-  - Script 07: 18 variables (domain means for differences)
-  - Script 08: 6 variables (DGEN 5-vs-10 differences)
-  - Script 09: 11 variables (interval × direction means)
-
-By Category:
-  - Time Period Items (75 total):
-    * Present: 15 items
-    * Past 5: 15 items
-    * Past 10: 15 items
-    * Future 5: 15 items
-    * Future 10: 15 items
-
-  - DGEN Variables (22 total):
-    * Domain scores: 12 (3 domains × 4 time periods)
-    * Mean scores: 4 (1 per time period)
-    * 5-vs-10 differences: 6 (3 domains × 2 directions)
-
-  - Cognitive Scales (3 total):
-    * AOT total
-    * CRT correct
-    * CRT intuitive
-
-  - Time Differences (90 total):
-    * NPast_5: 15 differences
-    * NPast_10: 15 differences
-    * NFut_5: 15 differences
-    * NFut_10: 15 differences
-    * 5.10past: 15 differences
-    * 5.10fut: 15 differences
-
-  - Domain Means for Differences (18 total):
-    * NPast_5: 3 domain means
-    * NPast_10: 3 domain means
-    * NFut_5: 3 domain means
-    * NFut_10: 3 domain means
-    * 5.10past: 3 domain means
-    * 5.10fut: 3 domain means
-
-  - Interval × Direction Means (11 total):
-    * Narrow-scope means: 6 (NPast_5, NPast_10, NFut_5, NFut_10,
-      X5.10past, X5.10fut)
-    * Global-scope means: 5 (NPast_global, NFut_global, X5.10_global,
-      N5_global, N10_global)
-
-
-================================================================================
-DATA PROCESSING NOTES
-================================================================================
-
-1. PROCESSING ORDER:
-   Scripts MUST be run in numerical order (01 → 09) as later scripts depend
-   on variables created by earlier scripts.
-
-2. SURVEY VERSION HANDLING:
-   - Two survey versions (01 and 02) were used
-   - Scripts 01 and 03 combine these versions
-   - Preference given to version 01 when both exist
-
-3. MISSING DATA:
-   - Empty cells and NA values are preserved throughout processing
-   - Calculations use na.rm=TRUE to exclude missing values from means
-   - Difference calculations result in NA if either source value is missing
-
-4. QUALITY ASSURANCE:
-   - Each script includes QA checks with random row verification
-   - Manual calculation checks confirm proper transformations
-   - Column existence checks prevent errors from missing source data
-
-5. FILE SAVING:
-   - Most scripts save directly to eohi2.csv
-   - Scripts 04, 06, and 07 have commented-out write commands for review
-   - Scripts 08 and 09 save directly to eohi2.csv
-   - Each script overwrites existing target columns if present
-
-6. SPECIAL NAMING CONVENTIONS:
-   - "pref_tv" vs "pref_TV" inconsistency maintained from source data
-   - DGEN variables use underscores (DGEN_past_5_Pref)
-   - Difference variables use descriptive prefixes (NPast_5_, 5.10past_)
-
-
-================================================================================
-ITEM REFERENCE GUIDE
-================================================================================
-
-15 Core Items (Used across all time periods):
-
-PREFERENCES (5 items):
-  1. pref_read - Reading preferences
-  2. pref_music - Music preferences
-  3. pref_TV/tv - TV watching preferences (note case variation)
-  4. pref_nap - Napping preferences
-  5. pref_travel - Travel preferences
-
-PERSONALITY (5 items):
-  6. pers_extravert - Extraverted personality
-  7. pers_critical - Critical thinking personality
-  8. pers_dependable - Dependable personality
-  9. pers_anxious - Anxious personality
-  10. pers_complex - Complex personality
-
-VALUES (5 items):
-  11. val_obey - Value of obedience
-  12. val_trad - Value of tradition
-  13. val_opinion - Value of expressing opinions
-  14. val_performance - Value of performance
-  15. val_justice - Value of justice
-
-
-================================================================================
-END OF DOCUMENTATION
-================================================================================
-Last Updated: October 1, 2025
-
diff --git a/.history/eohi2/README_Variable_Creation_20251008114354.txt b/.history/eohi2/README_Variable_Creation_20251008114354.txt
deleted file mode 100644
index bd54bb7..0000000
--- a/.history/eohi2/README_Variable_Creation_20251008114354.txt
+++ /dev/null
@@ -1,659 +0,0 @@
-================================================================================
-EOHI2 DATA PROCESSING PIPELINE - VARIABLE CREATION DOCUMENTATION
-================================================================================
-
-This README documents the complete data processing pipeline for eohi2.csv.
-All processing scripts should be run in the order listed below.
-
-Source File: eohi2.csv
-Processing Scripts: dataP 01 through dataP 09
-
-================================================================================
-SCRIPT 01: dataP 01 - recode and combine past & future vars.r
-================================================================================
-
-PURPOSE:
-  Combines responses from two survey versions (01 and 02) and recodes Likert
-  scale text responses to numeric values for past and future time periods.
-
-VARIABLES CREATED: 60 total (15 items × 4 time periods)
-
-SOURCE COLUMNS:
-  - Set A: 01past5PrefItem_1 through 01fut10ValItem_5 (60 columns)
-  - Set B: 02past5PrefItem_1 through 02fut10ValItem_5 (60 columns)
-
-TARGET VARIABLES:
-  Past 5 Years (15 variables):
-  - past_5_pref_read, past_5_pref_music, past_5_pref_TV, past_5_pref_nap,
-    past_5_pref_travel
-  - past_5_pers_extravert, past_5_pers_critical, past_5_pers_dependable,
-    past_5_pers_anxious, past_5_pers_complex
-  - past_5_val_obey, past_5_val_trad, past_5_val_opinion,
-    past_5_val_performance, past_5_val_justice
-
-  Past 10 Years (15 variables):
-  - past_10_pref_read, past_10_pref_music, past_10_pref_TV, past_10_pref_nap,
-    past_10_pref_travel
-  - past_10_pers_extravert, past_10_pers_critical, past_10_pers_dependable,
-    past_10_pers_anxious, past_10_pers_complex
-  - past_10_val_obey, past_10_val_trad, past_10_val_opinion,
-    past_10_val_performance, past_10_val_justice
-
-  Future 5 Years (15 variables):
-  - fut_5_pref_read, fut_5_pref_music, fut_5_pref_TV, fut_5_pref_nap,
-    fut_5_pref_travel
-  - fut_5_pers_extravert, fut_5_pers_critical, fut_5_pers_dependable,
-    fut_5_pers_anxious, fut_5_pers_complex
-  - fut_5_val_obey, fut_5_val_trad, fut_5_val_opinion,
-    fut_5_val_performance, fut_5_val_justice
-
-  Future 10 Years (15 variables):
-  - fut_10_pref_read, fut_10_pref_music, fut_10_pref_TV, fut_10_pref_nap,
-    fut_10_pref_travel
-  - fut_10_pers_extravert, fut_10_pers_critical, fut_10_pers_dependable,
-    fut_10_pers_anxious, fut_10_pers_complex
-  - fut_10_val_obey, fut_10_val_trad, fut_10_val_opinion,
-    fut_10_val_performance, fut_10_val_justice
-
-TRANSFORMATION LOGIC:
-  Step 1: Combine responses from Set A (01) and Set B (02)
-  - If Set A has a value, use Set A
-  - If Set A is empty, use Set B
-
-  Step 2: Recode text responses to numeric values:
-    "Strongly Disagree" → -3
-    "Disagree" → -2
-    "Somewhat Disagree" → -1
-    "Neither Agree nor Disagree" → 0
-    "Somewhat Agree" → 1
-    "Agree" → 2
-    "Strongly Agree" → 3
-    Empty/Missing → NA
-
-ITEM DOMAINS:
-  - Preferences (pref): Reading, Music, TV, Nap, Travel
-  - Personality (pers): Extravert, Critical, Dependable, Anxious, Complex
-  - Values (val):
Obey, Tradition, Opinion, Performance, Justice - - -================================================================================ -SCRIPT 02: dataP 02 - recode present VARS.r -================================================================================ - -PURPOSE: - Recodes present-time Likert scale text responses to numeric values. - -VARIABLES CREATED: 15 total - -SOURCE COLUMNS: - - prePrefItem_1 through prePrefItem_5 (5 columns) - - prePersItem_1 through prePersItem_5 (5 columns) - - preValItem_1 through preValItem_5 (5 columns) - -TARGET VARIABLES: - Present Time (15 variables): - - present_pref_read, present_pref_music, present_pref_tv, present_pref_nap, - present_pref_travel - - present_pers_extravert, present_pers_critical, present_pers_dependable, - present_pers_anxious, present_pers_complex - - present_val_obey, present_val_trad, present_val_opinion, - present_val_performance, present_val_justice - -TRANSFORMATION LOGIC: - Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -SPECIAL NOTE: - Present time uses "present_pref_tv" (lowercase) while past/future use - "past_5_pref_TV" (uppercase). This is intentional and preserved from the - original data structure. - - -================================================================================ -SCRIPT 03: dataP 03 - recode DGEN vars.r -================================================================================ - -PURPOSE: - Combines DGEN (domain general) responses from two survey versions (01 and 02). - These are single-item measures for each domain/time combination. - NO RECODING - just copies numeric values as-is. - -VARIABLES CREATED: 12 total (3 domains × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefDGEN_1, 01past5PersDGEN_1, 01past5ValDGEN_1, etc. - - Set B: 02past5PrefDGEN_1, 02past5PersDGEN_1, 02past5ValDGEN_1, etc. - -TARGET VARIABLES: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TRANSFORMATION LOGIC: - - If Set A (01) has a value, use Set A - - If Set A is empty, use Set B (02) - - NO RECODING: Values are copied directly as numeric - -SPECIAL NOTES: - - Future columns in raw data use "_8" suffix for Pref/Pers items - - Future Val columns use "ValuesDGEN" spelling in Set A, "ValDGEN" in Set B - - -================================================================================ -SCRIPT 04: dataP 04 - DGEN means.r -================================================================================ - -PURPOSE: - Calculates mean DGEN scores by averaging the three domain scores (Preferences, - Personality, Values) for each time period. 
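A sketch of this step in R follows (illustrative, not the script's actual code; `df` is assumed to contain the 12 DGEN columns from Script 03):

  for (period in c("past_5", "past_10", "fut_5", "fut_10")) {
    cols <- paste0("DGEN_", period, "_", c("Pref", "Pers", "Val"))
    # na.rm = TRUE: the mean is still computed if one domain score is missing.
    df[[paste0("DGEN_", period, "_mean")]] <- rowMeans(df[cols], na.rm = TRUE)
  }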
- -VARIABLES CREATED: 4 total (1 per time period) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - - DGEN_past_5_mean - - DGEN_past_10_mean - - DGEN_fut_5_mean - - DGEN_fut_10_mean - -TRANSFORMATION LOGIC: - Each mean = (Pref + Pers + Val) / 3 - - NA values are excluded from calculation (na.rm = TRUE) - - -================================================================================ -SCRIPT 05: dataP 05 - recode scales VARS.r -================================================================================ - -PURPOSE: - Processes two cognitive scales: - 1. AOT (Actively Open-minded Thinking): 8-item scale with reverse coding - 2. CRT (Cognitive Reflection Test): 3-item test with correct/intuitive scoring - -VARIABLES CREATED: 3 total - -SOURCE COLUMNS: - AOT Scale: - - aot_1, aot_2, aot_3, aot_4, aot_5, aot_6, aot_7, aot_8 - - CRT Test: - - crt_1, crt_2, crt_3 - -TARGET VARIABLES: - - aot_total (mean of 8 items with reverse coding) - - crt_correct (proportion of correct answers) - - crt_int (proportion of intuitive/incorrect answers) - -TRANSFORMATION LOGIC: - - AOT Scale (aot_total): - 1. Items 4, 5, 6, 7 are reverse coded by multiplying by -1 - 2. Calculate mean of all 8 items (with reverse coding applied) - 3. Original source values are NOT modified in the dataframe - 4. NA values excluded from calculation (na.rm = TRUE) - - CRT Correct (crt_correct): - Correct answers: - - crt_1: "5 cents" - - crt_2: "5 minutes" - - crt_3: "47 days" - Calculation: (Number of correct answers) / (Number of non-missing answers) - - CRT Intuitive (crt_int): - Intuitive (common incorrect) answers: - - crt_1: "10 cents" - - crt_2: "100 minutes" - - crt_3: "24 days" - Calculation: (Number of intuitive answers) / (Number of non-missing answers) - -SPECIAL NOTES: - - CRT scoring is case-insensitive and trims whitespace - - Both CRT scores are proportions (0.00 to 1.00) - - Empty/missing CRT responses are excluded from denominator - - -================================================================================ -SCRIPT 06: dataP 06 - time interval differences.r -================================================================================ - -PURPOSE: - Calculates absolute differences between time intervals to measure perceived - change across time periods for all 15 items. 
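A hedged sketch of one of the six difference types (not the script's actual code; `df` is assumed, and note the bridge for the lowercase present-time "pref_tv" spelling):

  items <- c("pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel",
             "pers_extravert", "pers_critical", "pers_dependable",
             "pers_anxious", "pers_complex",
             "val_obey", "val_trad", "val_opinion", "val_performance",
             "val_justice")
  present_name <- function(item) {        # present time uses lowercase "tv"
    if (item == "pref_TV") "present_pref_tv" else paste0("present_", item)
  }
  for (item in items) {                   # NPast_5 shown; the other five
    df[[paste0("NPast_5_", item)]] <-     # difference types follow the
      abs(df[[present_name(item)]] -      # same pattern
          df[[paste0("past_5_", item)]])
  }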
- -VARIABLES CREATED: 90 total (6 difference types × 15 items) - -SOURCE COLUMNS: - - present_pref_read through present_val_justice (15 columns) - - past_5_pref_read through past_5_val_justice (15 columns) - - past_10_pref_read through past_10_val_justice (15 columns) - - fut_5_pref_read through fut_5_val_justice (15 columns) - - fut_10_pref_read through fut_10_val_justice (15 columns) - -TARGET VARIABLES (by difference type): - - NPast_5 (Present vs Past 5 years) - 15 variables: - Formula: |present - past_5| - - NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, NPast_5_pref_nap, - NPast_5_pref_travel - - NPast_5_pers_extravert, NPast_5_pers_critical, NPast_5_pers_dependable, - NPast_5_pers_anxious, NPast_5_pers_complex - - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice - - NPast_10 (Present vs Past 10 years) - 15 variables: - Formula: |present - past_10| - - NPast_10_pref_read, NPast_10_pref_music, NPast_10_pref_TV, - NPast_10_pref_nap, NPast_10_pref_travel - - NPast_10_pers_extravert, NPast_10_pers_critical, NPast_10_pers_dependable, - NPast_10_pers_anxious, NPast_10_pers_complex - - NPast_10_val_obey, NPast_10_val_trad, NPast_10_val_opinion, - NPast_10_val_performance, NPast_10_val_justice - - NFut_5 (Present vs Future 5 years) - 15 variables: - Formula: |present - fut_5| - - NFut_5_pref_read, NFut_5_pref_music, NFut_5_pref_TV, NFut_5_pref_nap, - NFut_5_pref_travel - - NFut_5_pers_extravert, NFut_5_pers_critical, NFut_5_pers_dependable, - NFut_5_pers_anxious, NFut_5_pers_complex - - NFut_5_val_obey, NFut_5_val_trad, NFut_5_val_opinion, - NFut_5_val_performance, NFut_5_val_justice - - NFut_10 (Present vs Future 10 years) - 15 variables: - Formula: |present - fut_10| - - NFut_10_pref_read, NFut_10_pref_music, NFut_10_pref_TV, NFut_10_pref_nap, - NFut_10_pref_travel - - NFut_10_pers_extravert, NFut_10_pers_critical, NFut_10_pers_dependable, - NFut_10_pers_anxious, NFut_10_pers_complex - - NFut_10_val_obey, NFut_10_val_trad, NFut_10_val_opinion, - NFut_10_val_performance, NFut_10_val_justice - - 5.10past (Past 5 vs Past 10 years) - 15 variables: - Formula: |past_5 - past_10| - - 5.10past_pref_read, 5.10past_pref_music, 5.10past_pref_TV, - 5.10past_pref_nap, 5.10past_pref_travel - - 5.10past_pers_extravert, 5.10past_pers_critical, 5.10past_pers_dependable, - 5.10past_pers_anxious, 5.10past_pers_complex - - 5.10past_val_obey, 5.10past_val_trad, 5.10past_val_opinion, - 5.10past_val_performance, 5.10past_val_justice - - 5.10fut (Future 5 vs Future 10 years) - 15 variables: - Formula: |fut_5 - fut_10| - - 5.10fut_pref_read, 5.10fut_pref_music, 5.10fut_pref_TV, 5.10fut_pref_nap, - 5.10fut_pref_travel - - 5.10fut_pers_extravert, 5.10fut_pers_critical, 5.10fut_pers_dependable, - 5.10fut_pers_anxious, 5.10fut_pers_complex - - 5.10fut_val_obey, 5.10fut_val_trad, 5.10fut_val_opinion, - 5.10fut_val_performance, 5.10fut_val_justice - -TRANSFORMATION LOGIC: - All calculations use absolute differences: - - NPast_5: |present_[item] - past_5_[item]| - - NPast_10: |present_[item] - past_10_[item]| - - NFut_5: |present_[item] - fut_5_[item]| - - NFut_10: |present_[item] - fut_10_[item]| - - 5.10past: |past_5_[item] - past_10_[item]| - - 5.10fut: |fut_5_[item] - fut_10_[item]| - - Result: Always positive values representing magnitude of change - Missing values in either source column result in NA - -SPECIAL NOTES: - - Present time uses "pref_tv" (lowercase) while past/future use "pref_TV" - (uppercase), so script handles this naming inconsistency - 
- All values are absolute differences (non-negative) - - -================================================================================ -SCRIPT 07: dataP 07 - domain means.r -================================================================================ - -PURPOSE: - Calculates domain-level means by averaging the 5 items within each domain - (Preferences, Personality, Values) for each of the 6 time interval difference - types. - -VARIABLES CREATED: 18 total (6 time intervals × 3 domains) - -SOURCE COLUMNS: - - NPast_5_pref_read through NPast_5_val_justice (15 columns) - - NPast_10_pref_read through NPast_10_val_justice (15 columns) - - NFut_5_pref_read through NFut_5_val_justice (15 columns) - - NFut_10_pref_read through NFut_10_val_justice (15 columns) - - 5.10past_pref_read through 5.10past_val_justice (15 columns) - - 5.10fut_pref_read through 5.10fut_val_justice (15 columns) - Total: 90 difference columns (created in Script 06) - -TARGET VARIABLES: - NPast_5 Domain Means (3 variables): - - NPast_5_pref_MEAN (mean of 5 preference items) - - NPast_5_pers_MEAN (mean of 5 personality items) - - NPast_5_val_MEAN (mean of 5 values items) - - NPast_10 Domain Means (3 variables): - - NPast_10_pref_MEAN - - NPast_10_pers_MEAN - - NPast_10_val_MEAN - - NFut_5 Domain Means (3 variables): - - NFut_5_pref_MEAN - - NFut_5_pers_MEAN - - NFut_5_val_MEAN - - NFut_10 Domain Means (3 variables): - - NFut_10_pref_MEAN - - NFut_10_pers_MEAN - - NFut_10_val_MEAN - - 5.10past Domain Means (3 variables): - - 5.10past_pref_MEAN - - 5.10past_pers_MEAN - - 5.10past_val_MEAN - - 5.10fut Domain Means (3 variables): - - 5.10fut_pref_MEAN - - 5.10fut_pers_MEAN - - 5.10fut_val_MEAN - -TRANSFORMATION LOGIC: - Each domain mean = average of 5 items within that domain - - Example for NPast_5_pref_MEAN: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel) - - Example for NFut_10_pers_MEAN: - = mean(NFut_10_pers_extravert, NFut_10_pers_critical, - NFut_10_pers_dependable, NFut_10_pers_anxious, - NFut_10_pers_complex) - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF DOMAIN MEANS: - - Provides higher-level summary of perceived change by domain - - Reduces item-level noise by aggregating across related items - - Enables domain-level comparisons across time intervals - - Parallel to Script 04 (DGEN means) but for difference scores instead of - raw DGEN ratings - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - Creates domain-level aggregates of absolute difference scores - - All means are averages of non-negative values (absolute differences) - - -================================================================================ -SCRIPT 08: dataP 08 - DGEN 510 vars.r -================================================================================ - -PURPOSE: - Calculates absolute differences between 5-year and 10-year DGEN ratings for - both Past and Future time directions. These variables measure the perceived - difference in domain-general change between the two time intervals. 
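The "X" prefix noted below is ordinary R name repair, not a convention invented by the script; a quick demonstration:

  # R column names may not start with a digit, so they are repaired on import:
  make.names("5_10DGEN_past_pref")   # "X5_10DGEN_past_pref"
  make.names("5.10past_pref_read")   # "X5.10past_pref_read"
  # read.csv() applies the same repair by default (check.names = TRUE).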
- -VARIABLES CREATED: 6 total (3 domains × 2 time directions) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - Total: 12 DGEN columns (created in Script 03) - -TARGET VARIABLES: - Past Direction (3 variables): - - X5_10DGEN_past_pref (|DGEN_past_5_Pref - DGEN_past_10_Pref|) - - X5_10DGEN_past_pers (|DGEN_past_5_Pers - DGEN_past_10_Pers|) - - X5_10DGEN_past_val (|DGEN_past_5_Val - DGEN_past_10_Val|) - - Future Direction (3 variables): - - X5_10DGEN_fut_pref (|DGEN_fut_5_Pref - DGEN_fut_10_Pref|) - - X5_10DGEN_fut_pers (|DGEN_fut_5_Pers - DGEN_fut_10_Pers|) - - X5_10DGEN_fut_val (|DGEN_fut_5_Val - DGEN_fut_10_Val|) - -TRANSFORMATION LOGIC: - Formula: |DGEN_5 - DGEN_10| - - All calculations use absolute differences: - - Past Preferences: |DGEN_past_5_Pref - DGEN_past_10_Pref| - - Past Personality: |DGEN_past_5_Pers - DGEN_past_10_Pers| - - Past Values: |DGEN_past_5_Val - DGEN_past_10_Val| - - Future Preferences: |DGEN_fut_5_Pref - DGEN_fut_10_Pref| - - Future Personality: |DGEN_fut_5_Pers - DGEN_fut_10_Pers| - - Future Values: |DGEN_fut_5_Val - DGEN_fut_10_Val| - - Result: Always positive values representing magnitude of difference - Missing values in either source column result in NA - -SPECIAL NOTES: - - Variable names use "X" prefix because R automatically adds it to column - names starting with numbers (5_10 becomes X5_10) - - This script depends on Script 03 being run first - - Measures interval effects within time direction (past vs future) - - Parallel to Script 06's 5.10past and 5.10fut variables but for DGEN scores - - -================================================================================ -SCRIPT 09: dataP 09 - interval x direction means.r -================================================================================ - -PURPOSE: - Calculates comprehensive mean scores by averaging item-level differences - across intervals and directions. Creates both narrow-scope means (single - time interval) and broad-scope global means (combining multiple intervals). 
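The QA idea described below can be sketched in a few lines of R (illustrative; `df` and an already-stored NPast_5_mean column are assumptions):

  # Recompute the stored mean by hand for the first 5 rows and compare.
  np5 <- grep("^NPast_5_", names(df), value = TRUE)
  np5 <- np5[!grepl("MEAN|mean", np5)]     # the 15 item-level columns only
  manual <- rowMeans(df[1:5, np5], na.rm = TRUE)
  print(isTRUE(all.equal(unname(manual), df$NPast_5_mean[1:5])))  # expect TRUE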
- -VARIABLES CREATED: 11 total (6 narrow-scope + 5 global-scope) - -SOURCE COLUMNS: - All 90 difference variables created in Script 06: - - NPast_5_[domain]_[item] (15 variables) - - NPast_10_[domain]_[item] (15 variables) - - NFut_5_[domain]_[item] (15 variables) - - NFut_10_[domain]_[item] (15 variables) - - X5.10past_[domain]_[item] (15 variables) - - X5.10fut_[domain]_[item] (15 variables) - -TARGET VARIABLES: - - Narrow-Scope Means (15 source items each): - - NPast_5_mean (mean across all 15 NPast_5 items) - - NPast_10_mean (mean across all 15 NPast_10 items) - - NFut_5_mean (mean across all 15 NFut_5 items) - - NFut_10_mean (mean across all 15 NFut_10 items) - - X5.10past_mean (mean across all 15 X5.10past items) - - X5.10fut_mean (mean across all 15 X5.10fut items) - - Global-Scope Means (30 source items each): - - NPast_global_mean (NPast_5 + NPast_10: all past intervals) - - NFut_global_mean (NFut_5 + NFut_10: all future intervals) - - X5.10_global_mean (X5.10past + X5.10fut: all 5-vs-10 intervals) - - N5_global_mean (NPast_5 + NFut_5: all 5-year intervals) - - N10_global_mean (NPast_10 + NFut_10: all 10-year intervals) - -TRANSFORMATION LOGIC: - - Narrow-Scope Means (15 items each): - Each mean averages all 15 difference items within one time interval - - Example for NPast_5_mean: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel, - NPast_5_pers_extravert, NPast_5_pers_critical, - NPast_5_pers_dependable, NPast_5_pers_anxious, - NPast_5_pers_complex, - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice) - - Global-Scope Means (30 items each): - Each mean averages 30 difference items across two related intervals - - Example for NPast_global_mean: - = mean(all 15 NPast_5 items + all 15 NPast_10 items) - Represents overall perceived change from present to any past timepoint - - Example for N5_global_mean: - = mean(all 15 NPast_5 items + all 15 NFut_5 items) - Represents overall perceived change at 5-year interval regardless of - direction - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF INTERVAL × DIRECTION MEANS: - - Narrow-scope means: Single-interval summaries across all domains and items - - Global-scope means: Cross-interval summaries for testing: - * Direction effects (past vs future) - * Interval effects (5-year vs 10-year) - * Combined temporal distance effects - - Enables comprehensive analysis of temporal self-perception patterns - - Reduces item-level and domain-level noise through broad aggregation - -QUALITY ASSURANCE: - - Script includes automated QA checks for first 5 rows - - Manually recalculates each mean and verifies against stored values - - Prints TRUE/FALSE match status for each variable - - Ensures calculation accuracy before further analysis - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - All means are averages of absolute difference scores (non-negative) - - Global means provide the broadest temporal self-perception summaries - - Naming convention uses "global" for 30-item means, no suffix for 15-item - - -================================================================================ -SUMMARY OF ALL CREATED VARIABLES -================================================================================ - -Total Variables Created: 219 - -By Script: - - Script 01: 60 variables (past/future recoded items) - - Script 02: 15 variables (present recoded items) - - Script 03: 12 variables (DGEN domain scores) - - 
Script 04: 4 variables (DGEN means)
-  - Script 05: 3 variables (AOT & CRT scales)
-  - Script 06: 90 variables (time interval differences)
-  - Script 07: 18 variables (domain means for differences)
-  - Script 08: 6 variables (DGEN 5-vs-10 differences)
-  - Script 09: 11 variables (interval × direction means)
-
-By Category:
-  - Time Period Items (75 total):
-    * Present: 15 items
-    * Past 5: 15 items
-    * Past 10: 15 items
-    * Future 5: 15 items
-    * Future 10: 15 items
-
-  - DGEN Variables (22 total):
-    * Domain scores: 12 (3 domains × 4 time periods)
-    * Mean scores: 4 (1 per time period)
-    * 5-vs-10 differences: 6 (3 domains × 2 directions)
-
-  - Cognitive Scales (3 total):
-    * AOT total
-    * CRT correct
-    * CRT intuitive
-
-  - Time Differences (90 total):
-    * NPast_5: 15 differences
-    * NPast_10: 15 differences
-    * NFut_5: 15 differences
-    * NFut_10: 15 differences
-    * 5.10past: 15 differences
-    * 5.10fut: 15 differences
-
-  - Domain Means for Differences (18 total):
-    * NPast_5: 3 domain means
-    * NPast_10: 3 domain means
-    * NFut_5: 3 domain means
-    * NFut_10: 3 domain means
-    * 5.10past: 3 domain means
-    * 5.10fut: 3 domain means
-
-  - Interval × Direction Means (11 total):
-    * Narrow-scope means: 6 (NPast_5, NPast_10, NFut_5, NFut_10,
-      X5.10past, X5.10fut)
-    * Global-scope means: 5 (NPast_global, NFut_global, X5.10_global,
-      N5_global, N10_global)
-
-
-================================================================================
-DATA PROCESSING NOTES
-================================================================================
-
-1. PROCESSING ORDER:
-   Scripts MUST be run in numerical order (01 → 09) as later scripts depend
-   on variables created by earlier scripts.
-
-2. SURVEY VERSION HANDLING:
-   - Two survey versions (01 and 02) were used
-   - Scripts 01 and 03 combine these versions
-   - Preference given to version 01 when both exist
-
-3. MISSING DATA:
-   - Empty cells and NA values are preserved throughout processing
-   - Calculations use na.rm=TRUE to exclude missing values from means
-   - Difference calculations result in NA if either source value is missing
-
-4. QUALITY ASSURANCE:
-   - Each script includes QA checks with random row verification
-   - Manual calculation checks confirm proper transformations
-   - Column existence checks prevent errors from missing source data
-
-5. FILE SAVING:
-   - Most scripts save directly to eohi2.csv
-   - Scripts 04, 06, and 07 have commented-out write commands for review
-   - Scripts 08 and 09 save directly to eohi2.csv
-   - Each script overwrites existing target columns if present
-
-6. SPECIAL NAMING CONVENTIONS:
-   - "pref_tv" vs "pref_TV" inconsistency maintained from source data
-   - DGEN variables use underscores (DGEN_past_5_Pref)
-   - Difference variables use descriptive prefixes (NPast_5_, 5.10past_)
-
-
-================================================================================
-ITEM REFERENCE GUIDE
-================================================================================
-
-15 Core Items (Used across all time periods):
-
-PREFERENCES (5 items):
-  1. pref_read - Reading preferences
-  2. pref_music - Music preferences
-  3. pref_TV/tv - TV watching preferences (note case variation)
-  4. pref_nap - Napping preferences
-  5. pref_travel - Travel preferences
-
-PERSONALITY (5 items):
-  6. pers_extravert - Extraverted personality
-  7. pers_critical - Critical thinking personality
-  8. pers_dependable - Dependable personality
-  9. pers_anxious - Anxious personality
-  10. pers_complex - Complex personality
-
-VALUES (5 items):
-  11. val_obey - Value of obedience
-  12. val_trad - Value of tradition
-  13. val_opinion - Value of expressing opinions
-  14. val_performance - Value of performance
-  15. val_justice - Value of justice
-
-
-================================================================================
-END OF DOCUMENTATION
-================================================================================
-Last Updated: October 1, 2025
-
diff --git a/.history/eohi2/README_Variable_Creation_20251008114414.txt b/.history/eohi2/README_Variable_Creation_20251008114414.txt
deleted file mode 100644
index 9062b01..0000000
--- a/.history/eohi2/README_Variable_Creation_20251008114414.txt
+++ /dev/null
@@ -1,670 +0,0 @@
-================================================================================
-EOHI2 DATA PROCESSING PIPELINE - VARIABLE CREATION DOCUMENTATION
-================================================================================
-
-This README documents the complete data processing pipeline for eohi2.csv.
-All processing scripts should be run in the order listed below.
-
-Source File: eohi2.csv
-Processing Scripts: dataP 01 through dataP 09
-
-================================================================================
-SCRIPT 01: dataP 01 - recode and combine past & future vars.r
-================================================================================
-
-PURPOSE:
-  Combines responses from two survey versions (01 and 02) and recodes Likert
-  scale text responses to numeric values for past and future time periods.
-
-VARIABLES CREATED: 60 total (15 items × 4 time periods)
-
-SOURCE COLUMNS:
-  - Set A: 01past5PrefItem_1 through 01fut10ValItem_5 (60 columns)
-  - Set B: 02past5PrefItem_1 through 02fut10ValItem_5 (60 columns)
-
-TARGET VARIABLES:
-  Past 5 Years (15 variables):
-  - past_5_pref_read, past_5_pref_music, past_5_pref_TV, past_5_pref_nap,
-    past_5_pref_travel
-  - past_5_pers_extravert, past_5_pers_critical, past_5_pers_dependable,
-    past_5_pers_anxious, past_5_pers_complex
-  - past_5_val_obey, past_5_val_trad, past_5_val_opinion,
-    past_5_val_performance, past_5_val_justice
-
-  Past 10 Years (15 variables):
-  - past_10_pref_read, past_10_pref_music, past_10_pref_TV, past_10_pref_nap,
-    past_10_pref_travel
-  - past_10_pers_extravert, past_10_pers_critical, past_10_pers_dependable,
-    past_10_pers_anxious, past_10_pers_complex
-  - past_10_val_obey, past_10_val_trad, past_10_val_opinion,
-    past_10_val_performance, past_10_val_justice
-
-  Future 5 Years (15 variables):
-  - fut_5_pref_read, fut_5_pref_music, fut_5_pref_TV, fut_5_pref_nap,
-    fut_5_pref_travel
-  - fut_5_pers_extravert, fut_5_pers_critical, fut_5_pers_dependable,
-    fut_5_pers_anxious, fut_5_pers_complex
-  - fut_5_val_obey, fut_5_val_trad, fut_5_val_opinion,
-    fut_5_val_performance, fut_5_val_justice
-
-  Future 10 Years (15 variables):
-  - fut_10_pref_read, fut_10_pref_music, fut_10_pref_TV, fut_10_pref_nap,
-    fut_10_pref_travel
-  - fut_10_pers_extravert, fut_10_pers_critical, fut_10_pers_dependable,
-    fut_10_pers_anxious, fut_10_pers_complex
-  - fut_10_val_obey, fut_10_val_trad, fut_10_val_opinion,
-    fut_10_val_performance, fut_10_val_justice
-
-TRANSFORMATION LOGIC:
-  Step 1: Combine responses from Set A (01) and Set B (02)
-  - If Set A has a value, use Set A
-  - If Set A is empty, use Set B
-
-  Step 2: Recode text responses to numeric values:
-    "Strongly Disagree" → -3
-    "Disagree" → -2
-    "Somewhat Disagree" → -1
-    "Neither Agree nor Disagree" → 0
-    "Somewhat Agree" → 1
-    "Agree" → 2
-    "Strongly Agree" → 3
-    Empty/Missing → NA
-
-ITEM DOMAINS:
-  - Preferences
(pref): Reading, Music, TV, Nap, Travel - - Personality (pers): Extravert, Critical, Dependable, Anxious, Complex - - Values (val): Obey, Tradition, Opinion, Performance, Justice - - -================================================================================ -SCRIPT 02: dataP 02 - recode present VARS.r -================================================================================ - -PURPOSE: - Recodes present-time Likert scale text responses to numeric values. - -VARIABLES CREATED: 15 total - -SOURCE COLUMNS: - - prePrefItem_1 through prePrefItem_5 (5 columns) - - prePersItem_1 through prePersItem_5 (5 columns) - - preValItem_1 through preValItem_5 (5 columns) - -TARGET VARIABLES: - Present Time (15 variables): - - present_pref_read, present_pref_music, present_pref_tv, present_pref_nap, - present_pref_travel - - present_pers_extravert, present_pers_critical, present_pers_dependable, - present_pers_anxious, present_pers_complex - - present_val_obey, present_val_trad, present_val_opinion, - present_val_performance, present_val_justice - -TRANSFORMATION LOGIC: - Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -SPECIAL NOTE: - Present time uses "present_pref_tv" (lowercase) while past/future use - "past_5_pref_TV" (uppercase). This is intentional and preserved from the - original data structure. - - -================================================================================ -SCRIPT 03: dataP 03 - recode DGEN vars.r -================================================================================ - -PURPOSE: - Combines DGEN (domain general) responses from two survey versions (01 and 02). - These are single-item measures for each domain/time combination. - NO RECODING - just copies numeric values as-is. - -VARIABLES CREATED: 12 total (3 domains × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefDGEN_1, 01past5PersDGEN_1, 01past5ValDGEN_1, etc. - - Set B: 02past5PrefDGEN_1, 02past5PersDGEN_1, 02past5ValDGEN_1, etc. - -TARGET VARIABLES: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TRANSFORMATION LOGIC: - - If Set A (01) has a value, use Set A - - If Set A is empty, use Set B (02) - - NO RECODING: Values are copied directly as numeric - -SPECIAL NOTES: - - Future columns in raw data use "_8" suffix for Pref/Pers items - - Future Val columns use "ValuesDGEN" spelling in Set A, "ValDGEN" in Set B - - -================================================================================ -SCRIPT 04: dataP 04 - DGEN means.r -================================================================================ - -PURPOSE: - Calculates mean DGEN scores by averaging the three domain scores (Preferences, - Personality, Values) for each time period. 
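Scripts 01 and 03 above share the same combine rule (keep the version 01 answer, fall back to version 02). A minimal sketch of that rule; `df` and the post-import column names are assumptions (read.csv's name repair prefixes raw names like 01past5PrefDGEN_1 with "X"):

  combine_versions <- function(a, b) {
    ifelse(is.na(a) | a == "", b, a)   # Set A wins unless missing/empty
  }
  df$DGEN_past_5_Pref <- combine_versions(df[["X01past5PrefDGEN_1"]],
                                          df[["X02past5PrefDGEN_1"]])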
- -VARIABLES CREATED: 4 total (1 per time period) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - - DGEN_past_5_mean - - DGEN_past_10_mean - - DGEN_fut_5_mean - - DGEN_fut_10_mean - -TRANSFORMATION LOGIC: - Each mean = (Pref + Pers + Val) / 3 - - NA values are excluded from calculation (na.rm = TRUE) - - -================================================================================ -SCRIPT 05: dataP 05 - recode scales VARS.r -================================================================================ - -PURPOSE: - Processes two cognitive scales: - 1. AOT (Actively Open-minded Thinking): 8-item scale with reverse coding - 2. CRT (Cognitive Reflection Test): 3-item test with correct/intuitive scoring - -VARIABLES CREATED: 3 total - -SOURCE COLUMNS: - AOT Scale: - - aot_1, aot_2, aot_3, aot_4, aot_5, aot_6, aot_7, aot_8 - - CRT Test: - - crt_1, crt_2, crt_3 - -TARGET VARIABLES: - - aot_total (mean of 8 items with reverse coding) - - crt_correct (proportion of correct answers) - - crt_int (proportion of intuitive/incorrect answers) - -TRANSFORMATION LOGIC: - - AOT Scale (aot_total): - 1. Items 4, 5, 6, 7 are reverse coded by multiplying by -1 - 2. Calculate mean of all 8 items (with reverse coding applied) - 3. Original source values are NOT modified in the dataframe - 4. NA values excluded from calculation (na.rm = TRUE) - - CRT Correct (crt_correct): - Correct answers: - - crt_1: "5 cents" - - crt_2: "5 minutes" - - crt_3: "47 days" - Calculation: (Number of correct answers) / (Number of non-missing answers) - - CRT Intuitive (crt_int): - Intuitive (common incorrect) answers: - - crt_1: "10 cents" - - crt_2: "100 minutes" - - crt_3: "24 days" - Calculation: (Number of intuitive answers) / (Number of non-missing answers) - -SPECIAL NOTES: - - CRT scoring is case-insensitive and trims whitespace - - Both CRT scores are proportions (0.00 to 1.00) - - Empty/missing CRT responses are excluded from denominator - - -================================================================================ -SCRIPT 06: dataP 06 - time interval differences.r -================================================================================ - -PURPOSE: - Calculates absolute differences between time intervals to measure perceived - change across time periods for all 15 items. 
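Script 05's CRT scoring above is easy to get subtly wrong (case, whitespace, missing answers), so a hedged sketch of the scoring rule may help. Illustrative only; `df` and the raw crt_* columns are assumed:

  key_correct   <- c(crt_1 = "5 cents",  crt_2 = "5 minutes",   crt_3 = "47 days")
  key_intuitive <- c(crt_1 = "10 cents", crt_2 = "100 minutes", crt_3 = "24 days")

  score_crt <- function(df, key) {
    n_hit <- 0; n_ans <- 0
    for (col in names(key)) {
      ans <- tolower(trimws(as.character(df[[col]])))
      ans[which(ans == "")] <- NA                  # empty string = missing
      n_hit <- n_hit + (!is.na(ans) & ans == tolower(key[[col]]))
      n_ans <- n_ans + !is.na(ans)
    }
    n_hit / n_ans   # NaN if a respondent answered none of the three
  }
  df$crt_correct <- score_crt(df, key_correct)
  df$crt_int     <- score_crt(df, key_intuitive)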
- -VARIABLES CREATED: 90 total (6 difference types × 15 items) - -SOURCE COLUMNS: - - present_pref_read through present_val_justice (15 columns) - - past_5_pref_read through past_5_val_justice (15 columns) - - past_10_pref_read through past_10_val_justice (15 columns) - - fut_5_pref_read through fut_5_val_justice (15 columns) - - fut_10_pref_read through fut_10_val_justice (15 columns) - -TARGET VARIABLES (by difference type): - - NPast_5 (Present vs Past 5 years) - 15 variables: - Formula: |present - past_5| - - NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, NPast_5_pref_nap, - NPast_5_pref_travel - - NPast_5_pers_extravert, NPast_5_pers_critical, NPast_5_pers_dependable, - NPast_5_pers_anxious, NPast_5_pers_complex - - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice - - NPast_10 (Present vs Past 10 years) - 15 variables: - Formula: |present - past_10| - - NPast_10_pref_read, NPast_10_pref_music, NPast_10_pref_TV, - NPast_10_pref_nap, NPast_10_pref_travel - - NPast_10_pers_extravert, NPast_10_pers_critical, NPast_10_pers_dependable, - NPast_10_pers_anxious, NPast_10_pers_complex - - NPast_10_val_obey, NPast_10_val_trad, NPast_10_val_opinion, - NPast_10_val_performance, NPast_10_val_justice - - NFut_5 (Present vs Future 5 years) - 15 variables: - Formula: |present - fut_5| - - NFut_5_pref_read, NFut_5_pref_music, NFut_5_pref_TV, NFut_5_pref_nap, - NFut_5_pref_travel - - NFut_5_pers_extravert, NFut_5_pers_critical, NFut_5_pers_dependable, - NFut_5_pers_anxious, NFut_5_pers_complex - - NFut_5_val_obey, NFut_5_val_trad, NFut_5_val_opinion, - NFut_5_val_performance, NFut_5_val_justice - - NFut_10 (Present vs Future 10 years) - 15 variables: - Formula: |present - fut_10| - - NFut_10_pref_read, NFut_10_pref_music, NFut_10_pref_TV, NFut_10_pref_nap, - NFut_10_pref_travel - - NFut_10_pers_extravert, NFut_10_pers_critical, NFut_10_pers_dependable, - NFut_10_pers_anxious, NFut_10_pers_complex - - NFut_10_val_obey, NFut_10_val_trad, NFut_10_val_opinion, - NFut_10_val_performance, NFut_10_val_justice - - 5.10past (Past 5 vs Past 10 years) - 15 variables: - Formula: |past_5 - past_10| - - 5.10past_pref_read, 5.10past_pref_music, 5.10past_pref_TV, - 5.10past_pref_nap, 5.10past_pref_travel - - 5.10past_pers_extravert, 5.10past_pers_critical, 5.10past_pers_dependable, - 5.10past_pers_anxious, 5.10past_pers_complex - - 5.10past_val_obey, 5.10past_val_trad, 5.10past_val_opinion, - 5.10past_val_performance, 5.10past_val_justice - - 5.10fut (Future 5 vs Future 10 years) - 15 variables: - Formula: |fut_5 - fut_10| - - 5.10fut_pref_read, 5.10fut_pref_music, 5.10fut_pref_TV, 5.10fut_pref_nap, - 5.10fut_pref_travel - - 5.10fut_pers_extravert, 5.10fut_pers_critical, 5.10fut_pers_dependable, - 5.10fut_pers_anxious, 5.10fut_pers_complex - - 5.10fut_val_obey, 5.10fut_val_trad, 5.10fut_val_opinion, - 5.10fut_val_performance, 5.10fut_val_justice - -TRANSFORMATION LOGIC: - All calculations use absolute differences: - - NPast_5: |present_[item] - past_5_[item]| - - NPast_10: |present_[item] - past_10_[item]| - - NFut_5: |present_[item] - fut_5_[item]| - - NFut_10: |present_[item] - fut_10_[item]| - - 5.10past: |past_5_[item] - past_10_[item]| - - 5.10fut: |fut_5_[item] - fut_10_[item]| - - Result: Always positive values representing magnitude of change - Missing values in either source column result in NA - -SPECIAL NOTES: - - Present time uses "pref_tv" (lowercase) while past/future use "pref_TV" - (uppercase), so script handles this naming inconsistency - 
- All values are absolute differences (non-negative) - - -================================================================================ -SCRIPT 07: dataP 07 - domain means.r -================================================================================ - -PURPOSE: - Calculates domain-level means by averaging the 5 items within each domain - (Preferences, Personality, Values) for each of the 6 time interval difference - types. - -VARIABLES CREATED: 18 total (6 time intervals × 3 domains) - -SOURCE COLUMNS: - - NPast_5_pref_read through NPast_5_val_justice (15 columns) - - NPast_10_pref_read through NPast_10_val_justice (15 columns) - - NFut_5_pref_read through NFut_5_val_justice (15 columns) - - NFut_10_pref_read through NFut_10_val_justice (15 columns) - - 5.10past_pref_read through 5.10past_val_justice (15 columns) - - 5.10fut_pref_read through 5.10fut_val_justice (15 columns) - Total: 90 difference columns (created in Script 06) - -TARGET VARIABLES: - NPast_5 Domain Means (3 variables): - - NPast_5_pref_MEAN (mean of 5 preference items) - - NPast_5_pers_MEAN (mean of 5 personality items) - - NPast_5_val_MEAN (mean of 5 values items) - - NPast_10 Domain Means (3 variables): - - NPast_10_pref_MEAN - - NPast_10_pers_MEAN - - NPast_10_val_MEAN - - NFut_5 Domain Means (3 variables): - - NFut_5_pref_MEAN - - NFut_5_pers_MEAN - - NFut_5_val_MEAN - - NFut_10 Domain Means (3 variables): - - NFut_10_pref_MEAN - - NFut_10_pers_MEAN - - NFut_10_val_MEAN - - 5.10past Domain Means (3 variables): - - 5.10past_pref_MEAN - - 5.10past_pers_MEAN - - 5.10past_val_MEAN - - 5.10fut Domain Means (3 variables): - - 5.10fut_pref_MEAN - - 5.10fut_pers_MEAN - - 5.10fut_val_MEAN - -TRANSFORMATION LOGIC: - Each domain mean = average of 5 items within that domain - - Example for NPast_5_pref_MEAN: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel) - - Example for NFut_10_pers_MEAN: - = mean(NFut_10_pers_extravert, NFut_10_pers_critical, - NFut_10_pers_dependable, NFut_10_pers_anxious, - NFut_10_pers_complex) - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF DOMAIN MEANS: - - Provides higher-level summary of perceived change by domain - - Reduces item-level noise by aggregating across related items - - Enables domain-level comparisons across time intervals - - Parallel to Script 04 (DGEN means) but for difference scores instead of - raw DGEN ratings - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - Creates domain-level aggregates of absolute difference scores - - All means are averages of non-negative values (absolute differences) - - -================================================================================ -SCRIPT 08: dataP 08 - DGEN 510 vars.r -================================================================================ - -PURPOSE: - Calculates absolute differences between 5-year and 10-year DGEN ratings for - both Past and Future time directions. These variables measure the perceived - difference in domain-general change between the two time intervals. 
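Because these are plain arithmetic differences, missingness behaves exactly as stated below; a three-line illustration in R:

  five <- c(3, NA, -2)     # e.g. DGEN_fut_5_Pref for three respondents
  ten  <- c(1, 4, NA)      # e.g. DGEN_fut_10_Pref
  abs(five - ten)          # 2 NA NA: either side missing makes the result NA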
- -VARIABLES CREATED: 6 total (3 domains × 2 time directions) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - Total: 12 DGEN columns (created in Script 03) - -TARGET VARIABLES: - Past Direction (3 variables): - - X5_10DGEN_past_pref (|DGEN_past_5_Pref - DGEN_past_10_Pref|) - - X5_10DGEN_past_pers (|DGEN_past_5_Pers - DGEN_past_10_Pers|) - - X5_10DGEN_past_val (|DGEN_past_5_Val - DGEN_past_10_Val|) - - Future Direction (3 variables): - - X5_10DGEN_fut_pref (|DGEN_fut_5_Pref - DGEN_fut_10_Pref|) - - X5_10DGEN_fut_pers (|DGEN_fut_5_Pers - DGEN_fut_10_Pers|) - - X5_10DGEN_fut_val (|DGEN_fut_5_Val - DGEN_fut_10_Val|) - -TRANSFORMATION LOGIC: - Formula: |DGEN_5 - DGEN_10| - - All calculations use absolute differences: - - Past Preferences: |DGEN_past_5_Pref - DGEN_past_10_Pref| - - Past Personality: |DGEN_past_5_Pers - DGEN_past_10_Pers| - - Past Values: |DGEN_past_5_Val - DGEN_past_10_Val| - - Future Preferences: |DGEN_fut_5_Pref - DGEN_fut_10_Pref| - - Future Personality: |DGEN_fut_5_Pers - DGEN_fut_10_Pers| - - Future Values: |DGEN_fut_5_Val - DGEN_fut_10_Val| - - Result: Always positive values representing magnitude of difference - Missing values in either source column result in NA - -SPECIAL NOTES: - - Variable names use "X" prefix because R automatically adds it to column - names starting with numbers (5_10 becomes X5_10) - - This script depends on Script 03 being run first - - Measures interval effects within time direction (past vs future) - - Parallel to Script 06's 5.10past and 5.10fut variables but for DGEN scores - - -================================================================================ -SCRIPT 09: dataP 09 - interval x direction means.r -================================================================================ - -PURPOSE: - Calculates comprehensive mean scores by averaging item-level differences - across intervals and directions. Creates both narrow-scope means (single - time interval) and broad-scope global means (combining multiple intervals). 
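The 15-column sets these means average over can be generated rather than typed out; a sketch using the item stems from the reference guide at the end of this README (illustrative; `df` is assumed):

  stems <- c(paste0("pref_", c("read", "music", "TV", "nap", "travel")),
             paste0("pers_", c("extravert", "critical", "dependable",
                               "anxious", "complex")),
             paste0("val_",  c("obey", "trad", "opinion", "performance",
                               "justice")))
  np5_cols <- paste0("NPast_5_", stems)                 # the 15 NPast_5 items
  df$NPast_5_mean <- rowMeans(df[np5_cols], na.rm = TRUE)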
- -VARIABLES CREATED: 11 total (6 narrow-scope + 5 global-scope) - -SOURCE COLUMNS: - All 90 difference variables created in Script 06: - - NPast_5_[domain]_[item] (15 variables) - - NPast_10_[domain]_[item] (15 variables) - - NFut_5_[domain]_[item] (15 variables) - - NFut_10_[domain]_[item] (15 variables) - - X5.10past_[domain]_[item] (15 variables) - - X5.10fut_[domain]_[item] (15 variables) - -TARGET VARIABLES: - - Narrow-Scope Means (15 source items each): - - NPast_5_mean (mean across all 15 NPast_5 items) - - NPast_10_mean (mean across all 15 NPast_10 items) - - NFut_5_mean (mean across all 15 NFut_5 items) - - NFut_10_mean (mean across all 15 NFut_10 items) - - X5.10past_mean (mean across all 15 X5.10past items) - - X5.10fut_mean (mean across all 15 X5.10fut items) - - Global-Scope Means (30 source items each): - - NPast_global_mean (NPast_5 + NPast_10: all past intervals) - - NFut_global_mean (NFut_5 + NFut_10: all future intervals) - - X5.10_global_mean (X5.10past + X5.10fut: all 5-vs-10 intervals) - - N5_global_mean (NPast_5 + NFut_5: all 5-year intervals) - - N10_global_mean (NPast_10 + NFut_10: all 10-year intervals) - -TRANSFORMATION LOGIC: - - Narrow-Scope Means (15 items each): - Each mean averages all 15 difference items within one time interval - - Example for NPast_5_mean: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel, - NPast_5_pers_extravert, NPast_5_pers_critical, - NPast_5_pers_dependable, NPast_5_pers_anxious, - NPast_5_pers_complex, - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice) - - Global-Scope Means (30 items each): - Each mean averages 30 difference items across two related intervals - - Example for NPast_global_mean: - = mean(all 15 NPast_5 items + all 15 NPast_10 items) - Represents overall perceived change from present to any past timepoint - - Example for N5_global_mean: - = mean(all 15 NPast_5 items + all 15 NFut_5 items) - Represents overall perceived change at 5-year interval regardless of - direction - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF INTERVAL × DIRECTION MEANS: - - Narrow-scope means: Single-interval summaries across all domains and items - - Global-scope means: Cross-interval summaries for testing: - * Direction effects (past vs future) - * Interval effects (5-year vs 10-year) - * Combined temporal distance effects - - Enables comprehensive analysis of temporal self-perception patterns - - Reduces item-level and domain-level noise through broad aggregation - -QUALITY ASSURANCE: - - Script includes automated QA checks for first 5 rows - - Manually recalculates each mean and verifies against stored values - - Prints TRUE/FALSE match status for each variable - - Ensures calculation accuracy before further analysis - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - All means are averages of absolute difference scores (non-negative) - - Global means provide the broadest temporal self-perception summaries - - Naming convention uses "global" for 30-item means, no suffix for 15-item - - -================================================================================ -SUMMARY OF ALL CREATED VARIABLES -================================================================================ - -Total Variables Created: 219 - -By Script: - - Script 01: 60 variables (past/future recoded items) - - Script 02: 15 variables (present recoded items) - - Script 03: 12 variables (DGEN domain scores) - - 
Script 04: 4 variables (DGEN means) - - Script 05: 3 variables (AOT & CRT scales) - - Script 06: 90 variables (time interval differences) - - Script 07: 18 variables (domain means for differences) - - Script 08: 6 variables (DGEN 5-vs-10 differences) - - Script 09: 11 variables (interval × direction means) - -By Category: - - Time Period Items (75 total): - * Present: 15 items - * Past 5: 15 items - * Past 10: 15 items - * Future 5: 15 items - * Future 10: 15 items - - - DGEN Variables (22 total): - * Domain scores: 12 (3 domains × 4 time periods) - * Mean scores: 4 (1 per time period) - * 5-vs-10 differences: 6 (3 domains × 2 directions) - - - Cognitive Scales (3 total): - * AOT total - * CRT correct - * CRT intuitive - - - Time Differences (90 total): - * NPast_5: 15 differences - * NPast_10: 15 differences - * NFut_5: 15 differences - * NFut_10: 15 differences - * 5.10past: 15 differences - * 5.10fut: 15 differences - - - Domain Means for Differences (18 total): - * NPast_5: 3 domain means - * NPast_10: 3 domain means - * NFut_5: 3 domain means - * NFut_10: 3 domain means - * 5.10past: 3 domain means - * 5.10fut: 3 domain means - - - Interval × Direction Means (11 total): - * Narrow-scope means: 6 (NPast_5, NPast_10, NFut_5, NFut_10, - X5.10past, X5.10fut) - * Global-scope means: 5 (NPast_global, NFut_global, X5.10_global, - N5_global, N10_global) - - -================================================================================ -DATA PROCESSING NOTES -================================================================================ - -1. PROCESSING ORDER: - Scripts MUST be run in numerical order (01 → 09) as later scripts depend - on variables created by earlier scripts. - - Key Dependencies: - - Script 03 required before Script 04 (DGEN means need DGEN scores) - - Script 03 required before Script 08 (DGEN 5-vs-10 need DGEN scores) - - Script 06 required before Script 07 (domain means need differences) - - Script 06 required before Script 09 (interval × direction means need - differences) - -2. SURVEY VERSION HANDLING: - - Two survey versions (01 and 02) were used - - Scripts 01 and 03 combine these versions - - Preference given to version 01 when both exist - -3. MISSING DATA: - - Empty cells and NA values are preserved throughout processing - - Calculations use na.rm=TRUE to exclude missing values from means - - Difference calculations result in NA if either source value is missing - -4. QUALITY ASSURANCE: - - Each script includes QA checks with random row verification - - Manual calculation checks confirm proper transformations - - Column existence checks prevent errors from missing source data - - Script 09 includes comprehensive QA with first 5 rows for all 11 variables - -5. FILE SAVING: - - Most scripts save directly to eohi2.csv - - Scripts 04, 06, and 07 have commented-out write commands for review - - Scripts 08 and 09 save directly to eohi2.csv - - Each script overwrites existing target columns if present - -6. 
SPECIAL NAMING CONVENTIONS: - - "pref_tv" vs "pref_TV" inconsistency maintained from source data - - DGEN variables use underscores (DGEN_past_5_Pref) - - Difference variables use descriptive prefixes (NPast_5_, 5.10past_) - - "X" prefix added to variables starting with numbers (X5.10past_mean) - - Global means use "_global_" to distinguish from narrow-scope means - - -================================================================================ -ITEM REFERENCE GUIDE -================================================================================ - -15 Core Items (Used across all time periods): - -PREFERENCES (5 items): - 1. pref_read - Reading preferences - 2. pref_music - Music preferences - 3. pref_TV/tv - TV watching preferences (note case variation) - 4. pref_nap - Napping preferences - 5. pref_travel - Travel preferences - -PERSONALITY (5 items): - 6. pers_extravert - Extraverted personality - 7. pers_critical - Critical thinking personality - 8. pers_dependable - Dependable personality - 9. pers_anxious - Anxious personality - 10. pers_complex - Complex personality - -VALUES (5 items): - 11. val_obey - Value of obedience - 12. val_trad - Value of tradition - 13. val_opinion - Value of expressing opinions - 14. val_performance - Value of performance - 15. val_justice - Value of justice - - -================================================================================ -END OF DOCUMENTATION -================================================================================ -Last Updated: October 1, 2025 - diff --git a/.history/eohi2/README_Variable_Creation_20251008114419.txt b/.history/eohi2/README_Variable_Creation_20251008114419.txt deleted file mode 100644 index fba6b31..0000000 --- a/.history/eohi2/README_Variable_Creation_20251008114419.txt +++ /dev/null @@ -1,670 +0,0 @@ -================================================================================ -EOHI2 DATA PROCESSING PIPELINE - VARIABLE CREATION DOCUMENTATION -================================================================================ - -This README documents the complete data processing pipeline for eohi2.csv. -All processing scripts should be run in the order listed below. - -Source File: eohi2.csv -Processing Scripts: dataP 01 through dataP 09 - -================================================================================ -SCRIPT 01: dataP 01 - recode and combine past & future vars.r -================================================================================ - -PURPOSE: - Combines responses from two survey versions (01 and 02) and recodes Likert - scale text responses to numeric values for past and future time periods. 
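-
-   As a hedged sketch of the combine step described under TRANSFORMATION
-   LOGIC below (the data frame name `df` is assumed, and the raw headers are
-   assumed to have been read with check.names = FALSE so they keep their
-   leading "01"/"02"):
-
-      a <- df[["01past5PrefItem_1"]]          # Set A (version 01)
-      b <- df[["02past5PrefItem_1"]]          # Set B (version 02)
-      combined <- ifelse(!is.na(a) & a != "", a, b)   # prefer Set A, else Set B
-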
- -VARIABLES CREATED: 60 total (15 items × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefItem_1 through 01fut10ValItem_5 (60 columns) - - Set B: 02past5PrefItem_1 through 02fut10ValItem_5 (60 columns) - -TARGET VARIABLES: - Past 5 Years (15 variables): - - past_5_pref_read, past_5_pref_music, past_5_pref_TV, past_5_pref_nap, - past_5_pref_travel - - past_5_pers_extravert, past_5_pers_critical, past_5_pers_dependable, - past_5_pers_anxious, past_5_pers_complex - - past_5_val_obey, past_5_val_trad, past_5_val_opinion, - past_5_val_performance, past_5_val_justice - - Past 10 Years (15 variables): - - past_10_pref_read, past_10_pref_music, past_10_pref_TV, past_10_pref_nap, - past_10_pref_travel - - past_10_pers_extravert, past_10_pers_critical, past_10_pers_dependable, - past_10_pers_anxious, past_10_pers_complex - - past_10_val_obey, past_10_val_trad, past_10_val_opinion, - past_10_val_performance, past_10_val_justice - - Future 5 Years (15 variables): - - fut_5_pref_read, fut_5_pref_music, fut_5_pref_TV, fut_5_pref_nap, - fut_5_pref_travel - - fut_5_pers_extravert, fut_5_pers_critical, fut_5_pers_dependable, - fut_5_pers_anxious, fut_5_pers_complex - - fut_5_val_obey, fut_5_val_trad, fut_5_val_opinion, - fut_5_val_performance, fut_5_val_justice - - Future 10 Years (15 variables): - - fut_10_pref_read, fut_10_pref_music, fut_10_pref_TV, fut_10_pref_nap, - fut_10_pref_travel - - fut_10_pers_extravert, fut_10_pers_critical, fut_10_pers_dependable, - fut_10_pers_anxious, fut_10_pers_complex - - fut_10_val_obey, fut_10_val_trad, fut_10_val_opinion, - fut_10_val_performance, fut_10_val_justice - -TRANSFORMATION LOGIC: - Step 1: Combine responses from Set A (01) and Set B (02) - - If Set A has a value, use Set A - - If Set A is empty, use Set B - - Step 2: Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -ITEM DOMAINS: - - Preferences (pref): Reading, Music, TV, Nap, Travel - - Personality (pers): Extravert, Critical, Dependable, Anxious, Complex - - Values (val): Obey, Tradition, Opinion, Performance, Justice - - -================================================================================ -SCRIPT 02: dataP 02 - recode present VARS.r -================================================================================ - -PURPOSE: - Recodes present-time Likert scale text responses to numeric values. - -VARIABLES CREATED: 15 total - -SOURCE COLUMNS: - - prePrefItem_1 through prePrefItem_5 (5 columns) - - prePersItem_1 through prePersItem_5 (5 columns) - - preValItem_1 through preValItem_5 (5 columns) - -TARGET VARIABLES: - Present Time (15 variables): - - present_pref_read, present_pref_music, present_pref_tv, present_pref_nap, - present_pref_travel - - present_pers_extravert, present_pers_critical, present_pers_dependable, - present_pers_anxious, present_pers_complex - - present_val_obey, present_val_trad, present_val_opinion, - present_val_performance, present_val_justice - -TRANSFORMATION LOGIC: - Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -SPECIAL NOTE: - Present time uses "present_pref_tv" (lowercase) while past/future use - "past_5_pref_TV" (uppercase). 
This is intentional and preserved from the - original data structure. - - -================================================================================ -SCRIPT 03: dataP 03 - recode DGEN vars.r -================================================================================ - -PURPOSE: - Combines DGEN (domain general) responses from two survey versions (01 and 02). - These are single-item measures for each domain/time combination. - NO RECODING - just copies numeric values as-is. - -VARIABLES CREATED: 12 total (3 domains × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefDGEN_1, 01past5PersDGEN_1, 01past5ValDGEN_1, etc. - - Set B: 02past5PrefDGEN_1, 02past5PersDGEN_1, 02past5ValDGEN_1, etc. - -TARGET VARIABLES: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TRANSFORMATION LOGIC: - - If Set A (01) has a value, use Set A - - If Set A is empty, use Set B (02) - - NO RECODING: Values are copied directly as numeric - -SPECIAL NOTES: - - Future columns in raw data use "_8" suffix for Pref/Pers items - - Future Val columns use "ValuesDGEN" spelling in Set A, "ValDGEN" in Set B - - -================================================================================ -SCRIPT 04: dataP 04 - DGEN means.r -================================================================================ - -PURPOSE: - Calculates mean DGEN scores by averaging the three domain scores (Preferences, - Personality, Values) for each time period. - -VARIABLES CREATED: 4 total (1 per time period) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - - DGEN_past_5_mean - - DGEN_past_10_mean - - DGEN_fut_5_mean - - DGEN_fut_10_mean - -TRANSFORMATION LOGIC: - Each mean = (Pref + Pers + Val) / 3 - - NA values are excluded from calculation (na.rm = TRUE) - - -================================================================================ -SCRIPT 05: dataP 05 - recode scales VARS.r -================================================================================ - -PURPOSE: - Processes two cognitive scales: - 1. AOT (Actively Open-minded Thinking): 8-item scale with reverse coding - 2. CRT (Cognitive Reflection Test): 3-item test with correct/intuitive scoring - -VARIABLES CREATED: 3 total - -SOURCE COLUMNS: - AOT Scale: - - aot_1, aot_2, aot_3, aot_4, aot_5, aot_6, aot_7, aot_8 - - CRT Test: - - crt_1, crt_2, crt_3 - -TARGET VARIABLES: - - aot_total (mean of 8 items with reverse coding) - - crt_correct (proportion of correct answers) - - crt_int (proportion of intuitive/incorrect answers) - -TRANSFORMATION LOGIC: - - AOT Scale (aot_total): - 1. Items 4, 5, 6, 7 are reverse coded by multiplying by -1 - 2. Calculate mean of all 8 items (with reverse coding applied) - 3. Original source values are NOT modified in the dataframe - 4. 
NA values excluded from calculation (na.rm = TRUE) - - CRT Correct (crt_correct): - Correct answers: - - crt_1: "5 cents" - - crt_2: "5 minutes" - - crt_3: "47 days" - Calculation: (Number of correct answers) / (Number of non-missing answers) - - CRT Intuitive (crt_int): - Intuitive (common incorrect) answers: - - crt_1: "10 cents" - - crt_2: "100 minutes" - - crt_3: "24 days" - Calculation: (Number of intuitive answers) / (Number of non-missing answers) - -SPECIAL NOTES: - - CRT scoring is case-insensitive and trims whitespace - - Both CRT scores are proportions (0.00 to 1.00) - - Empty/missing CRT responses are excluded from denominator - - -================================================================================ -SCRIPT 06: dataP 06 - time interval differences.r -================================================================================ - -PURPOSE: - Calculates absolute differences between time intervals to measure perceived - change across time periods for all 15 items. - -VARIABLES CREATED: 90 total (6 difference types × 15 items) - -SOURCE COLUMNS: - - present_pref_read through present_val_justice (15 columns) - - past_5_pref_read through past_5_val_justice (15 columns) - - past_10_pref_read through past_10_val_justice (15 columns) - - fut_5_pref_read through fut_5_val_justice (15 columns) - - fut_10_pref_read through fut_10_val_justice (15 columns) - -TARGET VARIABLES (by difference type): - - NPast_5 (Present vs Past 5 years) - 15 variables: - Formula: |present - past_5| - - NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, NPast_5_pref_nap, - NPast_5_pref_travel - - NPast_5_pers_extravert, NPast_5_pers_critical, NPast_5_pers_dependable, - NPast_5_pers_anxious, NPast_5_pers_complex - - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice - - NPast_10 (Present vs Past 10 years) - 15 variables: - Formula: |present - past_10| - - NPast_10_pref_read, NPast_10_pref_music, NPast_10_pref_TV, - NPast_10_pref_nap, NPast_10_pref_travel - - NPast_10_pers_extravert, NPast_10_pers_critical, NPast_10_pers_dependable, - NPast_10_pers_anxious, NPast_10_pers_complex - - NPast_10_val_obey, NPast_10_val_trad, NPast_10_val_opinion, - NPast_10_val_performance, NPast_10_val_justice - - NFut_5 (Present vs Future 5 years) - 15 variables: - Formula: |present - fut_5| - - NFut_5_pref_read, NFut_5_pref_music, NFut_5_pref_TV, NFut_5_pref_nap, - NFut_5_pref_travel - - NFut_5_pers_extravert, NFut_5_pers_critical, NFut_5_pers_dependable, - NFut_5_pers_anxious, NFut_5_pers_complex - - NFut_5_val_obey, NFut_5_val_trad, NFut_5_val_opinion, - NFut_5_val_performance, NFut_5_val_justice - - NFut_10 (Present vs Future 10 years) - 15 variables: - Formula: |present - fut_10| - - NFut_10_pref_read, NFut_10_pref_music, NFut_10_pref_TV, NFut_10_pref_nap, - NFut_10_pref_travel - - NFut_10_pers_extravert, NFut_10_pers_critical, NFut_10_pers_dependable, - NFut_10_pers_anxious, NFut_10_pers_complex - - NFut_10_val_obey, NFut_10_val_trad, NFut_10_val_opinion, - NFut_10_val_performance, NFut_10_val_justice - - 5.10past (Past 5 vs Past 10 years) - 15 variables: - Formula: |past_5 - past_10| - - 5.10past_pref_read, 5.10past_pref_music, 5.10past_pref_TV, - 5.10past_pref_nap, 5.10past_pref_travel - - 5.10past_pers_extravert, 5.10past_pers_critical, 5.10past_pers_dependable, - 5.10past_pers_anxious, 5.10past_pers_complex - - 5.10past_val_obey, 5.10past_val_trad, 5.10past_val_opinion, - 5.10past_val_performance, 5.10past_val_justice - - 5.10fut (Future 5 vs Future 
10 years) - 15 variables: - Formula: |fut_5 - fut_10| - - 5.10fut_pref_read, 5.10fut_pref_music, 5.10fut_pref_TV, 5.10fut_pref_nap, - 5.10fut_pref_travel - - 5.10fut_pers_extravert, 5.10fut_pers_critical, 5.10fut_pers_dependable, - 5.10fut_pers_anxious, 5.10fut_pers_complex - - 5.10fut_val_obey, 5.10fut_val_trad, 5.10fut_val_opinion, - 5.10fut_val_performance, 5.10fut_val_justice - -TRANSFORMATION LOGIC: - All calculations use absolute differences: - - NPast_5: |present_[item] - past_5_[item]| - - NPast_10: |present_[item] - past_10_[item]| - - NFut_5: |present_[item] - fut_5_[item]| - - NFut_10: |present_[item] - fut_10_[item]| - - 5.10past: |past_5_[item] - past_10_[item]| - - 5.10fut: |fut_5_[item] - fut_10_[item]| - - Result: Always positive values representing magnitude of change - Missing values in either source column result in NA - -SPECIAL NOTES: - - Present time uses "pref_tv" (lowercase) while past/future use "pref_TV" - (uppercase), so script handles this naming inconsistency - - All values are absolute differences (non-negative) - - -================================================================================ -SCRIPT 07: dataP 07 - domain means.r -================================================================================ - -PURPOSE: - Calculates domain-level means by averaging the 5 items within each domain - (Preferences, Personality, Values) for each of the 6 time interval difference - types. - -VARIABLES CREATED: 18 total (6 time intervals × 3 domains) - -SOURCE COLUMNS: - - NPast_5_pref_read through NPast_5_val_justice (15 columns) - - NPast_10_pref_read through NPast_10_val_justice (15 columns) - - NFut_5_pref_read through NFut_5_val_justice (15 columns) - - NFut_10_pref_read through NFut_10_val_justice (15 columns) - - 5.10past_pref_read through 5.10past_val_justice (15 columns) - - 5.10fut_pref_read through 5.10fut_val_justice (15 columns) - Total: 90 difference columns (created in Script 06) - -TARGET VARIABLES: - NPast_5 Domain Means (3 variables): - - NPast_5_pref_MEAN (mean of 5 preference items) - - NPast_5_pers_MEAN (mean of 5 personality items) - - NPast_5_val_MEAN (mean of 5 values items) - - NPast_10 Domain Means (3 variables): - - NPast_10_pref_MEAN - - NPast_10_pers_MEAN - - NPast_10_val_MEAN - - NFut_5 Domain Means (3 variables): - - NFut_5_pref_MEAN - - NFut_5_pers_MEAN - - NFut_5_val_MEAN - - NFut_10 Domain Means (3 variables): - - NFut_10_pref_MEAN - - NFut_10_pers_MEAN - - NFut_10_val_MEAN - - 5.10past Domain Means (3 variables): - - 5.10past_pref_MEAN - - 5.10past_pers_MEAN - - 5.10past_val_MEAN - - 5.10fut Domain Means (3 variables): - - 5.10fut_pref_MEAN - - 5.10fut_pers_MEAN - - 5.10fut_val_MEAN - -TRANSFORMATION LOGIC: - Each domain mean = average of 5 items within that domain - - Example for NPast_5_pref_MEAN: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel) - - Example for NFut_10_pers_MEAN: - = mean(NFut_10_pers_extravert, NFut_10_pers_critical, - NFut_10_pers_dependable, NFut_10_pers_anxious, - NFut_10_pers_complex) - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF DOMAIN MEANS: - - Provides higher-level summary of perceived change by domain - - Reduces item-level noise by aggregating across related items - - Enables domain-level comparisons across time intervals - - Parallel to Script 04 (DGEN means) but for difference scores instead of - raw DGEN ratings - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - Creates domain-level aggregates 
of absolute difference scores - - All means are averages of non-negative values (absolute differences) - - -================================================================================ -SCRIPT 08: dataP 08 - DGEN 510 vars.r -================================================================================ - -PURPOSE: - Calculates absolute differences between 5-year and 10-year DGEN ratings for - both Past and Future time directions. These variables measure the perceived - difference in domain-general change between the two time intervals. - -VARIABLES CREATED: 6 total (3 domains × 2 time directions) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - Total: 12 DGEN columns (created in Script 03) - -TARGET VARIABLES: - Past Direction (3 variables): - - X5_10DGEN_past_pref (|DGEN_past_5_Pref - DGEN_past_10_Pref|) - - X5_10DGEN_past_pers (|DGEN_past_5_Pers - DGEN_past_10_Pers|) - - X5_10DGEN_past_val (|DGEN_past_5_Val - DGEN_past_10_Val|) - - Future Direction (3 variables): - - X5_10DGEN_fut_pref (|DGEN_fut_5_Pref - DGEN_fut_10_Pref|) - - X5_10DGEN_fut_pers (|DGEN_fut_5_Pers - DGEN_fut_10_Pers|) - - X5_10DGEN_fut_val (|DGEN_fut_5_Val - DGEN_fut_10_Val|) - -TRANSFORMATION LOGIC: - Formula: |DGEN_5 - DGEN_10| - - All calculations use absolute differences: - - Past Preferences: |DGEN_past_5_Pref - DGEN_past_10_Pref| - - Past Personality: |DGEN_past_5_Pers - DGEN_past_10_Pers| - - Past Values: |DGEN_past_5_Val - DGEN_past_10_Val| - - Future Preferences: |DGEN_fut_5_Pref - DGEN_fut_10_Pref| - - Future Personality: |DGEN_fut_5_Pers - DGEN_fut_10_Pers| - - Future Values: |DGEN_fut_5_Val - DGEN_fut_10_Val| - - Result: Always positive values representing magnitude of difference - Missing values in either source column result in NA - -SPECIAL NOTES: - - Variable names use "X" prefix because R automatically adds it to column - names starting with numbers (5_10 becomes X5_10) - - This script depends on Script 03 being run first - - Measures interval effects within time direction (past vs future) - - Parallel to Script 06's 5.10past and 5.10fut variables but for DGEN scores - - -================================================================================ -SCRIPT 09: dataP 09 - interval x direction means.r -================================================================================ - -PURPOSE: - Calculates comprehensive mean scores by averaging item-level differences - across intervals and directions. Creates both narrow-scope means (single - time interval) and broad-scope global means (combining multiple intervals). 
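-
-   A minimal sketch of the |5-year - 10-year| absolute-difference formula used
-   by Script 08 above (assuming the data frame is `df` and Script 03's DGEN
-   columns exist):
-
-      df$X5_10DGEN_past_pref <- abs(df$DGEN_past_5_Pref - df$DGEN_past_10_Pref)
-      df$X5_10DGEN_fut_pref  <- abs(df$DGEN_fut_5_Pref  - df$DGEN_fut_10_Pref)
-      # An NA in either source column propagates to NA, as noted above
-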
- -VARIABLES CREATED: 11 total (6 narrow-scope + 5 global-scope) - -SOURCE COLUMNS: - All 90 difference variables created in Script 06: - - NPast_5_[domain]_[item] (15 variables) - - NPast_10_[domain]_[item] (15 variables) - - NFut_5_[domain]_[item] (15 variables) - - NFut_10_[domain]_[item] (15 variables) - - X5.10past_[domain]_[item] (15 variables) - - X5.10fut_[domain]_[item] (15 variables) - -TARGET VARIABLES: - - Narrow-Scope Means (15 source items each): - - NPast_5_mean (mean across all 15 NPast_5 items) - - NPast_10_mean (mean across all 15 NPast_10 items) - - NFut_5_mean (mean across all 15 NFut_5 items) - - NFut_10_mean (mean across all 15 NFut_10 items) - - X5.10past_mean (mean across all 15 X5.10past items) - - X5.10fut_mean (mean across all 15 X5.10fut items) - - Global-Scope Means (30 source items each): - - NPast_global_mean (NPast_5 + NPast_10: all past intervals) - - NFut_global_mean (NFut_5 + NFut_10: all future intervals) - - X5.10_global_mean (X5.10past + X5.10fut: all 5-vs-10 intervals) - - N5_global_mean (NPast_5 + NFut_5: all 5-year intervals) - - N10_global_mean (NPast_10 + NFut_10: all 10-year intervals) - -TRANSFORMATION LOGIC: - - Narrow-Scope Means (15 items each): - Each mean averages all 15 difference items within one time interval - - Example for NPast_5_mean: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel, - NPast_5_pers_extravert, NPast_5_pers_critical, - NPast_5_pers_dependable, NPast_5_pers_anxious, - NPast_5_pers_complex, - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice) - - Global-Scope Means (30 items each): - Each mean averages 30 difference items across two related intervals - - Example for NPast_global_mean: - = mean(all 15 NPast_5 items + all 15 NPast_10 items) - Represents overall perceived change from present to any past timepoint - - Example for N5_global_mean: - = mean(all 15 NPast_5 items + all 15 NFut_5 items) - Represents overall perceived change at 5-year interval regardless of - direction - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF INTERVAL × DIRECTION MEANS: - - Narrow-scope means: Single-interval summaries across all domains and items - - Global-scope means: Cross-interval summaries for testing: - * Direction effects (past vs future) - * Interval effects (5-year vs 10-year) - * Combined temporal distance effects - - Enables comprehensive analysis of temporal self-perception patterns - - Reduces item-level and domain-level noise through broad aggregation - -QUALITY ASSURANCE: - - Script includes automated QA checks for first 5 rows - - Manually recalculates each mean and verifies against stored values - - Prints TRUE/FALSE match status for each variable - - Ensures calculation accuracy before further analysis - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - All means are averages of absolute difference scores (non-negative) - - Global means provide the broadest temporal self-perception summaries - - Naming convention uses "global" for 30-item means, no suffix for 15-item - - -================================================================================ -SUMMARY OF ALL CREATED VARIABLES -================================================================================ - -Total Variables Created: 219 - -By Script: - - Script 01: 60 variables (past/future recoded items) - - Script 02: 15 variables (present recoded items) - - Script 03: 12 variables (DGEN domain scores) - - 
Script 04: 4 variables (DGEN means) - - Script 05: 3 variables (AOT & CRT scales) - - Script 06: 90 variables (time interval differences) - - Script 07: 18 variables (domain means for differences) - - Script 08: 6 variables (DGEN 5-vs-10 differences) - - Script 09: 11 variables (interval × direction means) - -By Category: - - Time Period Items (75 total): - * Present: 15 items - * Past 5: 15 items - * Past 10: 15 items - * Future 5: 15 items - * Future 10: 15 items - - - DGEN Variables (22 total): - * Domain scores: 12 (3 domains × 4 time periods) - * Mean scores: 4 (1 per time period) - * 5-vs-10 differences: 6 (3 domains × 2 directions) - - - Cognitive Scales (3 total): - * AOT total - * CRT correct - * CRT intuitive - - - Time Differences (90 total): - * NPast_5: 15 differences - * NPast_10: 15 differences - * NFut_5: 15 differences - * NFut_10: 15 differences - * 5.10past: 15 differences - * 5.10fut: 15 differences - - - Domain Means for Differences (18 total): - * NPast_5: 3 domain means - * NPast_10: 3 domain means - * NFut_5: 3 domain means - * NFut_10: 3 domain means - * 5.10past: 3 domain means - * 5.10fut: 3 domain means - - - Interval × Direction Means (11 total): - * Narrow-scope means: 6 (NPast_5, NPast_10, NFut_5, NFut_10, - X5.10past, X5.10fut) - * Global-scope means: 5 (NPast_global, NFut_global, X5.10_global, - N5_global, N10_global) - - -================================================================================ -DATA PROCESSING NOTES -================================================================================ - -1. PROCESSING ORDER: - Scripts MUST be run in numerical order (01 → 09) as later scripts depend - on variables created by earlier scripts. - - Key Dependencies: - - Script 03 required before Script 04 (DGEN means need DGEN scores) - - Script 03 required before Script 08 (DGEN 5-vs-10 need DGEN scores) - - Script 06 required before Script 07 (domain means need differences) - - Script 06 required before Script 09 (interval × direction means need - differences) - -2. SURVEY VERSION HANDLING: - - Two survey versions (01 and 02) were used - - Scripts 01 and 03 combine these versions - - Preference given to version 01 when both exist - -3. MISSING DATA: - - Empty cells and NA values are preserved throughout processing - - Calculations use na.rm=TRUE to exclude missing values from means - - Difference calculations result in NA if either source value is missing - -4. QUALITY ASSURANCE: - - Each script includes QA checks with random row verification - - Manual calculation checks confirm proper transformations - - Column existence checks prevent errors from missing source data - - Script 09 includes comprehensive QA with first 5 rows for all 11 variables - -5. FILE SAVING: - - Most scripts save directly to eohi2.csv - - Scripts 04, 06, and 07 have commented-out write commands for review - - Scripts 08 and 09 save directly to eohi2.csv - - Each script overwrites existing target columns if present - -6. 
SPECIAL NAMING CONVENTIONS: - - "pref_tv" vs "pref_TV" inconsistency maintained from source data - - DGEN variables use underscores (DGEN_past_5_Pref) - - Difference variables use descriptive prefixes (NPast_5_, 5.10past_) - - "X" prefix added to variables starting with numbers (X5.10past_mean) - - Global means use "_global_" to distinguish from narrow-scope means - - -================================================================================ -ITEM REFERENCE GUIDE -================================================================================ - -15 Core Items (Used across all time periods): - -PREFERENCES (5 items): - 1. pref_read - Reading preferences - 2. pref_music - Music preferences - 3. pref_TV/tv - TV watching preferences (note case variation) - 4. pref_nap - Napping preferences - 5. pref_travel - Travel preferences - -PERSONALITY (5 items): - 6. pers_extravert - Extraverted personality - 7. pers_critical - Critical thinking personality - 8. pers_dependable - Dependable personality - 9. pers_anxious - Anxious personality - 10. pers_complex - Complex personality - -VALUES (5 items): - 11. val_obey - Value of obedience - 12. val_trad - Value of tradition - 13. val_opinion - Value of expressing opinions - 14. val_performance - Value of performance - 15. val_justice - Value of justice - - -================================================================================ -END OF DOCUMENTATION -================================================================================ -Last Updated: October 8, 2025 - diff --git a/.history/eohi2/README_Variable_Creation_20251008114428.txt b/.history/eohi2/README_Variable_Creation_20251008114428.txt deleted file mode 100644 index fc57604..0000000 --- a/.history/eohi2/README_Variable_Creation_20251008114428.txt +++ /dev/null @@ -1,670 +0,0 @@ -================================================================================ -EOHI2 DATA PROCESSING PIPELINE - VARIABLE CREATION DOCUMENTATION -================================================================================ - -This README documents the complete data processing pipeline for eohi2.csv. -All processing scripts should be run in the order listed below. - -Source File: eohi2.csv -Processing Scripts: dataP 01 through dataP 09 - -================================================================================ -SCRIPT 01: dataP 01 - recode and combine past & future vars.r -================================================================================ - -PURPOSE: - Combines responses from two survey versions (01 and 02) and recodes Likert - scale text responses to numeric values for past and future time periods. 
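-
-   A hedged sketch of the seven-point recode described under TRANSFORMATION
-   LOGIC below, using a named lookup vector (`likert_map`, `recode_likert`,
-   and `combined_response` are illustrative names, not from the scripts):
-
-      likert_map <- c("Strongly Disagree" = -3, "Disagree" = -2,
-                      "Somewhat Disagree" = -1,
-                      "Neither Agree nor Disagree" = 0,
-                      "Somewhat Agree" = 1, "Agree" = 2, "Strongly Agree" = 3)
-      recode_likert <- function(x) unname(likert_map[trimws(x)])
-      # Unmatched text and empty cells index to NA, matching Empty/Missing -> NA
-      df$past_5_pref_read <- recode_likert(combined_response)
-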
- -VARIABLES CREATED: 60 total (15 items × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefItem_1 through 01fut10ValItem_5 (60 columns) - - Set B: 02past5PrefItem_1 through 02fut10ValItem_5 (60 columns) - -TARGET VARIABLES: - Past 5 Years (15 variables): - - past_5_pref_read, past_5_pref_music, past_5_pref_TV, past_5_pref_nap, - past_5_pref_travel - - past_5_pers_extravert, past_5_pers_critical, past_5_pers_dependable, - past_5_pers_anxious, past_5_pers_complex - - past_5_val_obey, past_5_val_trad, past_5_val_opinion, - past_5_val_performance, past_5_val_justice - - Past 10 Years (15 variables): - - past_10_pref_read, past_10_pref_music, past_10_pref_TV, past_10_pref_nap, - past_10_pref_travel - - past_10_pers_extravert, past_10_pers_critical, past_10_pers_dependable, - past_10_pers_anxious, past_10_pers_complex - - past_10_val_obey, past_10_val_trad, past_10_val_opinion, - past_10_val_performance, past_10_val_justice - - Future 5 Years (15 variables): - - fut_5_pref_read, fut_5_pref_music, fut_5_pref_TV, fut_5_pref_nap, - fut_5_pref_travel - - fut_5_pers_extravert, fut_5_pers_critical, fut_5_pers_dependable, - fut_5_pers_anxious, fut_5_pers_complex - - fut_5_val_obey, fut_5_val_trad, fut_5_val_opinion, - fut_5_val_performance, fut_5_val_justice - - Future 10 Years (15 variables): - - fut_10_pref_read, fut_10_pref_music, fut_10_pref_TV, fut_10_pref_nap, - fut_10_pref_travel - - fut_10_pers_extravert, fut_10_pers_critical, fut_10_pers_dependable, - fut_10_pers_anxious, fut_10_pers_complex - - fut_10_val_obey, fut_10_val_trad, fut_10_val_opinion, - fut_10_val_performance, fut_10_val_justice - -TRANSFORMATION LOGIC: - Step 1: Combine responses from Set A (01) and Set B (02) - - If Set A has a value, use Set A - - If Set A is empty, use Set B - - Step 2: Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -ITEM DOMAINS: - - Preferences (pref): Reading, Music, TV, Nap, Travel - - Personality (pers): Extravert, Critical, Dependable, Anxious, Complex - - Values (val): Obey, Tradition, Opinion, Performance, Justice - - -================================================================================ -SCRIPT 02: dataP 02 - recode present VARS.r -================================================================================ - -PURPOSE: - Recodes present-time Likert scale text responses to numeric values. - -VARIABLES CREATED: 15 total - -SOURCE COLUMNS: - - prePrefItem_1 through prePrefItem_5 (5 columns) - - prePersItem_1 through prePersItem_5 (5 columns) - - preValItem_1 through preValItem_5 (5 columns) - -TARGET VARIABLES: - Present Time (15 variables): - - present_pref_read, present_pref_music, present_pref_tv, present_pref_nap, - present_pref_travel - - present_pers_extravert, present_pers_critical, present_pers_dependable, - present_pers_anxious, present_pers_complex - - present_val_obey, present_val_trad, present_val_opinion, - present_val_performance, present_val_justice - -TRANSFORMATION LOGIC: - Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -SPECIAL NOTE: - Present time uses "present_pref_tv" (lowercase) while past/future use - "past_5_pref_TV" (uppercase). 
This is intentional and preserved from the - original data structure. - - -================================================================================ -SCRIPT 03: dataP 03 - recode DGEN vars.r -================================================================================ - -PURPOSE: - Combines DGEN (domain general) responses from two survey versions (01 and 02). - These are single-item measures for each domain/time combination. - NO RECODING - just copies numeric values as-is. - -VARIABLES CREATED: 12 total (3 domains × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefDGEN_1, 01past5PersDGEN_1, 01past5ValDGEN_1, etc. - - Set B: 02past5PrefDGEN_1, 02past5PersDGEN_1, 02past5ValDGEN_1, etc. - -TARGET VARIABLES: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TRANSFORMATION LOGIC: - - If Set A (01) has a value, use Set A - - If Set A is empty, use Set B (02) - - NO RECODING: Values are copied directly as numeric - -SPECIAL NOTES: - - Future columns in raw data use "_8" suffix for Pref/Pers items - - Future Val columns use "ValuesDGEN" spelling in Set A, "ValDGEN" in Set B - - -================================================================================ -SCRIPT 04: dataP 04 - DGEN means.r -================================================================================ - -PURPOSE: - Calculates mean DGEN scores by averaging the three domain scores (Preferences, - Personality, Values) for each time period. - -VARIABLES CREATED: 4 total (1 per time period) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - - DGEN_past_5_mean - - DGEN_past_10_mean - - DGEN_fut_5_mean - - DGEN_fut_10_mean - -TRANSFORMATION LOGIC: - Each mean = (Pref + Pers + Val) / 3 - - NA values are excluded from calculation (na.rm = TRUE) - - -================================================================================ -SCRIPT 05: dataP 05 - recode scales VARS.r -================================================================================ - -PURPOSE: - Processes two cognitive scales: - 1. AOT (Actively Open-minded Thinking): 8-item scale with reverse coding - 2. CRT (Cognitive Reflection Test): 3-item test with correct/intuitive scoring - -VARIABLES CREATED: 3 total - -SOURCE COLUMNS: - AOT Scale: - - aot_1, aot_2, aot_3, aot_4, aot_5, aot_6, aot_7, aot_8 - - CRT Test: - - crt_1, crt_2, crt_3 - -TARGET VARIABLES: - - aot_total (mean of 8 items with reverse coding) - - crt_correct (proportion of correct answers) - - crt_int (proportion of intuitive/incorrect answers) - -TRANSFORMATION LOGIC: - - AOT Scale (aot_total): - 1. Items 4, 5, 6, 7 are reverse coded by multiplying by -1 - 2. Calculate mean of all 8 items (with reverse coding applied) - 3. Original source values are NOT modified in the dataframe - 4. 
NA values excluded from calculation (na.rm = TRUE) - - CRT Correct (crt_correct): - Correct answers: - - crt_1: "5 cents" - - crt_2: "5 minutes" - - crt_3: "47 days" - Calculation: (Number of correct answers) / (Number of non-missing answers) - - CRT Intuitive (crt_int): - Intuitive (common incorrect) answers: - - crt_1: "10 cents" - - crt_2: "100 minutes" - - crt_3: "24 days" - Calculation: (Number of intuitive answers) / (Number of non-missing answers) - -SPECIAL NOTES: - - CRT scoring is case-insensitive and trims whitespace - - Both CRT scores are proportions (0.00 to 1.00) - - Empty/missing CRT responses are excluded from denominator - - -================================================================================ -SCRIPT 06: dataP 06 - time interval differences.r -================================================================================ - -PURPOSE: - Calculates absolute differences between time intervals to measure perceived - change across time periods for all 15 items. - -VARIABLES CREATED: 90 total (6 difference types × 15 items) - -SOURCE COLUMNS: - - present_pref_read through present_val_justice (15 columns) - - past_5_pref_read through past_5_val_justice (15 columns) - - past_10_pref_read through past_10_val_justice (15 columns) - - fut_5_pref_read through fut_5_val_justice (15 columns) - - fut_10_pref_read through fut_10_val_justice (15 columns) - -TARGET VARIABLES (by difference type): - - NPast_5 (Present vs Past 5 years) - 15 variables: - Formula: |present - past_5| - - NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, NPast_5_pref_nap, - NPast_5_pref_travel - - NPast_5_pers_extravert, NPast_5_pers_critical, NPast_5_pers_dependable, - NPast_5_pers_anxious, NPast_5_pers_complex - - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice - - NPast_10 (Present vs Past 10 years) - 15 variables: - Formula: |present - past_10| - - NPast_10_pref_read, NPast_10_pref_music, NPast_10_pref_TV, - NPast_10_pref_nap, NPast_10_pref_travel - - NPast_10_pers_extravert, NPast_10_pers_critical, NPast_10_pers_dependable, - NPast_10_pers_anxious, NPast_10_pers_complex - - NPast_10_val_obey, NPast_10_val_trad, NPast_10_val_opinion, - NPast_10_val_performance, NPast_10_val_justice - - NFut_5 (Present vs Future 5 years) - 15 variables: - Formula: |present - fut_5| - - NFut_5_pref_read, NFut_5_pref_music, NFut_5_pref_TV, NFut_5_pref_nap, - NFut_5_pref_travel - - NFut_5_pers_extravert, NFut_5_pers_critical, NFut_5_pers_dependable, - NFut_5_pers_anxious, NFut_5_pers_complex - - NFut_5_val_obey, NFut_5_val_trad, NFut_5_val_opinion, - NFut_5_val_performance, NFut_5_val_justice - - NFut_10 (Present vs Future 10 years) - 15 variables: - Formula: |present - fut_10| - - NFut_10_pref_read, NFut_10_pref_music, NFut_10_pref_TV, NFut_10_pref_nap, - NFut_10_pref_travel - - NFut_10_pers_extravert, NFut_10_pers_critical, NFut_10_pers_dependable, - NFut_10_pers_anxious, NFut_10_pers_complex - - NFut_10_val_obey, NFut_10_val_trad, NFut_10_val_opinion, - NFut_10_val_performance, NFut_10_val_justice - - 5.10past (Past 5 vs Past 10 years) - 15 variables: - Formula: |past_5 - past_10| - - 5.10past_pref_read, 5.10past_pref_music, 5.10past_pref_TV, - 5.10past_pref_nap, 5.10past_pref_travel - - 5.10past_pers_extravert, 5.10past_pers_critical, 5.10past_pers_dependable, - 5.10past_pers_anxious, 5.10past_pers_complex - - 5.10past_val_obey, 5.10past_val_trad, 5.10past_val_opinion, - 5.10past_val_performance, 5.10past_val_justice - - 5.10fut (Future 5 vs Future 
10 years) - 15 variables: - Formula: |fut_5 - fut_10| - - 5.10fut_pref_read, 5.10fut_pref_music, 5.10fut_pref_TV, 5.10fut_pref_nap, - 5.10fut_pref_travel - - 5.10fut_pers_extravert, 5.10fut_pers_critical, 5.10fut_pers_dependable, - 5.10fut_pers_anxious, 5.10fut_pers_complex - - 5.10fut_val_obey, 5.10fut_val_trad, 5.10fut_val_opinion, - 5.10fut_val_performance, 5.10fut_val_justice - -TRANSFORMATION LOGIC: - All calculations use absolute differences: - - NPast_5: |present_[item] - past_5_[item]| - - NPast_10: |present_[item] - past_10_[item]| - - NFut_5: |present_[item] - fut_5_[item]| - - NFut_10: |present_[item] - fut_10_[item]| - - 5.10past: |past_5_[item] - past_10_[item]| - - 5.10fut: |fut_5_[item] - fut_10_[item]| - - Result: Always positive values representing magnitude of change - Missing values in either source column result in NA - -SPECIAL NOTES: - - Present time uses "pref_tv" (lowercase) while past/future use "pref_TV" - (uppercase), so script handles this naming inconsistency - - All values are absolute differences (non-negative) - - -================================================================================ -SCRIPT 07: dataP 07 - domain means.r -================================================================================ - -PURPOSE: - Calculates domain-level means by averaging the 5 items within each domain - (Preferences, Personality, Values) for each of the 6 time interval difference - types. - -VARIABLES CREATED: 18 total (6 time intervals × 3 domains) - -SOURCE COLUMNS: - - NPast_5_pref_read through NPast_5_val_justice (15 columns) - - NPast_10_pref_read through NPast_10_val_justice (15 columns) - - NFut_5_pref_read through NFut_5_val_justice (15 columns) - - NFut_10_pref_read through NFut_10_val_justice (15 columns) - - 5.10past_pref_read through 5.10past_val_justice (15 columns) - - 5.10fut_pref_read through 5.10fut_val_justice (15 columns) - Total: 90 difference columns (created in Script 06) - -TARGET VARIABLES: - NPast_5 Domain Means (3 variables): - - NPast_5_pref_MEAN (mean of 5 preference items) - - NPast_5_pers_MEAN (mean of 5 personality items) - - NPast_5_val_MEAN (mean of 5 values items) - - NPast_10 Domain Means (3 variables): - - NPast_10_pref_MEAN - - NPast_10_pers_MEAN - - NPast_10_val_MEAN - - NFut_5 Domain Means (3 variables): - - NFut_5_pref_MEAN - - NFut_5_pers_MEAN - - NFut_5_val_MEAN - - NFut_10 Domain Means (3 variables): - - NFut_10_pref_MEAN - - NFut_10_pers_MEAN - - NFut_10_val_MEAN - - 5.10past Domain Means (3 variables): - - 5.10past_pref_MEAN - - 5.10past_pers_MEAN - - 5.10past_val_MEAN - - 5.10fut Domain Means (3 variables): - - 5.10fut_pref_MEAN - - 5.10fut_pers_MEAN - - 5.10fut_val_MEAN - -TRANSFORMATION LOGIC: - Each domain mean = average of 5 items within that domain - - Example for NPast_5_pref_MEAN: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel) - - Example for NFut_10_pers_MEAN: - = mean(NFut_10_pers_extravert, NFut_10_pers_critical, - NFut_10_pers_dependable, NFut_10_pers_anxious, - NFut_10_pers_complex) - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF DOMAIN MEANS: - - Provides higher-level summary of perceived change by domain - - Reduces item-level noise by aggregating across related items - - Enables domain-level comparisons across time intervals - - Parallel to Script 04 (DGEN means) but for difference scores instead of - raw DGEN ratings - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - Creates domain-level aggregates 
of absolute difference scores - - All means are averages of non-negative values (absolute differences) - - -================================================================================ -SCRIPT 08: dataP 08 - DGEN 510 vars.r -================================================================================ - -PURPOSE: - Calculates absolute differences between 5-year and 10-year DGEN ratings for - both Past and Future time directions. These variables measure the perceived - difference in domain-general change between the two time intervals. - -VARIABLES CREATED: 6 total (3 domains × 2 time directions) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - Total: 12 DGEN columns (created in Script 03) - -TARGET VARIABLES: - Past Direction (3 variables): - - X5_10DGEN_past_pref (|DGEN_past_5_Pref - DGEN_past_10_Pref|) - - X5_10DGEN_past_pers (|DGEN_past_5_Pers - DGEN_past_10_Pers|) - - X5_10DGEN_past_val (|DGEN_past_5_Val - DGEN_past_10_Val|) - - Future Direction (3 variables): - - X5_10DGEN_fut_pref (|DGEN_fut_5_Pref - DGEN_fut_10_Pref|) - - X5_10DGEN_fut_pers (|DGEN_fut_5_Pers - DGEN_fut_10_Pers|) - - X5_10DGEN_fut_val (|DGEN_fut_5_Val - DGEN_fut_10_Val|) - -TRANSFORMATION LOGIC: - Formula: |DGEN_5 - DGEN_10| - - All calculations use absolute differences: - - Past Preferences: |DGEN_past_5_Pref - DGEN_past_10_Pref| - - Past Personality: |DGEN_past_5_Pers - DGEN_past_10_Pers| - - Past Values: |DGEN_past_5_Val - DGEN_past_10_Val| - - Future Preferences: |DGEN_fut_5_Pref - DGEN_fut_10_Pref| - - Future Personality: |DGEN_fut_5_Pers - DGEN_fut_10_Pers| - - Future Values: |DGEN_fut_5_Val - DGEN_fut_10_Val| - - Result: Always positive values representing magnitude of difference - Missing values in either source column result in NA - -SPECIAL NOTES: - - Variable names use "X" prefix because R automatically adds it to column - names starting with numbers (5_10 becomes X5_10) - - This script depends on Script 03 being run first - - Measures interval effects within time direction (past vs future) - - Parallel to Script 06's 5.10past and 5.10fut variables but for DGEN scores - - -================================================================================ -SCRIPT 09: dataP 09 - interval x direction means.r -================================================================================ - -PURPOSE: - Calculates comprehensive mean scores by averaging item-level differences - across intervals and directions. Creates both narrow-scope means (single - time interval) and broad-scope global means (combining multiple intervals). 
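-
-   A minimal sketch of the QA pattern described later in this section:
-   recompute one mean for the first 5 rows and compare it to the stored
-   column (data frame name `df` assumed):
-
-      items  <- grep("^NFut_5_(pref|pers|val)_", names(df), value = TRUE)
-      items  <- items[!grepl("MEAN$", items)]      # drop Script 07 domain means
-      manual <- unname(rowMeans(df[1:5, items], na.rm = TRUE))
-      print(isTRUE(all.equal(manual, df$NFut_5_mean[1:5])))   # TRUE if they match
-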
- -VARIABLES CREATED: 11 total (6 narrow-scope + 5 global-scope) - -SOURCE COLUMNS: - All 90 difference variables created in Script 06: - - NPast_5_[domain]_[item] (15 variables) - - NPast_10_[domain]_[item] (15 variables) - - NFut_5_[domain]_[item] (15 variables) - - NFut_10_[domain]_[item] (15 variables) - - X5.10past_[domain]_[item] (15 variables) - - X5.10fut_[domain]_[item] (15 variables) - -TARGET VARIABLES: - - Narrow-Scope Means (15 source items each): - - NPast_5_mean (mean across all 15 NPast_5 items) - - NPast_10_mean (mean across all 15 NPast_10 items) - - NFut_5_mean (mean across all 15 NFut_5 items) - - NFut_10_mean (mean across all 15 NFut_10 items) - - X5.10past_mean (mean across all 15 X5.10past items) - - X5.10fut_mean (mean across all 15 X5.10fut items) - - Global-Scope Means (30 source items each): - - NPast_global_mean (NPast_5 + NPast_10: all past intervals) - - NFut_global_mean (NFut_5 + NFut_10: all future intervals) - - X5.10_global_mean (X5.10past + X5.10fut: all 5-vs-10 intervals) - - N5_global_mean (NPast_5 + NFut_5: all 5-year intervals) - - N10_global_mean (NPast_10 + NFut_10: all 10-year intervals) - -TRANSFORMATION LOGIC: - - Narrow-Scope Means (15 items each): - Each mean averages all 15 difference items within one time interval - - Example for NPast_5_mean: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel, - NPast_5_pers_extravert, NPast_5_pers_critical, - NPast_5_pers_dependable, NPast_5_pers_anxious, - NPast_5_pers_complex, - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice) - - Global-Scope Means (30 items each): - Each mean averages 30 difference items across two related intervals - - Example for NPast_global_mean: - = mean(all 15 NPast_5 items + all 15 NPast_10 items) - Represents overall perceived change from present to any past timepoint - - Example for N5_global_mean: - = mean(all 15 NPast_5 items + all 15 NFut_5 items) - Represents overall perceived change at 5-year interval regardless of - direction - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF INTERVAL × DIRECTION MEANS: - - Narrow-scope means: Single-interval summaries across all domains and items - - Global-scope means: Cross-interval summaries for testing: - * Direction effects (past vs future) - * Interval effects (5-year vs 10-year) - * Combined temporal distance effects - - Enables comprehensive analysis of temporal self-perception patterns - - Reduces item-level and domain-level noise through broad aggregation - -QUALITY ASSURANCE: - - Script includes automated QA checks for first 5 rows - - Manually recalculates each mean and verifies against stored values - - Prints TRUE/FALSE match status for each variable - - Ensures calculation accuracy before further analysis - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - All means are averages of absolute difference scores (non-negative) - - Global means provide the broadest temporal self-perception summaries - - Naming convention uses "global" for 30-item means, no suffix for 15-item - - -================================================================================ -SUMMARY OF ALL CREATED VARIABLES -================================================================================ - -Total Variables Created: 219 - -By Script: - - Script 01: 60 variables (past/future recoded items) - - Script 02: 15 variables (present recoded items) - - Script 03: 12 variables (DGEN domain scores) - - 
Script 04: 4 variables (DGEN means) - - Script 05: 3 variables (AOT & CRT scales) - - Script 06: 90 variables (time interval differences) - - Script 07: 18 variables (domain means for differences) - - Script 08: 6 variables (DGEN 5-vs-10 differences) - - Script 09: 11 variables (interval × direction means) - -By Category: - - Time Period Items (75 total): - * Present: 15 items - * Past 5: 15 items - * Past 10: 15 items - * Future 5: 15 items - * Future 10: 15 items - - - DGEN Variables (22 total): - * Domain scores: 12 (3 domains × 4 time periods) - * Mean scores: 4 (1 per time period) - * 5-vs-10 differences: 6 (3 domains × 2 directions) - - - Cognitive Scales (3 total): - * AOT total - * CRT correct - * CRT intuitive - - - Time Differences (90 total): - * NPast_5: 15 differences - * NPast_10: 15 differences - * NFut_5: 15 differences - * NFut_10: 15 differences - * 5.10past: 15 differences - * 5.10fut: 15 differences - - - Domain Means for Differences (18 total): - * NPast_5: 3 domain means - * NPast_10: 3 domain means - * NFut_5: 3 domain means - * NFut_10: 3 domain means - * 5.10past: 3 domain means - * 5.10fut: 3 domain means - - - Interval × Direction Means (11 total): - * Narrow-scope means: 6 (NPast_5, NPast_10, NFut_5, NFut_10, - X5.10past, X5.10fut) - * Global-scope means: 5 (NPast_global, NFut_global, X5.10_global, - N5_global, N10_global) - - -================================================================================ -DATA PROCESSING NOTES -================================================================================ - -1. PROCESSING ORDER: - Scripts MUST be run in numerical order (01 → 09) as later scripts depend - on variables created by earlier scripts. - - Key Dependencies: - - Script 03 required before Script 04 (DGEN means need DGEN scores) - - Script 03 required before Script 08 (DGEN 5-vs-10 need DGEN scores) - - Script 06 required before Script 07 (domain means need differences) - - Script 06 required before Script 09 (interval × direction means need - differences) - -2. SURVEY VERSION HANDLING: - - Two survey versions (01 and 02) were used - - Scripts 01 and 03 combine these versions - - Preference given to version 01 when both exist - -3. MISSING DATA: - - Empty cells and NA values are preserved throughout processing - - Calculations use na.rm=TRUE to exclude missing values from means - - Difference calculations result in NA if either source value is missing - -4. QUALITY ASSURANCE: - - Each script includes QA checks with random row verification - - Manual calculation checks confirm proper transformations - - Column existence checks prevent errors from missing source data - - Script 09 includes comprehensive QA with first 5 rows for all 11 variables - -5. FILE SAVING: - - Most scripts save directly to eohi2.csv - - Scripts 04, 06, and 07 have commented-out write commands for review - - Scripts 08 and 09 save directly to eohi2.csv - - Each script overwrites existing target columns if present - -6. 
SPECIAL NAMING CONVENTIONS: - - "pref_tv" vs "pref_TV" inconsistency maintained from source data - - DGEN variables use underscores (DGEN_past_5_Pref) - - Difference variables use descriptive prefixes (NPast_5_, 5.10past_) - - "X" prefix added to variables starting with numbers (X5.10past_mean) - - Global means use "_global_" to distinguish from narrow-scope means - - -================================================================================ -ITEM REFERENCE GUIDE -================================================================================ - -15 Core Items (Used across all time periods): - -PREFERENCES (5 items): - 1. pref_read - Reading preferences - 2. pref_music - Music preferences - 3. pref_TV/tv - TV watching preferences (note case variation) - 4. pref_nap - Napping preferences - 5. pref_travel - Travel preferences - -PERSONALITY (5 items): - 6. pers_extravert - Extraverted personality - 7. pers_critical - Critical thinking personality - 8. pers_dependable - Dependable personality - 9. pers_anxious - Anxious personality - 10. pers_complex - Complex personality - -VALUES (5 items): - 11. val_obey - Value of obedience - 12. val_trad - Value of tradition - 13. val_opinion - Value of expressing opinions - 14. val_performance - Value of performance - 15. val_justice - Value of justice - - -================================================================================ -END OF DOCUMENTATION -================================================================================ -Last Updated: October 8, 2025 - diff --git a/.history/eohi2/README_Variable_Creation_20251008114444.txt b/.history/eohi2/README_Variable_Creation_20251008114444.txt deleted file mode 100644 index fc57604..0000000 --- a/.history/eohi2/README_Variable_Creation_20251008114444.txt +++ /dev/null @@ -1,670 +0,0 @@ -================================================================================ -EOHI2 DATA PROCESSING PIPELINE - VARIABLE CREATION DOCUMENTATION -================================================================================ - -This README documents the complete data processing pipeline for eohi2.csv. -All processing scripts should be run in the order listed below. - -Source File: eohi2.csv -Processing Scripts: dataP 01 through dataP 09 - -================================================================================ -SCRIPT 01: dataP 01 - recode and combine past & future vars.r -================================================================================ - -PURPOSE: - Combines responses from two survey versions (01 and 02) and recodes Likert - scale text responses to numeric values for past and future time periods. 
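-
-   Since the scripts must run in numerical order, one hedged way to drive the
-   whole pipeline (assuming the working directory contains the nine dataP
-   scripts named as in the headers below):
-
-      scripts <- sort(list.files(pattern = "^dataP 0[1-9] - .*\\.r$"))
-      for (s in scripts) source(s)    # 01 -> 09 keeps dependencies satisfied
-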
diff --git a/.history/eohi2/README_Variable_Creation_20251008114531.txt b/.history/eohi2/README_Variable_Creation_20251008114531.txt
deleted file mode 100644
index fc57604..0000000
--- a/.history/eohi2/README_Variable_Creation_20251008114531.txt
+++ /dev/null
@@ -1,670 +0,0 @@
-================================================================================
-EOHI2 DATA PROCESSING PIPELINE - VARIABLE CREATION DOCUMENTATION
-================================================================================
-
-This README documents the complete data processing pipeline for eohi2.csv.
-All processing scripts should be run in the order listed below.
-
-Source File: eohi2.csv
-Processing Scripts: dataP 01 through dataP 09
-
-================================================================================
-SCRIPT 01: dataP 01 - recode and combine past & future vars.r
-================================================================================
-
-PURPOSE:
-  Combines responses from two survey versions (01 and 02) and recodes Likert
-  scale text responses to numeric values for past and future time periods.
-
-VARIABLES CREATED: 60 total (15 items × 4 time periods)
-
-SOURCE COLUMNS:
-  - Set A: 01past5PrefItem_1 through 01fut10ValItem_5 (60 columns)
-  - Set B: 02past5PrefItem_1 through 02fut10ValItem_5 (60 columns)
-
-TARGET VARIABLES:
-  Past 5 Years (15 variables):
-  - past_5_pref_read, past_5_pref_music, past_5_pref_TV, past_5_pref_nap,
-    past_5_pref_travel
-  - past_5_pers_extravert, past_5_pers_critical, past_5_pers_dependable,
-    past_5_pers_anxious, past_5_pers_complex
-  - past_5_val_obey, past_5_val_trad, past_5_val_opinion,
-    past_5_val_performance, past_5_val_justice
-
-  Past 10 Years (15 variables):
-  - past_10_pref_read, past_10_pref_music, past_10_pref_TV, past_10_pref_nap,
-    past_10_pref_travel
-  - past_10_pers_extravert, past_10_pers_critical, past_10_pers_dependable,
-    past_10_pers_anxious, past_10_pers_complex
-  - past_10_val_obey, past_10_val_trad, past_10_val_opinion,
-    past_10_val_performance, past_10_val_justice
-
-  Future 5 Years (15 variables):
-  - fut_5_pref_read, fut_5_pref_music, fut_5_pref_TV, fut_5_pref_nap,
-    fut_5_pref_travel
-  - fut_5_pers_extravert, fut_5_pers_critical, fut_5_pers_dependable,
-    fut_5_pers_anxious, fut_5_pers_complex
-  - fut_5_val_obey, fut_5_val_trad, fut_5_val_opinion,
-    fut_5_val_performance, fut_5_val_justice
-
-  Future 10 Years (15 variables):
-  - fut_10_pref_read, fut_10_pref_music, fut_10_pref_TV, fut_10_pref_nap,
-    fut_10_pref_travel
-  - fut_10_pers_extravert, fut_10_pers_critical, fut_10_pers_dependable,
-    fut_10_pers_anxious, fut_10_pers_complex
-  - fut_10_val_obey, fut_10_val_trad, fut_10_val_opinion,
-    fut_10_val_performance, fut_10_val_justice
-
-TRANSFORMATION LOGIC:
-  Step 1: Combine responses from Set A (01) and Set B (02)
-  - If Set A has a value, use Set A
-  - If Set A is empty, use Set B
-
-  Step 2: Recode text responses to numeric values:
-    "Strongly Disagree"          → -3
-    "Disagree"                   → -2
-    "Somewhat Disagree"          → -1
-    "Neither Agree nor Disagree" → 0
-    "Somewhat Agree"             → 1
-    "Agree"                      → 2
-    "Strongly Agree"             → 3
-    Empty/Missing                → NA
-
-ITEM DOMAINS:
-  - Preferences (pref): Reading, Music, TV, Nap, Travel
-  - Personality (pers): Extravert, Critical, Dependable, Anxious, Complex
-  - Values (val): Obey, Tradition, Opinion, Performance, Justice
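-
-  The two steps amount to a coalesce followed by a table lookup. Illustrative
-  sketch of the pattern for one item (not the script's verbatim code; assumes
-  the raw data frame is named dat):
-
-    likert_map <- c("Strongly Disagree" = -3, "Disagree" = -2,
-                    "Somewhat Disagree" = -1,
-                    "Neither Agree nor Disagree" = 0,
-                    "Somewhat Agree" = 1, "Agree" = 2, "Strongly Agree" = 3)
-
-    # Step 1: prefer version 01; fall back to version 02 when 01 is empty
-    combined <- ifelse(!is.na(dat$`01past5PrefItem_1`) &
-                         dat$`01past5PrefItem_1` != "",
-                       dat$`01past5PrefItem_1`, dat$`02past5PrefItem_1`)
-
-    # Step 2: look up the numeric code; unmatched or empty text becomes NA
-    dat$past_5_pref_read <- unname(likert_map[combined])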
-
-================================================================================
-SCRIPT 02: dataP 02 - recode present VARS.r
-================================================================================
-
-PURPOSE:
-  Recodes present-time Likert scale text responses to numeric values.
-
-VARIABLES CREATED: 15 total
-
-SOURCE COLUMNS:
-  - prePrefItem_1 through prePrefItem_5 (5 columns)
-  - prePersItem_1 through prePersItem_5 (5 columns)
-  - preValItem_1 through preValItem_5 (5 columns)
-
-TARGET VARIABLES:
-  Present Time (15 variables):
-  - present_pref_read, present_pref_music, present_pref_tv, present_pref_nap,
-    present_pref_travel
-  - present_pers_extravert, present_pers_critical, present_pers_dependable,
-    present_pers_anxious, present_pers_complex
-  - present_val_obey, present_val_trad, present_val_opinion,
-    present_val_performance, present_val_justice
-
-TRANSFORMATION LOGIC:
-  Recode text responses to numeric values:
-    "Strongly Disagree"          → -3
-    "Disagree"                   → -2
-    "Somewhat Disagree"          → -1
-    "Neither Agree nor Disagree" → 0
-    "Somewhat Agree"             → 1
-    "Agree"                      → 2
-    "Strongly Agree"             → 3
-    Empty/Missing                → NA
-
-SPECIAL NOTE:
-  Present time uses "present_pref_tv" (lowercase) while past/future use
-  "past_5_pref_TV" (uppercase). This is intentional and preserved from the
-  original data structure.
-
-
-================================================================================
-SCRIPT 03: dataP 03 - recode DGEN vars.r
-================================================================================
-
-PURPOSE:
-  Combines DGEN (domain general) responses from two survey versions (01 and
-  02). These are single-item measures for each domain/time combination.
-  NO RECODING - just copies numeric values as-is.
-
-VARIABLES CREATED: 12 total (3 domains × 4 time periods)
-
-SOURCE COLUMNS:
-  - Set A: 01past5PrefDGEN_1, 01past5PersDGEN_1, 01past5ValDGEN_1, etc.
-  - Set B: 02past5PrefDGEN_1, 02past5PersDGEN_1, 02past5ValDGEN_1, etc.
-
-TARGET VARIABLES:
-  - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val
-  - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val
-  - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val
-  - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val
-
-TRANSFORMATION LOGIC:
-  - If Set A (01) has a value, use Set A
-  - If Set A is empty, use Set B (02)
-  - NO RECODING: Values are copied directly as numeric
-
-SPECIAL NOTES:
-  - Future columns in raw data use "_8" suffix for Pref/Pers items
-  - Future Val columns use "ValuesDGEN" spelling in Set A, "ValDGEN" in Set B
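-
-  Because there is no recoding here, the whole transformation is a coalesce.
-  Illustrative sketch for one target (same assumptions as above: a data frame
-  named dat with numeric source columns):
-
-    a <- dat$`01past5PrefDGEN_1`
-    b <- dat$`02past5PrefDGEN_1`
-    # take version 01 where present, otherwise fall back to version 02
-    dat$DGEN_past_5_Pref <- ifelse(!is.na(a), a, b)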
-
-================================================================================
-SCRIPT 04: dataP 04 - DGEN means.r
-================================================================================
-
-PURPOSE:
-  Calculates mean DGEN scores by averaging the three domain scores
-  (Preferences, Personality, Values) for each time period.
-
-VARIABLES CREATED: 4 total (1 per time period)
-
-SOURCE COLUMNS:
-  - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val
-  - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val
-  - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val
-  - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val
-
-TARGET VARIABLES:
-  - DGEN_past_5_mean
-  - DGEN_past_10_mean
-  - DGEN_fut_5_mean
-  - DGEN_fut_10_mean
-
-TRANSFORMATION LOGIC:
-  Each mean = (Pref + Pers + Val) / 3
-  - NA values are excluded from calculation (na.rm = TRUE)
-
-
-================================================================================
-SCRIPT 05: dataP 05 - recode scales VARS.r
-================================================================================
-
-PURPOSE:
-  Processes two cognitive scales:
-  1. AOT (Actively Open-minded Thinking): 8-item scale with reverse coding
-  2. CRT (Cognitive Reflection Test): 3-item test with correct/intuitive
-     scoring
-
-VARIABLES CREATED: 3 total
-
-SOURCE COLUMNS:
-  AOT Scale:
-  - aot_1, aot_2, aot_3, aot_4, aot_5, aot_6, aot_7, aot_8
-  CRT Test:
-  - crt_1, crt_2, crt_3
-
-TARGET VARIABLES:
-  - aot_total (mean of 8 items with reverse coding)
-  - crt_correct (proportion of correct answers)
-  - crt_int (proportion of intuitive/incorrect answers)
-
-TRANSFORMATION LOGIC:
-
-  AOT Scale (aot_total):
-  1. Items 4, 5, 6, 7 are reverse coded by multiplying by -1
-  2. Calculate mean of all 8 items (with reverse coding applied)
-  3. Original source values are NOT modified in the dataframe
-  4. NA values excluded from calculation (na.rm = TRUE)
-
-  CRT Correct (crt_correct):
-    Correct answers:
-    - crt_1: "5 cents"
-    - crt_2: "5 minutes"
-    - crt_3: "47 days"
-    Calculation: (Number of correct answers) / (Number of non-missing answers)
-
-  CRT Intuitive (crt_int):
-    Intuitive (common incorrect) answers:
-    - crt_1: "10 cents"
-    - crt_2: "100 minutes"
-    - crt_3: "24 days"
-    Calculation: (Number of intuitive answers) / (Number of non-missing
-    answers)
-
-SPECIAL NOTES:
-  - CRT scoring is case-insensitive and trims whitespace
-  - Both CRT scores are proportions (0.00 to 1.00)
-  - Empty/missing CRT responses are excluded from the denominator
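-
-  Illustrative sketch of both scoring rules (not the script's verbatim code;
-  assumes dat holds numeric aot_* columns and character crt_* columns):
-
-    # AOT: reverse-code a working copy of items 4-7, leave dat untouched
-    aot <- dat[, paste0("aot_", 1:8)]
-    aot[, c(4, 5, 6, 7)] <- -aot[, c(4, 5, 6, 7)]
-    dat$aot_total <- rowMeans(aot, na.rm = TRUE)
-
-    # CRT: case-insensitive, whitespace-trimmed match against the answer key
-    correct <- c(crt_1 = "5 cents", crt_2 = "5 minutes", crt_3 = "47 days")
-    crt <- trimws(tolower(as.matrix(dat[, names(correct)])))
-    crt[crt == ""] <- NA                     # empties drop out of denominator
-    hits <- crt == matrix(tolower(correct), nrow(crt), 3, byrow = TRUE)
-    dat$crt_correct <- rowSums(hits, na.rm = TRUE) / rowSums(!is.na(crt))
-    # crt_int is the same computation against the intuitive-answer key;
-    # a row with no non-missing answers yields NaN in this sketch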
================================================================================
SCRIPT 06: dataP 06 - time interval differences.r
================================================================================

PURPOSE:
  Calculates absolute differences between time intervals to measure perceived
  change across time periods for all 15 items.

VARIABLES CREATED: 90 total (6 difference types × 15 items)

SOURCE COLUMNS:
  - present_pref_read through present_val_justice (15 columns)
  - past_5_pref_read through past_5_val_justice (15 columns)
  - past_10_pref_read through past_10_val_justice (15 columns)
  - fut_5_pref_read through fut_5_val_justice (15 columns)
  - fut_10_pref_read through fut_10_val_justice (15 columns)

TARGET VARIABLES (by difference type):

  NPast_5 (Present vs Past 5 years) - 15 variables:
    Formula: |present - past_5|
  - NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, NPast_5_pref_nap,
    NPast_5_pref_travel
  - NPast_5_pers_extravert, NPast_5_pers_critical, NPast_5_pers_dependable,
    NPast_5_pers_anxious, NPast_5_pers_complex
  - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion,
    NPast_5_val_performance, NPast_5_val_justice

  NPast_10 (Present vs Past 10 years) - 15 variables:
    Formula: |present - past_10|
  - NPast_10_pref_read, NPast_10_pref_music, NPast_10_pref_TV,
    NPast_10_pref_nap, NPast_10_pref_travel
  - NPast_10_pers_extravert, NPast_10_pers_critical, NPast_10_pers_dependable,
    NPast_10_pers_anxious, NPast_10_pers_complex
  - NPast_10_val_obey, NPast_10_val_trad, NPast_10_val_opinion,
    NPast_10_val_performance, NPast_10_val_justice

  NFut_5 (Present vs Future 5 years) - 15 variables:
    Formula: |present - fut_5|
  - NFut_5_pref_read, NFut_5_pref_music, NFut_5_pref_TV, NFut_5_pref_nap,
    NFut_5_pref_travel
  - NFut_5_pers_extravert, NFut_5_pers_critical, NFut_5_pers_dependable,
    NFut_5_pers_anxious, NFut_5_pers_complex
  - NFut_5_val_obey, NFut_5_val_trad, NFut_5_val_opinion,
    NFut_5_val_performance, NFut_5_val_justice

  NFut_10 (Present vs Future 10 years) - 15 variables:
    Formula: |present - fut_10|
  - NFut_10_pref_read, NFut_10_pref_music, NFut_10_pref_TV, NFut_10_pref_nap,
    NFut_10_pref_travel
  - NFut_10_pers_extravert, NFut_10_pers_critical, NFut_10_pers_dependable,
    NFut_10_pers_anxious, NFut_10_pers_complex
  - NFut_10_val_obey, NFut_10_val_trad, NFut_10_val_opinion,
    NFut_10_val_performance, NFut_10_val_justice

  5.10past (Past 5 vs Past 10 years) - 15 variables:
    Formula: |past_5 - past_10|
  - 5.10past_pref_read, 5.10past_pref_music, 5.10past_pref_TV,
    5.10past_pref_nap, 5.10past_pref_travel
  - 5.10past_pers_extravert, 5.10past_pers_critical, 5.10past_pers_dependable,
    5.10past_pers_anxious, 5.10past_pers_complex
  - 5.10past_val_obey, 5.10past_val_trad, 5.10past_val_opinion,
    5.10past_val_performance, 5.10past_val_justice

  5.10fut (Future 5 vs Future 10 years) - 15 variables:
    Formula: |fut_5 - fut_10|
  - 5.10fut_pref_read, 5.10fut_pref_music, 5.10fut_pref_TV, 5.10fut_pref_nap,
    5.10fut_pref_travel
  - 5.10fut_pers_extravert, 5.10fut_pers_critical, 5.10fut_pers_dependable,
    5.10fut_pers_anxious, 5.10fut_pers_complex
  - 5.10fut_val_obey, 5.10fut_val_trad, 5.10fut_val_opinion,
    5.10fut_val_performance, 5.10fut_val_justice

TRANSFORMATION LOGIC:
  All calculations use absolute differences:
  - NPast_5: |present_[item] - past_5_[item]|
  - NPast_10: |present_[item] - past_10_[item]|
  - NFut_5: |present_[item] - fut_5_[item]|
  - NFut_10: |present_[item] - fut_10_[item]|
  - 5.10past: |past_5_[item] - past_10_[item]|
  - 5.10fut: |fut_5_[item] - fut_10_[item]|

  Result: Non-negative values representing the magnitude of change
  Missing values in either source column result in NA

SPECIAL NOTES:
  - Present time uses "pref_tv" (lowercase) while past/future use "pref_TV"
    (uppercase), so the script handles this naming inconsistency
  - All values are absolute differences (non-negative)
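A sketch of the pattern behind the 90 differences, including the tv/TV case
handling noted above (the "df" and "items" names are assumptions for
illustration, not the script's actual code):

  items <- c(paste0("pref_", c("read", "music", "TV", "nap", "travel")),
             paste0("pers_", c("extravert", "critical", "dependable",
                               "anxious", "complex")),
             paste0("val_",  c("obey", "trad", "opinion", "performance",
                               "justice")))

  # present-time columns use lowercase "tv"
  present_col <- function(it) {
    paste0("present_", sub("pref_TV", "pref_tv", it, fixed = TRUE))
  }

  for (it in items) {
    df[[paste0("NPast_5_",  it)]] <- abs(df[[present_col(it)]] - df[[paste0("past_5_",  it)]])
    df[[paste0("NPast_10_", it)]] <- abs(df[[present_col(it)]] - df[[paste0("past_10_", it)]])
    df[[paste0("NFut_5_",   it)]] <- abs(df[[present_col(it)]] - df[[paste0("fut_5_",   it)]])
    df[[paste0("NFut_10_",  it)]] <- abs(df[[present_col(it)]] - df[[paste0("fut_10_",  it)]])
    df[[paste0("5.10past_", it)]] <- abs(df[[paste0("past_5_", it)]] - df[[paste0("past_10_", it)]])
    df[[paste0("5.10fut_",  it)]] <- abs(df[[paste0("fut_5_",  it)]] - df[[paste0("fut_10_",  it)]])
  }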
================================================================================
SCRIPT 07: dataP 07 - domain means.r
================================================================================

PURPOSE:
  Calculates domain-level means by averaging the 5 items within each domain
  (Preferences, Personality, Values) for each of the 6 time interval
  difference types.

VARIABLES CREATED: 18 total (6 time intervals × 3 domains)

SOURCE COLUMNS:
  - NPast_5_pref_read through NPast_5_val_justice (15 columns)
  - NPast_10_pref_read through NPast_10_val_justice (15 columns)
  - NFut_5_pref_read through NFut_5_val_justice (15 columns)
  - NFut_10_pref_read through NFut_10_val_justice (15 columns)
  - 5.10past_pref_read through 5.10past_val_justice (15 columns)
  - 5.10fut_pref_read through 5.10fut_val_justice (15 columns)
  Total: 90 difference columns (created in Script 06)

TARGET VARIABLES:
  NPast_5 Domain Means (3 variables):
  - NPast_5_pref_MEAN (mean of 5 preference items)
  - NPast_5_pers_MEAN (mean of 5 personality items)
  - NPast_5_val_MEAN (mean of 5 values items)

  NPast_10 Domain Means (3 variables):
  - NPast_10_pref_MEAN
  - NPast_10_pers_MEAN
  - NPast_10_val_MEAN

  NFut_5 Domain Means (3 variables):
  - NFut_5_pref_MEAN
  - NFut_5_pers_MEAN
  - NFut_5_val_MEAN

  NFut_10 Domain Means (3 variables):
  - NFut_10_pref_MEAN
  - NFut_10_pers_MEAN
  - NFut_10_val_MEAN

  5.10past Domain Means (3 variables):
  - 5.10past_pref_MEAN
  - 5.10past_pers_MEAN
  - 5.10past_val_MEAN

  5.10fut Domain Means (3 variables):
  - 5.10fut_pref_MEAN
  - 5.10fut_pers_MEAN
  - 5.10fut_val_MEAN

TRANSFORMATION LOGIC:
  Each domain mean = average of 5 items within that domain

  Example for NPast_5_pref_MEAN:
    = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV,
           NPast_5_pref_nap, NPast_5_pref_travel)

  Example for NFut_10_pers_MEAN:
    = mean(NFut_10_pers_extravert, NFut_10_pers_critical,
           NFut_10_pers_dependable, NFut_10_pers_anxious,
           NFut_10_pers_complex)

  NA values excluded from calculation (na.rm = TRUE)

PURPOSE OF DOMAIN MEANS:
  - Provides higher-level summary of perceived change by domain
  - Reduces item-level noise by aggregating across related items
  - Enables domain-level comparisons across time intervals
  - Parallel to Script 04 (DGEN means) but for difference scores instead of
    raw DGEN ratings

SPECIAL NOTES:
  - This script depends on Script 06 being run first
  - Creates domain-level aggregates of absolute difference scores
  - All means are averages of non-negative values (absolute differences)


================================================================================
SCRIPT 08: dataP 08 - DGEN 510 vars.r
================================================================================

PURPOSE:
  Calculates absolute differences between 5-year and 10-year DGEN ratings for
  both Past and Future time directions. These variables measure the perceived
  difference in domain-general change between the two time intervals.

VARIABLES CREATED: 6 total (3 domains × 2 time directions)

SOURCE COLUMNS:
  - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val
  - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val
  - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val
  - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val
  Total: 12 DGEN columns (created in Script 03)

TARGET VARIABLES:
  Past Direction (3 variables):
  - X5_10DGEN_past_pref (|DGEN_past_5_Pref - DGEN_past_10_Pref|)
  - X5_10DGEN_past_pers (|DGEN_past_5_Pers - DGEN_past_10_Pers|)
  - X5_10DGEN_past_val (|DGEN_past_5_Val - DGEN_past_10_Val|)

  Future Direction (3 variables):
  - X5_10DGEN_fut_pref (|DGEN_fut_5_Pref - DGEN_fut_10_Pref|)
  - X5_10DGEN_fut_pers (|DGEN_fut_5_Pers - DGEN_fut_10_Pers|)
  - X5_10DGEN_fut_val (|DGEN_fut_5_Val - DGEN_fut_10_Val|)

TRANSFORMATION LOGIC:
  Formula: |DGEN_5 - DGEN_10|

  All calculations use absolute differences:
  - Past Preferences: |DGEN_past_5_Pref - DGEN_past_10_Pref|
  - Past Personality: |DGEN_past_5_Pers - DGEN_past_10_Pers|
  - Past Values: |DGEN_past_5_Val - DGEN_past_10_Val|
  - Future Preferences: |DGEN_fut_5_Pref - DGEN_fut_10_Pref|
  - Future Personality: |DGEN_fut_5_Pers - DGEN_fut_10_Pers|
  - Future Values: |DGEN_fut_5_Val - DGEN_fut_10_Val|

  Result: Non-negative values representing the magnitude of difference
  Missing values in either source column result in NA

SPECIAL NOTES:
  - Variable names use "X" prefix because R automatically adds it to column
    names starting with numbers (5_10 becomes X5_10)
  - This script depends on Script 03 being run first
  - Measures interval effects within time direction (past vs future)
  - Parallel to Script 06's 5.10past and 5.10fut variables but for DGEN scores
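A sketch of these six differences (assuming "df" as before; the fut_* lines
follow the same pattern):

  df$X5_10DGEN_past_pref <- abs(df$DGEN_past_5_Pref - df$DGEN_past_10_Pref)
  df$X5_10DGEN_past_pers <- abs(df$DGEN_past_5_Pers - df$DGEN_past_10_Pers)
  df$X5_10DGEN_past_val  <- abs(df$DGEN_past_5_Val  - df$DGEN_past_10_Val)
  # ... and likewise for the three X5_10DGEN_fut_* variables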
================================================================================
SCRIPT 09: dataP 09 - interval x direction means.r
================================================================================

PURPOSE:
  Calculates comprehensive mean scores by averaging item-level differences
  across intervals and directions. Creates both narrow-scope means (single
  time interval) and broad-scope global means (combining multiple intervals).
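Before the detailed listings below, the two patterns can be sketched in a few
lines of R (assuming "df" and the 15-item naming used throughout this README;
the real script may build the column sets differently):

  items <- c(paste0("pref_", c("read", "music", "TV", "nap", "travel")),
             paste0("pers_", c("extravert", "critical", "dependable",
                               "anxious", "complex")),
             paste0("val_",  c("obey", "trad", "opinion", "performance",
                               "justice")))

  np5  <- paste0("NPast_5_",  items)   # the 15 Script 06 columns for one interval
  np10 <- paste0("NPast_10_", items)

  # Narrow-scope: all 15 items within a single interval
  df$NPast_5_mean <- rowMeans(df[np5], na.rm = TRUE)

  # Global-scope: all 30 items across two related intervals
  df$NPast_global_mean <- rowMeans(df[c(np5, np10)], na.rm = TRUE)

Building the column names from the item list (rather than pattern-matching on
prefixes) keeps the Script 07 *_MEAN columns out of these averages.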
- -VARIABLES CREATED: 11 total (6 narrow-scope + 5 global-scope) - -SOURCE COLUMNS: - All 90 difference variables created in Script 06: - - NPast_5_[domain]_[item] (15 variables) - - NPast_10_[domain]_[item] (15 variables) - - NFut_5_[domain]_[item] (15 variables) - - NFut_10_[domain]_[item] (15 variables) - - X5.10past_[domain]_[item] (15 variables) - - X5.10fut_[domain]_[item] (15 variables) - -TARGET VARIABLES: - - Narrow-Scope Means (15 source items each): - - NPast_5_mean (mean across all 15 NPast_5 items) - - NPast_10_mean (mean across all 15 NPast_10 items) - - NFut_5_mean (mean across all 15 NFut_5 items) - - NFut_10_mean (mean across all 15 NFut_10 items) - - X5.10past_mean (mean across all 15 X5.10past items) - - X5.10fut_mean (mean across all 15 X5.10fut items) - - Global-Scope Means (30 source items each): - - NPast_global_mean (NPast_5 + NPast_10: all past intervals) - - NFut_global_mean (NFut_5 + NFut_10: all future intervals) - - X5.10_global_mean (X5.10past + X5.10fut: all 5-vs-10 intervals) - - N5_global_mean (NPast_5 + NFut_5: all 5-year intervals) - - N10_global_mean (NPast_10 + NFut_10: all 10-year intervals) - -TRANSFORMATION LOGIC: - - Narrow-Scope Means (15 items each): - Each mean averages all 15 difference items within one time interval - - Example for NPast_5_mean: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel, - NPast_5_pers_extravert, NPast_5_pers_critical, - NPast_5_pers_dependable, NPast_5_pers_anxious, - NPast_5_pers_complex, - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice) - - Global-Scope Means (30 items each): - Each mean averages 30 difference items across two related intervals - - Example for NPast_global_mean: - = mean(all 15 NPast_5 items + all 15 NPast_10 items) - Represents overall perceived change from present to any past timepoint - - Example for N5_global_mean: - = mean(all 15 NPast_5 items + all 15 NFut_5 items) - Represents overall perceived change at 5-year interval regardless of - direction - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF INTERVAL × DIRECTION MEANS: - - Narrow-scope means: Single-interval summaries across all domains and items - - Global-scope means: Cross-interval summaries for testing: - * Direction effects (past vs future) - * Interval effects (5-year vs 10-year) - * Combined temporal distance effects - - Enables comprehensive analysis of temporal self-perception patterns - - Reduces item-level and domain-level noise through broad aggregation - -QUALITY ASSURANCE: - - Script includes automated QA checks for first 5 rows - - Manually recalculates each mean and verifies against stored values - - Prints TRUE/FALSE match status for each variable - - Ensures calculation accuracy before further analysis - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - All means are averages of absolute difference scores (non-negative) - - Global means provide the broadest temporal self-perception summaries - - Naming convention uses "global" for 30-item means, no suffix for 15-item - - -================================================================================ -SUMMARY OF ALL CREATED VARIABLES -================================================================================ - -Total Variables Created: 219 - -By Script: - - Script 01: 60 variables (past/future recoded items) - - Script 02: 15 variables (present recoded items) - - Script 03: 12 variables (DGEN domain scores) - - 
Script 04: 4 variables (DGEN means) - - Script 05: 3 variables (AOT & CRT scales) - - Script 06: 90 variables (time interval differences) - - Script 07: 18 variables (domain means for differences) - - Script 08: 6 variables (DGEN 5-vs-10 differences) - - Script 09: 11 variables (interval × direction means) - -By Category: - - Time Period Items (75 total): - * Present: 15 items - * Past 5: 15 items - * Past 10: 15 items - * Future 5: 15 items - * Future 10: 15 items - - - DGEN Variables (22 total): - * Domain scores: 12 (3 domains × 4 time periods) - * Mean scores: 4 (1 per time period) - * 5-vs-10 differences: 6 (3 domains × 2 directions) - - - Cognitive Scales (3 total): - * AOT total - * CRT correct - * CRT intuitive - - - Time Differences (90 total): - * NPast_5: 15 differences - * NPast_10: 15 differences - * NFut_5: 15 differences - * NFut_10: 15 differences - * 5.10past: 15 differences - * 5.10fut: 15 differences - - - Domain Means for Differences (18 total): - * NPast_5: 3 domain means - * NPast_10: 3 domain means - * NFut_5: 3 domain means - * NFut_10: 3 domain means - * 5.10past: 3 domain means - * 5.10fut: 3 domain means - - - Interval × Direction Means (11 total): - * Narrow-scope means: 6 (NPast_5, NPast_10, NFut_5, NFut_10, - X5.10past, X5.10fut) - * Global-scope means: 5 (NPast_global, NFut_global, X5.10_global, - N5_global, N10_global) - - -================================================================================ -DATA PROCESSING NOTES -================================================================================ - -1. PROCESSING ORDER: - Scripts MUST be run in numerical order (01 → 09) as later scripts depend - on variables created by earlier scripts. - - Key Dependencies: - - Script 03 required before Script 04 (DGEN means need DGEN scores) - - Script 03 required before Script 08 (DGEN 5-vs-10 need DGEN scores) - - Script 06 required before Script 07 (domain means need differences) - - Script 06 required before Script 09 (interval × direction means need - differences) - -2. SURVEY VERSION HANDLING: - - Two survey versions (01 and 02) were used - - Scripts 01 and 03 combine these versions - - Preference given to version 01 when both exist - -3. MISSING DATA: - - Empty cells and NA values are preserved throughout processing - - Calculations use na.rm=TRUE to exclude missing values from means - - Difference calculations result in NA if either source value is missing - -4. QUALITY ASSURANCE: - - Each script includes QA checks with random row verification - - Manual calculation checks confirm proper transformations - - Column existence checks prevent errors from missing source data - - Script 09 includes comprehensive QA with first 5 rows for all 11 variables - -5. FILE SAVING: - - Most scripts save directly to eohi2.csv - - Scripts 04, 06, and 07 have commented-out write commands for review - - Scripts 08 and 09 save directly to eohi2.csv - - Each script overwrites existing target columns if present - -6. 
SPECIAL NAMING CONVENTIONS:
  - "pref_tv" vs "pref_TV" inconsistency maintained from source data
  - DGEN variables use underscores (DGEN_past_5_Pref)
  - Difference variables use descriptive prefixes (NPast_5_, 5.10past_)
  - "X" prefix added to variables starting with numbers (X5.10past_mean)
  - Global means use "_global_" to distinguish from narrow-scope means


================================================================================
ITEM REFERENCE GUIDE
================================================================================

15 Core Items (Used across all time periods):

PREFERENCES (5 items):
  1. pref_read - Reading preferences
  2. pref_music - Music preferences
  3. pref_TV/tv - TV watching preferences (note case variation)
  4. pref_nap - Napping preferences
  5. pref_travel - Travel preferences

PERSONALITY (5 items):
  6. pers_extravert - Extraverted personality
  7. pers_critical - Critical thinking personality
  8. pers_dependable - Dependable personality
  9. pers_anxious - Anxious personality
  10. pers_complex - Complex personality

VALUES (5 items):
  11. val_obey - Value of obedience
  12. val_trad - Value of tradition
  13. val_opinion - Value of expressing opinions
  14. val_performance - Value of performance
  15. val_justice - Value of justice


================================================================================
END OF DOCUMENTATION
================================================================================
Last Updated: October 8, 2025

diff --git a/.history/eohi2/README_Variable_Creation_20251008171443.txt b/.history/eohi2/README_Variable_Creation_20251008171443.txt
deleted file mode 100644
index e3d8555..0000000
--- a/.history/eohi2/README_Variable_Creation_20251008171443.txt
+++ /dev/null
@@ -1,921 +0,0 @@
================================================================================
EOHI2 DATA PROCESSING PIPELINE - VARIABLE CREATION DOCUMENTATION
================================================================================

This README documents the complete data processing pipeline for eohi2.csv.
All processing scripts should be run in the order listed below.

Source File: eohi2.csv
Processing Scripts: dataP 01 through dataP 14

NOTE: Scripts 01-09 in this snapshot are identical to the version documented
above and are not repeated here.
================================================================================
SCRIPT 10: dataP 10 - DGEN mean vars.r
================================================================================

PURPOSE:
  Calculates mean DGEN scores by averaging across different time combinations.
  Creates means for Past, Future, and interval-based (5-year, 10-year)
  groupings.
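A sketch of these combined means (column names per Scripts 03 and 04; "df" is
the working data frame, an assumption for illustration):

  # Direction-based: average the Script 04 time-period means within a direction
  df$DGEN_past_mean <- rowMeans(df[c("DGEN_past_5_mean", "DGEN_past_10_mean")], na.rm = TRUE)
  df$DGEN_fut_mean  <- rowMeans(df[c("DGEN_fut_5_mean",  "DGEN_fut_10_mean")],  na.rm = TRUE)

  # Interval-based: average across directions within an interval
  df$DGEN_5_mean  <- rowMeans(df[c("DGEN_past_5_mean",  "DGEN_fut_5_mean")],  na.rm = TRUE)
  df$DGEN_10_mean <- rowMeans(df[c("DGEN_past_10_mean", "DGEN_fut_10_mean")], na.rm = TRUE)

  # Domain-based: all four time periods within a domain (Script 03 columns)
  periods <- c("past_5", "past_10", "fut_5", "fut_10")
  df$DGEN_pref_mean <- rowMeans(df[paste0("DGEN_", periods, "_Pref")], na.rm = TRUE)
  df$DGEN_pers_mean <- rowMeans(df[paste0("DGEN_", periods, "_Pers")], na.rm = TRUE)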
VARIABLES CREATED: 6 total

SOURCE COLUMNS:
  - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val
  - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val
  - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val
  - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val
  - DGEN_past_5_mean, DGEN_past_10_mean, DGEN_fut_5_mean, DGEN_fut_10_mean
    (created in Script 04; used for the direction- and interval-based means)

TARGET VARIABLES:
  Direction-Based Means (2 variables):
  - DGEN_past_mean (mean of past_5_mean and past_10_mean)
  - DGEN_fut_mean (mean of fut_5_mean and fut_10_mean)

  Interval-Based Means (2 variables):
  - DGEN_5_mean (mean of past_5_mean and fut_5_mean)
  - DGEN_10_mean (mean of past_10_mean and fut_10_mean)

  Domain-Based Means (2 variables):
  - DGEN_pref_mean (mean across all 4 time periods for Preferences)
  - DGEN_pers_mean (mean across all 4 time periods for Personality)

TRANSFORMATION LOGIC:
  Direction-based:
  - DGEN_past_mean = mean(DGEN_past_5_mean, DGEN_past_10_mean)
  - DGEN_fut_mean = mean(DGEN_fut_5_mean, DGEN_fut_10_mean)

  Interval-based:
  - DGEN_5_mean = mean(DGEN_past_5_mean, DGEN_fut_5_mean)
  - DGEN_10_mean = mean(DGEN_past_10_mean, DGEN_fut_10_mean)

  Domain-based:
  - DGEN_pref_mean = mean across all 4 Pref scores
  - DGEN_pers_mean = mean across all 4 Pers scores

  NA values excluded from calculation (na.rm = TRUE)


================================================================================
SCRIPT 11: dataP 11 - CORRECT ehi vars.r
================================================================================

PURPOSE:
  Creates End-of-History Illusion (EHI) variables by calculating differences
  between Past and Future change scores for each item across different time
  intervals.
  Formula: NPast - NFut (positive values indicate greater past-present change)

VARIABLES CREATED: 45 total (15 items × 3 time intervals)

SOURCE COLUMNS:
  5-year intervals:
  - NPast_5_pref_read through NPast_5_val_justice (15 columns)
  - NFut_5_pref_read through NFut_5_val_justice (15 columns)

  10-year intervals:
  - NPast_10_pref_read through NPast_10_val_justice (15 columns)
  - NFut_10_pref_read through NFut_10_val_justice (15 columns)

  5-10 year change:
  - X5.10past_pref_read through X5.10past_val_justice (15 columns)
  - X5.10fut_pref_read through X5.10fut_val_justice (15 columns)

TARGET VARIABLES:
  5-Year EHI Variables (15 variables):
  - ehi5_pref_read, ehi5_pref_music, ehi5_pref_TV, ehi5_pref_nap,
    ehi5_pref_travel
  - ehi5_pers_extravert, ehi5_pers_critical, ehi5_pers_dependable,
    ehi5_pers_anxious, ehi5_pers_complex
  - ehi5_val_obey, ehi5_val_trad, ehi5_val_opinion, ehi5_val_performance,
    ehi5_val_justice

  10-Year EHI Variables (15 variables):
  - ehi10_pref_read, ehi10_pref_music, ehi10_pref_TV, ehi10_pref_nap,
    ehi10_pref_travel
  - ehi10_pers_extravert, ehi10_pers_critical, ehi10_pers_dependable,
    ehi10_pers_anxious, ehi10_pers_complex
  - ehi10_val_obey, ehi10_val_trad, ehi10_val_opinion, ehi10_val_performance,
    ehi10_val_justice

  5-10 Year Change EHI Variables (15 variables):
  - ehi5.10_pref_read, ehi5.10_pref_music, ehi5.10_pref_TV, ehi5.10_pref_nap,
    ehi5.10_pref_travel
  - ehi5.10_pers_extravert, ehi5.10_pers_critical, ehi5.10_pers_dependable,
    ehi5.10_pers_anxious, ehi5.10_pers_complex
  - ehi5.10_val_obey, ehi5.10_val_trad, ehi5.10_val_opinion,
    ehi5.10_val_performance, ehi5.10_val_justice

TRANSFORMATION LOGIC:
  Formula: NPast - NFut

  All calculations use signed differences:
  - ehi5_[item] = NPast_5_[item] - NFut_5_[item]
  - ehi10_[item] = NPast_10_[item] - NFut_10_[item]
  - ehi5.10_[item] = X5.10past_[item] - X5.10fut_[item]

  Result: Positive = greater past change, Negative = greater future change
  Missing values in either source column result in NA

QUALITY ASSURANCE:
  - Comprehensive QA checks for all 45 variables across all rows
  - First 5 rows displayed with detailed calculations showing source values,
    computed differences, and stored values
  - Pass/Fail status for each variable reported
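A sketch of the EHI computation across all 45 variables (assumes "df" and the
same "items" vector as in the Script 09 sketch; illustrative, not the script's
actual code):

  items <- c(paste0("pref_", c("read", "music", "TV", "nap", "travel")),
             paste0("pers_", c("extravert", "critical", "dependable",
                               "anxious", "complex")),
             paste0("val_",  c("obey", "trad", "opinion", "performance",
                               "justice")))

  # Signed (not absolute) differences: positive means more perceived past change
  for (it in items) {
    df[[paste0("ehi5_",    it)]] <- df[[paste0("NPast_5_",   it)]] - df[[paste0("NFut_5_",   it)]]
    df[[paste0("ehi10_",   it)]] <- df[[paste0("NPast_10_",  it)]] - df[[paste0("NFut_10_",  it)]]
    df[[paste0("ehi5.10_", it)]] <- df[[paste0("X5.10past_", it)]] - df[[paste0("X5.10fut_", it)]]
  }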
================================================================================
SCRIPT 12: dataP 12 - CORRECT DGEN ehi vars.r
================================================================================

PURPOSE:
  Creates domain-general EHI variables by calculating differences between Past
  and Future DGEN responses. These are the domain-general parallel to
  Script 11's domain-specific EHI variables.

VARIABLES CREATED: 6 total (3 domains × 2 time intervals)

SOURCE COLUMNS:
  - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val
  - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val
  - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val
  - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val

TARGET VARIABLES:
  5-Year DGEN EHI (3 variables):
  - ehiDGEN_5_Pref
  - ehiDGEN_5_Pers
  - ehiDGEN_5_Val

  10-Year DGEN EHI (3 variables):
  - ehiDGEN_10_Pref
  - ehiDGEN_10_Pers
  - ehiDGEN_10_Val

TRANSFORMATION LOGIC:
  Formula: DGEN_past - DGEN_fut

  All calculations use signed differences:
  - ehiDGEN_5_Pref = DGEN_past_5_Pref - DGEN_fut_5_Pref
  - ehiDGEN_5_Pers = DGEN_past_5_Pers - DGEN_fut_5_Pers
  - ehiDGEN_5_Val = DGEN_past_5_Val - DGEN_fut_5_Val
  - ehiDGEN_10_Pref = DGEN_past_10_Pref - DGEN_fut_10_Pref
  - ehiDGEN_10_Pers = DGEN_past_10_Pers - DGEN_fut_10_Pers
  - ehiDGEN_10_Val = DGEN_past_10_Val - DGEN_fut_10_Val

  Result: Positive = greater past change, Negative = greater future change

QUALITY ASSURANCE:
  - QA checks for all 6 variables across all rows
  - First 5 rows displayed with detailed calculations
  - Pass/Fail status for each variable reported


================================================================================
SCRIPT 13: datap 13 - ehi domain specific means.r
================================================================================

PURPOSE:
  Calculates domain-level mean EHI scores by averaging the 5 items within each
  domain (Preferences, Personality, Values) for each time interval.
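A sketch of one interval's domain means ("df" assumed, column names per this
README):

  pref_items <- c("read", "music", "TV", "nap", "travel")
  pers_items <- c("extravert", "critical", "dependable", "anxious", "complex")
  val_items  <- c("obey", "trad", "opinion", "performance", "justice")

  df$ehi5_pref_MEAN <- rowMeans(df[paste0("ehi5_pref_", pref_items)], na.rm = TRUE)
  df$ehi5_pers_MEAN <- rowMeans(df[paste0("ehi5_pers_", pers_items)], na.rm = TRUE)
  df$ehi5_val_MEAN  <- rowMeans(df[paste0("ehi5_val_",  val_items)],  na.rm = TRUE)
  # the ehi10_* and ehi5.10_* domain means follow the same pattern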
VARIABLES CREATED: 9 total (3 domains × 3 time intervals)

SOURCE COLUMNS:
  - ehi5_pref_read through ehi5_val_justice (15 columns)
  - ehi10_pref_read through ehi10_val_justice (15 columns)
  - ehi5.10_pref_read through ehi5.10_val_justice (15 columns)

TARGET VARIABLES:
  5-Year Domain Means (3 variables):
  - ehi5_pref_MEAN (mean of 5 preference items)
  - ehi5_pers_MEAN (mean of 5 personality items)
  - ehi5_val_MEAN (mean of 5 values items)

  10-Year Domain Means (3 variables):
  - ehi10_pref_MEAN
  - ehi10_pers_MEAN
  - ehi10_val_MEAN

  5-10 Year Change Domain Means (3 variables):
  - ehi5.10_pref_MEAN
  - ehi5.10_pers_MEAN
  - ehi5.10_val_MEAN

TRANSFORMATION LOGIC:
  Each domain mean = average of 5 items within that domain

  Example for ehi5_pref_MEAN:
    = mean(ehi5_pref_read, ehi5_pref_music, ehi5_pref_TV,
           ehi5_pref_nap, ehi5_pref_travel)

  NA values excluded from calculation (na.rm = TRUE)

QUALITY ASSURANCE:
  - Comprehensive QA for all 9 variables across all rows
  - First 5 rows displayed for multiple domain means
  - Pass/Fail status for each variable


================================================================================
SCRIPT 14: datap 14 - all ehi global means.r
================================================================================

PURPOSE:
  Calculates global EHI means by averaging domain-level means. Creates the
  highest-level summary scores for EHI across both domain-general and
  domain-specific measures.

VARIABLES CREATED: 5 total

SOURCE COLUMNS:
  - ehiDGEN_5_Pref, ehiDGEN_5_Pers, ehiDGEN_5_Val
  - ehiDGEN_10_Pref, ehiDGEN_10_Pers, ehiDGEN_10_Val
  - ehi5_pref_MEAN, ehi5_pers_MEAN, ehi5_val_MEAN
  - ehi10_pref_MEAN, ehi10_pers_MEAN, ehi10_val_MEAN
  - ehi5.10_pref_MEAN, ehi5.10_pers_MEAN, ehi5.10_val_MEAN

TARGET VARIABLES:
  DGEN Global Means (2 variables):
  - ehiDGEN_5_mean (mean of 3 DGEN domains for 5-year)
  - ehiDGEN_10_mean (mean of 3 DGEN domains for 10-year)

  Domain-Specific Global Means (3 variables):
  - ehi5_global_mean (mean of 3 domain means for 5-year)
  - ehi10_global_mean (mean of 3 domain means for 10-year)
  - ehi5.10_global_mean (mean of 3 domain means for 5-10 change)

TRANSFORMATION LOGIC:
  Each global mean = average of 3 domain-level scores

  Example for ehiDGEN_5_mean:
    = mean(ehiDGEN_5_Pref, ehiDGEN_5_Pers, ehiDGEN_5_Val)

  Example for ehi5_global_mean:
    = mean(ehi5_pref_MEAN, ehi5_pers_MEAN, ehi5_val_MEAN)

  NA values excluded from calculation (na.rm = TRUE)

QUALITY ASSURANCE:
  - QA for all 5 global means across all rows
  - First 5 rows displayed with detailed calculations
  - Values shown with 5 decimal precision
  - Pass/Fail status for each variable


================================================================================
SUMMARY OF ALL CREATED VARIABLES
================================================================================

Total Variables Created: 290

By Script:
  - Script 01: 60 variables (past/future recoded items)
  - Script 02: 15 variables (present recoded items)
  - Script 03: 12 variables (DGEN domain scores)
  - Script 04: 4 variables (DGEN means)
  - Script 05: 3 variables (AOT & CRT scales)
  - Script 06: 90 variables (time interval differences)
  - Script 07: 18 variables (domain means for differences)
  - Script 08: 6 variables (DGEN 5-vs-10 differences)
  - Script 09: 11 variables (interval × direction means)
  - Script 10: 6 variables (combined DGEN means)
  - Script 11: 45 variables (item-level EHI differences)
  - Script 12: 6 variables (DGEN EHI differences)
  - Script 13: 9 variables (EHI domain means)
  - Script 14: 5 variables (EHI global means)
By Category:
  - Time Period Items (75 total):
    * Present: 15 items
    * Past 5: 15 items
    * Past 10: 15 items
    * Future 5: 15 items
    * Future 10: 15 items

  - DGEN Variables (28 total):
    * Domain scores: 12 (3 domains × 4 time periods)
    * Mean scores: 4 (1 per time period)
    * 5-vs-10 differences: 6 (3 domains × 2 directions)
    * Combined means: 6 (direction-, interval-, and domain-based; Script 10)

  - Cognitive Scales (3 total):
    * AOT total
    * CRT correct
    * CRT intuitive

  - Time Differences (90 total):
    * NPast_5: 15 differences
    * NPast_10: 15 differences
    * NFut_5: 15 differences
    * NFut_10: 15 differences
    * 5.10past: 15 differences
    * 5.10fut: 15 differences

  - Domain Means for Differences (18 total):
    * NPast_5: 3 domain means
    * NPast_10: 3 domain means
    * NFut_5: 3 domain means
    * NFut_10: 3 domain means
    * 5.10past: 3 domain means
    * 5.10fut: 3 domain means

  - Interval × Direction Means (11 total):
    * Narrow-scope means: 6 (NPast_5, NPast_10, NFut_5, NFut_10,
      X5.10past, X5.10fut)
    * Global-scope means: 5 (NPast_global, NFut_global, X5.10_global,
      N5_global, N10_global)

  - EHI Variables (65 total):
    * Item-level differences: 45 (Script 11)
    * DGEN differences: 6 (Script 12)
    * Domain means: 9 (Script 13)
    * Global means: 5 (Script 14)


================================================================================
DATA PROCESSING NOTES
================================================================================

1. PROCESSING ORDER:
   Scripts MUST be run in numerical order (01 → 14) as later scripts depend
   on variables created by earlier scripts.

   Key Dependencies:
   - Script 03 required before Scripts 04, 08, 10, and 12 (all use DGEN scores)
   - Script 04 required before Script 10 (combined means average the
     time-period means)
   - Script 06 required before Scripts 07, 09, and 11 (all use the
     difference scores)
   - Script 11 required before Script 13 (domain means need item-level EHI)
   - Scripts 12 and 13 required before Script 14 (global means need DGEN EHI
     and domain means)

2. SURVEY VERSION HANDLING:
   - Two survey versions (01 and 02) were used
   - Scripts 01 and 03 combine these versions
   - Preference given to version 01 when both exist

3. MISSING DATA:
   - Empty cells and NA values are preserved throughout processing
   - Calculations use na.rm=TRUE to exclude missing values from means
   - Difference calculations result in NA if either source value is missing

4. QUALITY ASSURANCE:
   - Each script includes QA checks with random row verification
   - Manual calculation checks confirm proper transformations
   - Column existence checks prevent errors from missing source data
   - Script 09 includes comprehensive QA with first 5 rows for all 11 variables

5. FILE SAVING:
   - Most scripts save directly to eohi2.csv
   - Scripts 04, 06, and 07 have commented-out write commands for review
   - Scripts 08 and 09 save directly to eohi2.csv
   - Each script overwrites existing target columns if present

6. SPECIAL NAMING CONVENTIONS:
   - "pref_tv" vs "pref_TV" inconsistency maintained from source data
   - DGEN variables use underscores (DGEN_past_5_Pref)
   - Difference variables use descriptive prefixes (NPast_5_, 5.10past_)
   - "X" prefix added to variables starting with numbers (X5.10past_mean)
   - Global means use "_global_" to distinguish from narrow-scope means
================================================================================
ITEM REFERENCE GUIDE
================================================================================

15 Core Items (Used across all time periods):

PREFERENCES (5 items):
  1. pref_read - Reading preferences
  2. pref_music - Music preferences
  3. pref_TV/tv - TV watching preferences (note case variation)
  4. pref_nap - Napping preferences
  5. pref_travel - Travel preferences

PERSONALITY (5 items):
  6. pers_extravert - Extraverted personality
  7. pers_critical - Critical thinking personality
  8. pers_dependable - Dependable personality
  9. pers_anxious - Anxious personality
  10. pers_complex - Complex personality

VALUES (5 items):
  11. val_obey - Value of obedience
  12. val_trad - Value of tradition
  13. val_opinion - Value of expressing opinions
  14. val_performance - Value of performance
  15. val_justice - Value of justice


================================================================================
END OF DOCUMENTATION
================================================================================
Last Updated: October 8, 2025

diff --git a/.history/eohi2/README_Variable_Creation_20251008171453.txt b/.history/eohi2/README_Variable_Creation_20251008171453.txt
deleted file mode 100644
index b056cf2..0000000
--- a/.history/eohi2/README_Variable_Creation_20251008171453.txt
+++ /dev/null
@@ -1,926 +0,0 @@
================================================================================
EOHI2 DATA PROCESSING PIPELINE - VARIABLE CREATION DOCUMENTATION
================================================================================

This README documents the complete data processing pipeline for eohi2.csv.
All processing scripts should be run in the order listed below.

Source File: eohi2.csv
Processing Scripts: dataP 01 through dataP 09

NOTE: Scripts 01-06 in this snapshot are identical to the version documented
above and are not repeated here.
"Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -ITEM DOMAINS: - - Preferences (pref): Reading, Music, TV, Nap, Travel - - Personality (pers): Extravert, Critical, Dependable, Anxious, Complex - - Values (val): Obey, Tradition, Opinion, Performance, Justice - - -================================================================================ -SCRIPT 02: dataP 02 - recode present VARS.r -================================================================================ - -PURPOSE: - Recodes present-time Likert scale text responses to numeric values. - -VARIABLES CREATED: 15 total - -SOURCE COLUMNS: - - prePrefItem_1 through prePrefItem_5 (5 columns) - - prePersItem_1 through prePersItem_5 (5 columns) - - preValItem_1 through preValItem_5 (5 columns) - -TARGET VARIABLES: - Present Time (15 variables): - - present_pref_read, present_pref_music, present_pref_tv, present_pref_nap, - present_pref_travel - - present_pers_extravert, present_pers_critical, present_pers_dependable, - present_pers_anxious, present_pers_complex - - present_val_obey, present_val_trad, present_val_opinion, - present_val_performance, present_val_justice - -TRANSFORMATION LOGIC: - Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -SPECIAL NOTE: - Present time uses "present_pref_tv" (lowercase) while past/future use - "past_5_pref_TV" (uppercase). This is intentional and preserved from the - original data structure. - - -================================================================================ -SCRIPT 03: dataP 03 - recode DGEN vars.r -================================================================================ - -PURPOSE: - Combines DGEN (domain general) responses from two survey versions (01 and 02). - These are single-item measures for each domain/time combination. - NO RECODING - just copies numeric values as-is. - -VARIABLES CREATED: 12 total (3 domains × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefDGEN_1, 01past5PersDGEN_1, 01past5ValDGEN_1, etc. - - Set B: 02past5PrefDGEN_1, 02past5PersDGEN_1, 02past5ValDGEN_1, etc. - -TARGET VARIABLES: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TRANSFORMATION LOGIC: - - If Set A (01) has a value, use Set A - - If Set A is empty, use Set B (02) - - NO RECODING: Values are copied directly as numeric - -SPECIAL NOTES: - - Future columns in raw data use "_8" suffix for Pref/Pers items - - Future Val columns use "ValuesDGEN" spelling in Set A, "ValDGEN" in Set B - - -================================================================================ -SCRIPT 04: dataP 04 - DGEN means.r -================================================================================ - -PURPOSE: - Calculates mean DGEN scores by averaging the three domain scores (Preferences, - Personality, Values) for each time period. 
- -VARIABLES CREATED: 4 total (1 per time period) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - - DGEN_past_5_mean - - DGEN_past_10_mean - - DGEN_fut_5_mean - - DGEN_fut_10_mean - -TRANSFORMATION LOGIC: - Each mean = (Pref + Pers + Val) / 3 - - NA values are excluded from calculation (na.rm = TRUE) - - -================================================================================ -SCRIPT 05: dataP 05 - recode scales VARS.r -================================================================================ - -PURPOSE: - Processes two cognitive scales: - 1. AOT (Actively Open-minded Thinking): 8-item scale with reverse coding - 2. CRT (Cognitive Reflection Test): 3-item test with correct/intuitive scoring - -VARIABLES CREATED: 3 total - -SOURCE COLUMNS: - AOT Scale: - - aot_1, aot_2, aot_3, aot_4, aot_5, aot_6, aot_7, aot_8 - - CRT Test: - - crt_1, crt_2, crt_3 - -TARGET VARIABLES: - - aot_total (mean of 8 items with reverse coding) - - crt_correct (proportion of correct answers) - - crt_int (proportion of intuitive/incorrect answers) - -TRANSFORMATION LOGIC: - - AOT Scale (aot_total): - 1. Items 4, 5, 6, 7 are reverse coded by multiplying by -1 - 2. Calculate mean of all 8 items (with reverse coding applied) - 3. Original source values are NOT modified in the dataframe - 4. NA values excluded from calculation (na.rm = TRUE) - - CRT Correct (crt_correct): - Correct answers: - - crt_1: "5 cents" - - crt_2: "5 minutes" - - crt_3: "47 days" - Calculation: (Number of correct answers) / (Number of non-missing answers) - - CRT Intuitive (crt_int): - Intuitive (common incorrect) answers: - - crt_1: "10 cents" - - crt_2: "100 minutes" - - crt_3: "24 days" - Calculation: (Number of intuitive answers) / (Number of non-missing answers) - -SPECIAL NOTES: - - CRT scoring is case-insensitive and trims whitespace - - Both CRT scores are proportions (0.00 to 1.00) - - Empty/missing CRT responses are excluded from denominator - - -================================================================================ -SCRIPT 06: dataP 06 - time interval differences.r -================================================================================ - -PURPOSE: - Calculates absolute differences between time intervals to measure perceived - change across time periods for all 15 items. 
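-
-  EXAMPLE (illustrative R sketch, not the verbatim dataP script; "dat" is an
-  assumed data frame name):
-    # Present vs. past-5 absolute difference for a single item;
-    # NA in either source column yields NA
-    dat$NPast_5_pref_read <- abs(dat$present_pref_read - dat$past_5_pref_read)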
- -VARIABLES CREATED: 90 total (6 difference types × 15 items) - -SOURCE COLUMNS: - - present_pref_read through present_val_justice (15 columns) - - past_5_pref_read through past_5_val_justice (15 columns) - - past_10_pref_read through past_10_val_justice (15 columns) - - fut_5_pref_read through fut_5_val_justice (15 columns) - - fut_10_pref_read through fut_10_val_justice (15 columns) - -TARGET VARIABLES (by difference type): - - NPast_5 (Present vs Past 5 years) - 15 variables: - Formula: |present - past_5| - - NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, NPast_5_pref_nap, - NPast_5_pref_travel - - NPast_5_pers_extravert, NPast_5_pers_critical, NPast_5_pers_dependable, - NPast_5_pers_anxious, NPast_5_pers_complex - - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice - - NPast_10 (Present vs Past 10 years) - 15 variables: - Formula: |present - past_10| - - NPast_10_pref_read, NPast_10_pref_music, NPast_10_pref_TV, - NPast_10_pref_nap, NPast_10_pref_travel - - NPast_10_pers_extravert, NPast_10_pers_critical, NPast_10_pers_dependable, - NPast_10_pers_anxious, NPast_10_pers_complex - - NPast_10_val_obey, NPast_10_val_trad, NPast_10_val_opinion, - NPast_10_val_performance, NPast_10_val_justice - - NFut_5 (Present vs Future 5 years) - 15 variables: - Formula: |present - fut_5| - - NFut_5_pref_read, NFut_5_pref_music, NFut_5_pref_TV, NFut_5_pref_nap, - NFut_5_pref_travel - - NFut_5_pers_extravert, NFut_5_pers_critical, NFut_5_pers_dependable, - NFut_5_pers_anxious, NFut_5_pers_complex - - NFut_5_val_obey, NFut_5_val_trad, NFut_5_val_opinion, - NFut_5_val_performance, NFut_5_val_justice - - NFut_10 (Present vs Future 10 years) - 15 variables: - Formula: |present - fut_10| - - NFut_10_pref_read, NFut_10_pref_music, NFut_10_pref_TV, NFut_10_pref_nap, - NFut_10_pref_travel - - NFut_10_pers_extravert, NFut_10_pers_critical, NFut_10_pers_dependable, - NFut_10_pers_anxious, NFut_10_pers_complex - - NFut_10_val_obey, NFut_10_val_trad, NFut_10_val_opinion, - NFut_10_val_performance, NFut_10_val_justice - - 5.10past (Past 5 vs Past 10 years) - 15 variables: - Formula: |past_5 - past_10| - - 5.10past_pref_read, 5.10past_pref_music, 5.10past_pref_TV, - 5.10past_pref_nap, 5.10past_pref_travel - - 5.10past_pers_extravert, 5.10past_pers_critical, 5.10past_pers_dependable, - 5.10past_pers_anxious, 5.10past_pers_complex - - 5.10past_val_obey, 5.10past_val_trad, 5.10past_val_opinion, - 5.10past_val_performance, 5.10past_val_justice - - 5.10fut (Future 5 vs Future 10 years) - 15 variables: - Formula: |fut_5 - fut_10| - - 5.10fut_pref_read, 5.10fut_pref_music, 5.10fut_pref_TV, 5.10fut_pref_nap, - 5.10fut_pref_travel - - 5.10fut_pers_extravert, 5.10fut_pers_critical, 5.10fut_pers_dependable, - 5.10fut_pers_anxious, 5.10fut_pers_complex - - 5.10fut_val_obey, 5.10fut_val_trad, 5.10fut_val_opinion, - 5.10fut_val_performance, 5.10fut_val_justice - -TRANSFORMATION LOGIC: - All calculations use absolute differences: - - NPast_5: |present_[item] - past_5_[item]| - - NPast_10: |present_[item] - past_10_[item]| - - NFut_5: |present_[item] - fut_5_[item]| - - NFut_10: |present_[item] - fut_10_[item]| - - 5.10past: |past_5_[item] - past_10_[item]| - - 5.10fut: |fut_5_[item] - fut_10_[item]| - - Result: Always positive values representing magnitude of change - Missing values in either source column result in NA - -SPECIAL NOTES: - - Present time uses "pref_tv" (lowercase) while past/future use "pref_TV" - (uppercase), so script handles this naming inconsistency - 
- All values are absolute differences (non-negative) - - -================================================================================ -SCRIPT 07: dataP 07 - domain means.r -================================================================================ - -PURPOSE: - Calculates domain-level means by averaging the 5 items within each domain - (Preferences, Personality, Values) for each of the 6 time interval difference - types. - -VARIABLES CREATED: 18 total (6 time intervals × 3 domains) - -SOURCE COLUMNS: - - NPast_5_pref_read through NPast_5_val_justice (15 columns) - - NPast_10_pref_read through NPast_10_val_justice (15 columns) - - NFut_5_pref_read through NFut_5_val_justice (15 columns) - - NFut_10_pref_read through NFut_10_val_justice (15 columns) - - 5.10past_pref_read through 5.10past_val_justice (15 columns) - - 5.10fut_pref_read through 5.10fut_val_justice (15 columns) - Total: 90 difference columns (created in Script 06) - -TARGET VARIABLES: - NPast_5 Domain Means (3 variables): - - NPast_5_pref_MEAN (mean of 5 preference items) - - NPast_5_pers_MEAN (mean of 5 personality items) - - NPast_5_val_MEAN (mean of 5 values items) - - NPast_10 Domain Means (3 variables): - - NPast_10_pref_MEAN - - NPast_10_pers_MEAN - - NPast_10_val_MEAN - - NFut_5 Domain Means (3 variables): - - NFut_5_pref_MEAN - - NFut_5_pers_MEAN - - NFut_5_val_MEAN - - NFut_10 Domain Means (3 variables): - - NFut_10_pref_MEAN - - NFut_10_pers_MEAN - - NFut_10_val_MEAN - - 5.10past Domain Means (3 variables): - - 5.10past_pref_MEAN - - 5.10past_pers_MEAN - - 5.10past_val_MEAN - - 5.10fut Domain Means (3 variables): - - 5.10fut_pref_MEAN - - 5.10fut_pers_MEAN - - 5.10fut_val_MEAN - -TRANSFORMATION LOGIC: - Each domain mean = average of 5 items within that domain - - Example for NPast_5_pref_MEAN: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel) - - Example for NFut_10_pers_MEAN: - = mean(NFut_10_pers_extravert, NFut_10_pers_critical, - NFut_10_pers_dependable, NFut_10_pers_anxious, - NFut_10_pers_complex) - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF DOMAIN MEANS: - - Provides higher-level summary of perceived change by domain - - Reduces item-level noise by aggregating across related items - - Enables domain-level comparisons across time intervals - - Parallel to Script 04 (DGEN means) but for difference scores instead of - raw DGEN ratings - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - Creates domain-level aggregates of absolute difference scores - - All means are averages of non-negative values (absolute differences) - - -================================================================================ -SCRIPT 08: dataP 08 - DGEN 510 vars.r -================================================================================ - -PURPOSE: - Calculates absolute differences between 5-year and 10-year DGEN ratings for - both Past and Future time directions. These variables measure the perceived - difference in domain-general change between the two time intervals. 
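-
-  EXAMPLE (illustrative R sketch; the loop form is an assumption, not the
-  verbatim script):
-    for (dir in c("past", "fut")) {
-      for (dom in c("Pref", "Pers", "Val")) {
-        # Target names use the X prefix and lowercase domain (X5_10DGEN_past_pref)
-        target <- paste0("X5_10DGEN_", dir, "_", tolower(dom))
-        dat[[target]] <- abs(dat[[paste0("DGEN_", dir, "_5_", dom)]] -
-                             dat[[paste0("DGEN_", dir, "_10_", dom)]])
-      }
-    }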
- -VARIABLES CREATED: 6 total (3 domains × 2 time directions) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - Total: 12 DGEN columns (created in Script 03) - -TARGET VARIABLES: - Past Direction (3 variables): - - X5_10DGEN_past_pref (|DGEN_past_5_Pref - DGEN_past_10_Pref|) - - X5_10DGEN_past_pers (|DGEN_past_5_Pers - DGEN_past_10_Pers|) - - X5_10DGEN_past_val (|DGEN_past_5_Val - DGEN_past_10_Val|) - - Future Direction (3 variables): - - X5_10DGEN_fut_pref (|DGEN_fut_5_Pref - DGEN_fut_10_Pref|) - - X5_10DGEN_fut_pers (|DGEN_fut_5_Pers - DGEN_fut_10_Pers|) - - X5_10DGEN_fut_val (|DGEN_fut_5_Val - DGEN_fut_10_Val|) - -TRANSFORMATION LOGIC: - Formula: |DGEN_5 - DGEN_10| - - All calculations use absolute differences: - - Past Preferences: |DGEN_past_5_Pref - DGEN_past_10_Pref| - - Past Personality: |DGEN_past_5_Pers - DGEN_past_10_Pers| - - Past Values: |DGEN_past_5_Val - DGEN_past_10_Val| - - Future Preferences: |DGEN_fut_5_Pref - DGEN_fut_10_Pref| - - Future Personality: |DGEN_fut_5_Pers - DGEN_fut_10_Pers| - - Future Values: |DGEN_fut_5_Val - DGEN_fut_10_Val| - - Result: Always positive values representing magnitude of difference - Missing values in either source column result in NA - -SPECIAL NOTES: - - Variable names use "X" prefix because R automatically adds it to column - names starting with numbers (5_10 becomes X5_10) - - This script depends on Script 03 being run first - - Measures interval effects within time direction (past vs future) - - Parallel to Script 06's 5.10past and 5.10fut variables but for DGEN scores - - -================================================================================ -SCRIPT 09: dataP 09 - interval x direction means.r -================================================================================ - -PURPOSE: - Calculates comprehensive mean scores by averaging item-level differences - across intervals and directions. Creates both narrow-scope means (single - time interval) and broad-scope global means (combining multiple intervals). 
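-
-  EXAMPLE (illustrative R sketch; the explicit item vector is an assumption,
-  used so the *_MEAN columns created by Script 07 are not swept into the mean):
-    items <- c("pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel",
-               "pers_extravert", "pers_critical", "pers_dependable",
-               "pers_anxious", "pers_complex",
-               "val_obey", "val_trad", "val_opinion", "val_performance",
-               "val_justice")
-    # Narrow-scope mean: all 15 difference items within one interval
-    dat$NPast_5_mean <- rowMeans(dat[, paste0("NPast_5_", items)],
-                                 na.rm = TRUE)
-    # Global-scope mean: 30 items spanning both past intervals
-    dat$NPast_global_mean <- rowMeans(
-      dat[, c(paste0("NPast_5_", items), paste0("NPast_10_", items))],
-      na.rm = TRUE)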
- -VARIABLES CREATED: 11 total (6 narrow-scope + 5 global-scope) - -SOURCE COLUMNS: - All 90 difference variables created in Script 06: - - NPast_5_[domain]_[item] (15 variables) - - NPast_10_[domain]_[item] (15 variables) - - NFut_5_[domain]_[item] (15 variables) - - NFut_10_[domain]_[item] (15 variables) - - X5.10past_[domain]_[item] (15 variables) - - X5.10fut_[domain]_[item] (15 variables) - -TARGET VARIABLES: - - Narrow-Scope Means (15 source items each): - - NPast_5_mean (mean across all 15 NPast_5 items) - - NPast_10_mean (mean across all 15 NPast_10 items) - - NFut_5_mean (mean across all 15 NFut_5 items) - - NFut_10_mean (mean across all 15 NFut_10 items) - - X5.10past_mean (mean across all 15 X5.10past items) - - X5.10fut_mean (mean across all 15 X5.10fut items) - - Global-Scope Means (30 source items each): - - NPast_global_mean (NPast_5 + NPast_10: all past intervals) - - NFut_global_mean (NFut_5 + NFut_10: all future intervals) - - X5.10_global_mean (X5.10past + X5.10fut: all 5-vs-10 intervals) - - N5_global_mean (NPast_5 + NFut_5: all 5-year intervals) - - N10_global_mean (NPast_10 + NFut_10: all 10-year intervals) - -TRANSFORMATION LOGIC: - - Narrow-Scope Means (15 items each): - Each mean averages all 15 difference items within one time interval - - Example for NPast_5_mean: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel, - NPast_5_pers_extravert, NPast_5_pers_critical, - NPast_5_pers_dependable, NPast_5_pers_anxious, - NPast_5_pers_complex, - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice) - - Global-Scope Means (30 items each): - Each mean averages 30 difference items across two related intervals - - Example for NPast_global_mean: - = mean(all 15 NPast_5 items + all 15 NPast_10 items) - Represents overall perceived change from present to any past timepoint - - Example for N5_global_mean: - = mean(all 15 NPast_5 items + all 15 NFut_5 items) - Represents overall perceived change at 5-year interval regardless of - direction - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF INTERVAL × DIRECTION MEANS: - - Narrow-scope means: Single-interval summaries across all domains and items - - Global-scope means: Cross-interval summaries for testing: - * Direction effects (past vs future) - * Interval effects (5-year vs 10-year) - * Combined temporal distance effects - - Enables comprehensive analysis of temporal self-perception patterns - - Reduces item-level and domain-level noise through broad aggregation - -QUALITY ASSURANCE: - - Script includes automated QA checks for first 5 rows - - Manually recalculates each mean and verifies against stored values - - Prints TRUE/FALSE match status for each variable - - Ensures calculation accuracy before further analysis - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - All means are averages of absolute difference scores (non-negative) - - Global means provide the broadest temporal self-perception summaries - - Naming convention uses "global" for 30-item means, no suffix for 15-item - - -================================================================================ -SCRIPT 10: dataP 10 - DGEN mean vars.r -================================================================================ - -PURPOSE: - Calculates mean DGEN scores by averaging across different time combinations. - Creates means for Past, Future, and interval-based (5-year, 10-year) groupings. 
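-
-  EXAMPLE (illustrative R sketch; assumes the Script 04 means already exist
-  in an assumed data frame "dat"):
-    # Direction-based: collapse the two past intervals
-    dat$DGEN_past_mean <- rowMeans(
-      dat[, c("DGEN_past_5_mean", "DGEN_past_10_mean")], na.rm = TRUE)
-    # Interval-based: collapse past and future at the 5-year interval
-    dat$DGEN_5_mean <- rowMeans(
-      dat[, c("DGEN_past_5_mean", "DGEN_fut_5_mean")], na.rm = TRUE)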
- -VARIABLES CREATED: 6 total - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - Direction-Based Means (2 variables): - - DGEN_past_mean (mean of past_5_mean and past_10_mean) - - DGEN_fut_mean (mean of fut_5_mean and fut_10_mean) - - Interval-Based Means (2 variables): - - DGEN_5_mean (mean of past_5_mean and fut_5_mean) - - DGEN_10_mean (mean of past_10_mean and fut_10_mean) - - Domain-Based Means (2 variables): - - DGEN_pref_mean (mean across all 4 time periods for Preferences) - - DGEN_pers_mean (mean across all 4 time periods for Personality) - -TRANSFORMATION LOGIC: - Direction-based: - - DGEN_past_mean = mean(DGEN_past_5_mean, DGEN_past_10_mean) - - DGEN_fut_mean = mean(DGEN_fut_5_mean, DGEN_fut_10_mean) - - Interval-based: - - DGEN_5_mean = mean(DGEN_past_5_mean, DGEN_fut_5_mean) - - DGEN_10_mean = mean(DGEN_past_10_mean, DGEN_fut_10_mean) - - Domain-based: - - DGEN_pref_mean = mean across all 4 Pref scores - - DGEN_pers_mean = mean across all 4 Pers scores - - NA values excluded from calculation (na.rm = TRUE) - - -================================================================================ -SCRIPT 11: dataP 11 - CORRECT ehi vars.r -================================================================================ - -PURPOSE: - Creates Enduring Hedonic Impact (EHI) variables by calculating differences - between Past and Future responses for each item across different time intervals. - Formula: NPast - NFut (positive values indicate greater past-present change) - -VARIABLES CREATED: 45 total (15 items × 3 time intervals) - -SOURCE COLUMNS: - 5-year intervals: - - NPast_5_pref_read through NPast_5_val_justice (15 columns) - - NFut_5_pref_read through NFut_5_val_justice (15 columns) - - 10-year intervals: - - NPast_10_pref_read through NPast_10_val_justice (15 columns) - - NFut_10_pref_read through NFut_10_val_justice (15 columns) - - 5-10 year change: - - X5.10past_pref_read through X5.10past_val_justice (15 columns) - - X5.10fut_pref_read through X5.10fut_val_justice (15 columns) - -TARGET VARIABLES: - 5-Year EHI Variables (15 variables): - - ehi5_pref_read, ehi5_pref_music, ehi5_pref_TV, ehi5_pref_nap, - ehi5_pref_travel - - ehi5_pers_extravert, ehi5_pers_critical, ehi5_pers_dependable, - ehi5_pers_anxious, ehi5_pers_complex - - ehi5_val_obey, ehi5_val_trad, ehi5_val_opinion, ehi5_val_performance, - ehi5_val_justice - - 10-Year EHI Variables (15 variables): - - ehi10_pref_read, ehi10_pref_music, ehi10_pref_TV, ehi10_pref_nap, - ehi10_pref_travel - - ehi10_pers_extravert, ehi10_pers_critical, ehi10_pers_dependable, - ehi10_pers_anxious, ehi10_pers_complex - - ehi10_val_obey, ehi10_val_trad, ehi10_val_opinion, ehi10_val_performance, - ehi10_val_justice - - 5-10 Year Change EHI Variables (15 variables): - - ehi5.10_pref_read, ehi5.10_pref_music, ehi5.10_pref_TV, ehi5.10_pref_nap, - ehi5.10_pref_travel - - ehi5.10_pers_extravert, ehi5.10_pers_critical, ehi5.10_pers_dependable, - ehi5.10_pers_anxious, ehi5.10_pers_complex - - ehi5.10_val_obey, ehi5.10_val_trad, ehi5.10_val_opinion, - ehi5.10_val_performance, ehi5.10_val_justice - -TRANSFORMATION LOGIC: - Formula: NPast - NFut - - All calculations use signed differences: - - ehi5_[item] = NPast_5_[item] - NFut_5_[item] - - ehi10_[item] = NPast_10_[item] - NFut_10_[item] - - ehi5.10_[item] = X5.10past_[item] - 
X5.10fut_[item] - - Result: Positive = greater past change, Negative = greater future change - Missing values in either source column result in NA - -QUALITY ASSURANCE: - - Comprehensive QA checks for all 45 variables across all rows - - First 5 rows displayed with detailed calculations showing source values, - computed differences, and stored values - - Pass/Fail status for each variable reported - - -================================================================================ -SCRIPT 12: dataP 12 - CORRECT DGEN ehi vars.r -================================================================================ - -PURPOSE: - Creates domain-general EHI variables by calculating differences between Past - and Future DGEN responses. These are the domain-general parallel to Script 11's - domain-specific EHI variables. - -VARIABLES CREATED: 6 total (3 domains × 2 time intervals) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - 5-Year DGEN EHI (3 variables): - - ehiDGEN_5_Pref - - ehiDGEN_5_Pers - - ehiDGEN_5_Val - - 10-Year DGEN EHI (3 variables): - - ehiDGEN_10_Pref - - ehiDGEN_10_Pers - - ehiDGEN_10_Val - -TRANSFORMATION LOGIC: - Formula: DGEN_past - DGEN_fut - - All calculations use signed differences: - - ehiDGEN_5_Pref = DGEN_past_5_Pref - DGEN_fut_5_Pref - - ehiDGEN_5_Pers = DGEN_past_5_Pers - DGEN_fut_5_Pers - - ehiDGEN_5_Val = DGEN_past_5_Val - DGEN_fut_5_Val - - ehiDGEN_10_Pref = DGEN_past_10_Pref - DGEN_fut_10_Pref - - ehiDGEN_10_Pers = DGEN_past_10_Pers - DGEN_fut_10_Pers - - ehiDGEN_10_Val = DGEN_past_10_Val - DGEN_fut_10_Val - - Result: Positive = greater past change, Negative = greater future change - -QUALITY ASSURANCE: - - QA checks for all 6 variables across all rows - - First 5 rows displayed with detailed calculations - - Pass/Fail status for each variable reported - - -================================================================================ -SCRIPT 13: datap 13 - ehi domain specific means.r -================================================================================ - -PURPOSE: - Calculates domain-level mean EHI scores by averaging the 5 items within each - domain (Preferences, Personality, Values) for each time interval. 
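-
-  EXAMPLE (illustrative R sketch; the ehi5_* inputs are the signed
-  NPast - NFut differences created in Script 11):
-    dat$ehi5_pref_MEAN <- rowMeans(
-      dat[, c("ehi5_pref_read", "ehi5_pref_music", "ehi5_pref_TV",
-              "ehi5_pref_nap", "ehi5_pref_travel")],
-      na.rm = TRUE)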
- -VARIABLES CREATED: 9 total (3 domains × 3 time intervals) - -SOURCE COLUMNS: - - ehi5_pref_read through ehi5_val_justice (15 columns) - - ehi10_pref_read through ehi10_val_justice (15 columns) - - ehi5.10_pref_read through ehi5.10_val_justice (15 columns) - -TARGET VARIABLES: - 5-Year Domain Means (3 variables): - - ehi5_pref_MEAN (mean of 5 preference items) - - ehi5_pers_MEAN (mean of 5 personality items) - - ehi5_val_MEAN (mean of 5 values items) - - 10-Year Domain Means (3 variables): - - ehi10_pref_MEAN - - ehi10_pers_MEAN - - ehi10_val_MEAN - - 5-10 Year Change Domain Means (3 variables): - - ehi5.10_pref_MEAN - - ehi5.10_pers_MEAN - - ehi5.10_val_MEAN - -TRANSFORMATION LOGIC: - Each domain mean = average of 5 items within that domain - - Example for ehi5_pref_MEAN: - = mean(ehi5_pref_read, ehi5_pref_music, ehi5_pref_TV, - ehi5_pref_nap, ehi5_pref_travel) - - NA values excluded from calculation (na.rm = TRUE) - -QUALITY ASSURANCE: - - Comprehensive QA for all 9 variables across all rows - - First 5 rows displayed for multiple domain means - - Pass/Fail status for each variable - - -================================================================================ -SCRIPT 14: datap 14 - all ehi global means.r -================================================================================ - -PURPOSE: - Calculates global EHI means by averaging domain-level means. Creates the - highest-level summary scores for EHI across both domain-general and - domain-specific measures. - -VARIABLES CREATED: 5 total - -SOURCE COLUMNS: - - ehiDGEN_5_Pref, ehiDGEN_5_Pers, ehiDGEN_5_Val - - ehiDGEN_10_Pref, ehiDGEN_10_Pers, ehiDGEN_10_Val - - ehi5_pref_MEAN, ehi5_pers_MEAN, ehi5_val_MEAN - - ehi10_pref_MEAN, ehi10_pers_MEAN, ehi10_val_MEAN - - ehi5.10_pref_MEAN, ehi5.10_pers_MEAN, ehi5.10_val_MEAN - -TARGET VARIABLES: - DGEN Global Means (2 variables): - - ehiDGEN_5_mean (mean of 3 DGEN domains for 5-year) - - ehiDGEN_10_mean (mean of 3 DGEN domains for 10-year) - - Domain-Specific Global Means (3 variables): - - ehi5_global_mean (mean of 3 domain means for 5-year) - - ehi10_global_mean (mean of 3 domain means for 10-year) - - ehi5.10_global_mean (mean of 3 domain means for 5-10 change) - -TRANSFORMATION LOGIC: - Each global mean = average of 3 domain-level scores - - Example for ehiDGEN_5_mean: - = mean(ehiDGEN_5_Pref, ehiDGEN_5_Pers, ehiDGEN_5_Val) - - Example for ehi5_global_mean: - = mean(ehi5_pref_MEAN, ehi5_pers_MEAN, ehi5_val_MEAN) - - NA values excluded from calculation (na.rm = TRUE) - -QUALITY ASSURANCE: - - QA for all 5 global means across all rows - - First 5 rows displayed with detailed calculations - - Values shown with 5 decimal precision - - Pass/Fail status for each variable - - -================================================================================ -SUMMARY OF ALL CREATED VARIABLES -================================================================================ - -Total Variables Created: 290 - -By Script: - - Script 01: 60 variables (past/future recoded items) - - Script 02: 15 variables (present recoded items) - - Script 03: 12 variables (DGEN domain scores) - - Script 04: 4 variables (DGEN time period means) - - Script 05: 3 variables (AOT & CRT scales) - - Script 06: 90 variables (time interval differences) - - Script 07: 18 variables (domain means for differences) - - Script 08: 6 variables (DGEN 5-vs-10 differences) - - Script 09: 11 variables (interval × direction means) - - Script 10: 6 variables (DGEN combined means) - - Script 11: 45 variables (domain-specific EHI
scores) - - Script 12: 6 variables (DGEN EHI scores) - - Script 13: 9 variables (EHI domain means) - - Script 14: 5 variables (EHI global means) - -By Category: - - Time Period Items (75 total): - * Present: 15 items - * Past 5: 15 items - * Past 10: 15 items - * Future 5: 15 items - * Future 10: 15 items - - - DGEN Variables (22 total): - * Domain scores: 12 (3 domains × 4 time periods) - * Mean scores: 4 (1 per time period) - * 5-vs-10 differences: 6 (3 domains × 2 directions) - - - Cognitive Scales (3 total): - * AOT total - * CRT correct - * CRT intuitive - - - Time Differences (90 total): - * NPast_5: 15 differences - * NPast_10: 15 differences - * NFut_5: 15 differences - * NFut_10: 15 differences - * 5.10past: 15 differences - * 5.10fut: 15 differences - - - Domain Means for Differences (18 total): - * NPast_5: 3 domain means - * NPast_10: 3 domain means - * NFut_5: 3 domain means - * NFut_10: 3 domain means - * 5.10past: 3 domain means - * 5.10fut: 3 domain means - - - Interval × Direction Means (11 total): - * Narrow-scope means: 6 (NPast_5, NPast_10, NFut_5, NFut_10, - X5.10past, X5.10fut) - * Global-scope means: 5 (NPast_global, NFut_global, X5.10_global, - N5_global, N10_global) - - -================================================================================ -DATA PROCESSING NOTES -================================================================================ - -1. PROCESSING ORDER: - Scripts MUST be run in numerical order (01 → 14) as later scripts depend - on variables created by earlier scripts. - - Key Dependencies: - - Script 03 required before Script 04 (DGEN means need DGEN scores) - - Script 03 required before Script 08 (DGEN 5-vs-10 need DGEN scores) - - Script 06 required before Script 07 (domain means need differences) - - Script 06 required before Script 09 (interval × direction means need - differences) - -2. SURVEY VERSION HANDLING: - - Two survey versions (01 and 02) were used - - Scripts 01 and 03 combine these versions - - Preference given to version 01 when both exist - -3. MISSING DATA: - - Empty cells and NA values are preserved throughout processing - - Calculations use na.rm=TRUE to exclude missing values from means - - Difference calculations result in NA if either source value is missing - -4. QUALITY ASSURANCE: - - Each script includes QA checks with random row verification - - Manual calculation checks confirm proper transformations - - Column existence checks prevent errors from missing source data - - Script 09 includes comprehensive QA with first 5 rows for all 11 variables - -5. FILE SAVING: - - Most scripts save directly to eohi2.csv - - Scripts 04, 06, and 07 have commented-out write commands for review - - Scripts 08 and 09 save directly to eohi2.csv - - Each script overwrites existing target columns if present - -6. SPECIAL NAMING CONVENTIONS: - - "pref_tv" vs "pref_TV" inconsistency maintained from source data - - DGEN variables use underscores (DGEN_past_5_Pref) - - Difference variables use descriptive prefixes (NPast_5_, 5.10past_) - - "X" prefix added to variables starting with numbers (X5.10past_mean) - - Global means use "_global_" to distinguish from narrow-scope means - - -================================================================================ -ITEM REFERENCE GUIDE -================================================================================ - -15 Core Items (Used across all time periods): - -PREFERENCES (5 items): - 1. pref_read - Reading preferences - 2. pref_music - Music preferences - 3.
pref_TV/tv - TV watching preferences (note case variation) - 4. pref_nap - Napping preferences - 5. pref_travel - Travel preferences - -PERSONALITY (5 items): - 6. pers_extravert - Extraverted personality - 7. pers_critical - Critical thinking personality - 8. pers_dependable - Dependable personality - 9. pers_anxious - Anxious personality - 10. pers_complex - Complex personality - -VALUES (5 items): - 11. val_obey - Value of obedience - 12. val_trad - Value of tradition - 13. val_opinion - Value of expressing opinions - 14. val_performance - Value of performance - 15. val_justice - Value of justice - - -================================================================================ -END OF DOCUMENTATION -================================================================================ -Last Updated: October 8, 2025 - diff --git a/.history/eohi2/README_Variable_Creation_20251008171510.txt b/.history/eohi2/README_Variable_Creation_20251008171510.txt deleted file mode 100644 index 954a538..0000000 --- a/.history/eohi2/README_Variable_Creation_20251008171510.txt +++ /dev/null @@ -1,933 +0,0 @@ -================================================================================ -EOHI2 DATA PROCESSING PIPELINE - VARIABLE CREATION DOCUMENTATION -================================================================================ - -This README documents the complete data processing pipeline for eohi2.csv. -All processing scripts should be run in the order listed below. - -Source File: eohi2.csv -Processing Scripts: dataP 01 through dataP 14 - -================================================================================ -SCRIPT 01: dataP 01 - recode and combine past & future vars.r -================================================================================ - -PURPOSE: - Combines responses from two survey versions (01 and 02) and recodes Likert - scale text responses to numeric values for past and future time periods.
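-
-  EXAMPLE (illustrative R sketch, not the verbatim dataP script; "dat" is an
-  assumed data frame name, and mapping item _1 to "read" follows the item
-  reference guide at the end of this document):
-    likert <- c("Strongly Disagree" = -3, "Disagree" = -2,
-                "Somewhat Disagree" = -1, "Neither Agree nor Disagree" = 0,
-                "Somewhat Agree" = 1, "Agree" = 2, "Strongly Agree" = 3)
-    # Step 1: prefer the version-01 response, fall back to version 02
-    a <- dat[["01past5PrefItem_1"]]
-    b <- dat[["02past5PrefItem_1"]]
-    combined <- ifelse(!is.na(a) & a != "", a, b)
-    # Step 2: map Likert text to numbers; empty or unmatched text becomes NA
-    dat$past_5_pref_read <- unname(likert[combined])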
- -VARIABLES CREATED: 60 total (15 items × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefItem_1 through 01fut10ValItem_5 (60 columns) - - Set B: 02past5PrefItem_1 through 02fut10ValItem_5 (60 columns) - -TARGET VARIABLES: - Past 5 Years (15 variables): - - past_5_pref_read, past_5_pref_music, past_5_pref_TV, past_5_pref_nap, - past_5_pref_travel - - past_5_pers_extravert, past_5_pers_critical, past_5_pers_dependable, - past_5_pers_anxious, past_5_pers_complex - - past_5_val_obey, past_5_val_trad, past_5_val_opinion, - past_5_val_performance, past_5_val_justice - - Past 10 Years (15 variables): - - past_10_pref_read, past_10_pref_music, past_10_pref_TV, past_10_pref_nap, - past_10_pref_travel - - past_10_pers_extravert, past_10_pers_critical, past_10_pers_dependable, - past_10_pers_anxious, past_10_pers_complex - - past_10_val_obey, past_10_val_trad, past_10_val_opinion, - past_10_val_performance, past_10_val_justice - - Future 5 Years (15 variables): - - fut_5_pref_read, fut_5_pref_music, fut_5_pref_TV, fut_5_pref_nap, - fut_5_pref_travel - - fut_5_pers_extravert, fut_5_pers_critical, fut_5_pers_dependable, - fut_5_pers_anxious, fut_5_pers_complex - - fut_5_val_obey, fut_5_val_trad, fut_5_val_opinion, - fut_5_val_performance, fut_5_val_justice - - Future 10 Years (15 variables): - - fut_10_pref_read, fut_10_pref_music, fut_10_pref_TV, fut_10_pref_nap, - fut_10_pref_travel - - fut_10_pers_extravert, fut_10_pers_critical, fut_10_pers_dependable, - fut_10_pers_anxious, fut_10_pers_complex - - fut_10_val_obey, fut_10_val_trad, fut_10_val_opinion, - fut_10_val_performance, fut_10_val_justice - -TRANSFORMATION LOGIC: - Step 1: Combine responses from Set A (01) and Set B (02) - - If Set A has a value, use Set A - - If Set A is empty, use Set B - - Step 2: Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -ITEM DOMAINS: - - Preferences (pref): Reading, Music, TV, Nap, Travel - - Personality (pers): Extravert, Critical, Dependable, Anxious, Complex - - Values (val): Obey, Tradition, Opinion, Performance, Justice - - -================================================================================ -SCRIPT 02: dataP 02 - recode present VARS.r -================================================================================ - -PURPOSE: - Recodes present-time Likert scale text responses to numeric values. - -VARIABLES CREATED: 15 total - -SOURCE COLUMNS: - - prePrefItem_1 through prePrefItem_5 (5 columns) - - prePersItem_1 through prePersItem_5 (5 columns) - - preValItem_1 through preValItem_5 (5 columns) - -TARGET VARIABLES: - Present Time (15 variables): - - present_pref_read, present_pref_music, present_pref_tv, present_pref_nap, - present_pref_travel - - present_pers_extravert, present_pers_critical, present_pers_dependable, - present_pers_anxious, present_pers_complex - - present_val_obey, present_val_trad, present_val_opinion, - present_val_performance, present_val_justice - -TRANSFORMATION LOGIC: - Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -SPECIAL NOTE: - Present time uses "present_pref_tv" (lowercase) while past/future use - "past_5_pref_TV" (uppercase). 
This is intentional and preserved from the - original data structure. - - -================================================================================ -SCRIPT 03: dataP 03 - recode DGEN vars.r -================================================================================ - -PURPOSE: - Combines DGEN (domain general) responses from two survey versions (01 and 02). - These are single-item measures for each domain/time combination. - NO RECODING - just copies numeric values as-is. - -VARIABLES CREATED: 12 total (3 domains × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefDGEN_1, 01past5PersDGEN_1, 01past5ValDGEN_1, etc. - - Set B: 02past5PrefDGEN_1, 02past5PersDGEN_1, 02past5ValDGEN_1, etc. - -TARGET VARIABLES: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TRANSFORMATION LOGIC: - - If Set A (01) has a value, use Set A - - If Set A is empty, use Set B (02) - - NO RECODING: Values are copied directly as numeric - -SPECIAL NOTES: - - Future columns in raw data use "_8" suffix for Pref/Pers items - - Future Val columns use "ValuesDGEN" spelling in Set A, "ValDGEN" in Set B - - -================================================================================ -SCRIPT 04: dataP 04 - DGEN means.r -================================================================================ - -PURPOSE: - Calculates mean DGEN scores by averaging the three domain scores (Preferences, - Personality, Values) for each time period. - -VARIABLES CREATED: 4 total (1 per time period) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - - DGEN_past_5_mean - - DGEN_past_10_mean - - DGEN_fut_5_mean - - DGEN_fut_10_mean - -TRANSFORMATION LOGIC: - Each mean = (Pref + Pers + Val) / 3 - - NA values are excluded from calculation (na.rm = TRUE) - - -================================================================================ -SCRIPT 05: dataP 05 - recode scales VARS.r -================================================================================ - -PURPOSE: - Processes two cognitive scales: - 1. AOT (Actively Open-minded Thinking): 8-item scale with reverse coding - 2. CRT (Cognitive Reflection Test): 3-item test with correct/intuitive scoring - -VARIABLES CREATED: 3 total - -SOURCE COLUMNS: - AOT Scale: - - aot_1, aot_2, aot_3, aot_4, aot_5, aot_6, aot_7, aot_8 - - CRT Test: - - crt_1, crt_2, crt_3 - -TARGET VARIABLES: - - aot_total (mean of 8 items with reverse coding) - - crt_correct (proportion of correct answers) - - crt_int (proportion of intuitive/incorrect answers) - -TRANSFORMATION LOGIC: - - AOT Scale (aot_total): - 1. Items 4, 5, 6, 7 are reverse coded by multiplying by -1 - 2. Calculate mean of all 8 items (with reverse coding applied) - 3. Original source values are NOT modified in the dataframe - 4. 
NA values excluded from calculation (na.rm = TRUE) - - CRT Correct (crt_correct): - Correct answers: - - crt_1: "5 cents" - - crt_2: "5 minutes" - - crt_3: "47 days" - Calculation: (Number of correct answers) / (Number of non-missing answers) - - CRT Intuitive (crt_int): - Intuitive (common incorrect) answers: - - crt_1: "10 cents" - - crt_2: "100 minutes" - - crt_3: "24 days" - Calculation: (Number of intuitive answers) / (Number of non-missing answers) - -SPECIAL NOTES: - - CRT scoring is case-insensitive and trims whitespace - - Both CRT scores are proportions (0.00 to 1.00) - - Empty/missing CRT responses are excluded from denominator - - -================================================================================ -SCRIPT 06: dataP 06 - time interval differences.r -================================================================================ - -PURPOSE: - Calculates absolute differences between time intervals to measure perceived - change across time periods for all 15 items. - -VARIABLES CREATED: 90 total (6 difference types × 15 items) - -SOURCE COLUMNS: - - present_pref_read through present_val_justice (15 columns) - - past_5_pref_read through past_5_val_justice (15 columns) - - past_10_pref_read through past_10_val_justice (15 columns) - - fut_5_pref_read through fut_5_val_justice (15 columns) - - fut_10_pref_read through fut_10_val_justice (15 columns) - -TARGET VARIABLES (by difference type): - - NPast_5 (Present vs Past 5 years) - 15 variables: - Formula: |present - past_5| - - NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, NPast_5_pref_nap, - NPast_5_pref_travel - - NPast_5_pers_extravert, NPast_5_pers_critical, NPast_5_pers_dependable, - NPast_5_pers_anxious, NPast_5_pers_complex - - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice - - NPast_10 (Present vs Past 10 years) - 15 variables: - Formula: |present - past_10| - - NPast_10_pref_read, NPast_10_pref_music, NPast_10_pref_TV, - NPast_10_pref_nap, NPast_10_pref_travel - - NPast_10_pers_extravert, NPast_10_pers_critical, NPast_10_pers_dependable, - NPast_10_pers_anxious, NPast_10_pers_complex - - NPast_10_val_obey, NPast_10_val_trad, NPast_10_val_opinion, - NPast_10_val_performance, NPast_10_val_justice - - NFut_5 (Present vs Future 5 years) - 15 variables: - Formula: |present - fut_5| - - NFut_5_pref_read, NFut_5_pref_music, NFut_5_pref_TV, NFut_5_pref_nap, - NFut_5_pref_travel - - NFut_5_pers_extravert, NFut_5_pers_critical, NFut_5_pers_dependable, - NFut_5_pers_anxious, NFut_5_pers_complex - - NFut_5_val_obey, NFut_5_val_trad, NFut_5_val_opinion, - NFut_5_val_performance, NFut_5_val_justice - - NFut_10 (Present vs Future 10 years) - 15 variables: - Formula: |present - fut_10| - - NFut_10_pref_read, NFut_10_pref_music, NFut_10_pref_TV, NFut_10_pref_nap, - NFut_10_pref_travel - - NFut_10_pers_extravert, NFut_10_pers_critical, NFut_10_pers_dependable, - NFut_10_pers_anxious, NFut_10_pers_complex - - NFut_10_val_obey, NFut_10_val_trad, NFut_10_val_opinion, - NFut_10_val_performance, NFut_10_val_justice - - 5.10past (Past 5 vs Past 10 years) - 15 variables: - Formula: |past_5 - past_10| - - 5.10past_pref_read, 5.10past_pref_music, 5.10past_pref_TV, - 5.10past_pref_nap, 5.10past_pref_travel - - 5.10past_pers_extravert, 5.10past_pers_critical, 5.10past_pers_dependable, - 5.10past_pers_anxious, 5.10past_pers_complex - - 5.10past_val_obey, 5.10past_val_trad, 5.10past_val_opinion, - 5.10past_val_performance, 5.10past_val_justice - - 5.10fut (Future 5 vs Future 
10 years) - 15 variables: - Formula: |fut_5 - fut_10| - - 5.10fut_pref_read, 5.10fut_pref_music, 5.10fut_pref_TV, 5.10fut_pref_nap, - 5.10fut_pref_travel - - 5.10fut_pers_extravert, 5.10fut_pers_critical, 5.10fut_pers_dependable, - 5.10fut_pers_anxious, 5.10fut_pers_complex - - 5.10fut_val_obey, 5.10fut_val_trad, 5.10fut_val_opinion, - 5.10fut_val_performance, 5.10fut_val_justice - -TRANSFORMATION LOGIC: - All calculations use absolute differences: - - NPast_5: |present_[item] - past_5_[item]| - - NPast_10: |present_[item] - past_10_[item]| - - NFut_5: |present_[item] - fut_5_[item]| - - NFut_10: |present_[item] - fut_10_[item]| - - 5.10past: |past_5_[item] - past_10_[item]| - - 5.10fut: |fut_5_[item] - fut_10_[item]| - - Result: Always positive values representing magnitude of change - Missing values in either source column result in NA - -SPECIAL NOTES: - - Present time uses "pref_tv" (lowercase) while past/future use "pref_TV" - (uppercase), so script handles this naming inconsistency - - All values are absolute differences (non-negative) - - -================================================================================ -SCRIPT 07: dataP 07 - domain means.r -================================================================================ - -PURPOSE: - Calculates domain-level means by averaging the 5 items within each domain - (Preferences, Personality, Values) for each of the 6 time interval difference - types. - -VARIABLES CREATED: 18 total (6 time intervals × 3 domains) - -SOURCE COLUMNS: - - NPast_5_pref_read through NPast_5_val_justice (15 columns) - - NPast_10_pref_read through NPast_10_val_justice (15 columns) - - NFut_5_pref_read through NFut_5_val_justice (15 columns) - - NFut_10_pref_read through NFut_10_val_justice (15 columns) - - 5.10past_pref_read through 5.10past_val_justice (15 columns) - - 5.10fut_pref_read through 5.10fut_val_justice (15 columns) - Total: 90 difference columns (created in Script 06) - -TARGET VARIABLES: - NPast_5 Domain Means (3 variables): - - NPast_5_pref_MEAN (mean of 5 preference items) - - NPast_5_pers_MEAN (mean of 5 personality items) - - NPast_5_val_MEAN (mean of 5 values items) - - NPast_10 Domain Means (3 variables): - - NPast_10_pref_MEAN - - NPast_10_pers_MEAN - - NPast_10_val_MEAN - - NFut_5 Domain Means (3 variables): - - NFut_5_pref_MEAN - - NFut_5_pers_MEAN - - NFut_5_val_MEAN - - NFut_10 Domain Means (3 variables): - - NFut_10_pref_MEAN - - NFut_10_pers_MEAN - - NFut_10_val_MEAN - - 5.10past Domain Means (3 variables): - - 5.10past_pref_MEAN - - 5.10past_pers_MEAN - - 5.10past_val_MEAN - - 5.10fut Domain Means (3 variables): - - 5.10fut_pref_MEAN - - 5.10fut_pers_MEAN - - 5.10fut_val_MEAN - -TRANSFORMATION LOGIC: - Each domain mean = average of 5 items within that domain - - Example for NPast_5_pref_MEAN: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel) - - Example for NFut_10_pers_MEAN: - = mean(NFut_10_pers_extravert, NFut_10_pers_critical, - NFut_10_pers_dependable, NFut_10_pers_anxious, - NFut_10_pers_complex) - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF DOMAIN MEANS: - - Provides higher-level summary of perceived change by domain - - Reduces item-level noise by aggregating across related items - - Enables domain-level comparisons across time intervals - - Parallel to Script 04 (DGEN means) but for difference scores instead of - raw DGEN ratings - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - Creates domain-level aggregates 
of absolute difference scores - - All means are averages of non-negative values (absolute differences) - - -================================================================================ -SCRIPT 08: dataP 08 - DGEN 510 vars.r -================================================================================ - -PURPOSE: - Calculates absolute differences between 5-year and 10-year DGEN ratings for - both Past and Future time directions. These variables measure the perceived - difference in domain-general change between the two time intervals. - -VARIABLES CREATED: 6 total (3 domains × 2 time directions) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - Total: 12 DGEN columns (created in Script 03) - -TARGET VARIABLES: - Past Direction (3 variables): - - X5_10DGEN_past_pref (|DGEN_past_5_Pref - DGEN_past_10_Pref|) - - X5_10DGEN_past_pers (|DGEN_past_5_Pers - DGEN_past_10_Pers|) - - X5_10DGEN_past_val (|DGEN_past_5_Val - DGEN_past_10_Val|) - - Future Direction (3 variables): - - X5_10DGEN_fut_pref (|DGEN_fut_5_Pref - DGEN_fut_10_Pref|) - - X5_10DGEN_fut_pers (|DGEN_fut_5_Pers - DGEN_fut_10_Pers|) - - X5_10DGEN_fut_val (|DGEN_fut_5_Val - DGEN_fut_10_Val|) - -TRANSFORMATION LOGIC: - Formula: |DGEN_5 - DGEN_10| - - All calculations use absolute differences: - - Past Preferences: |DGEN_past_5_Pref - DGEN_past_10_Pref| - - Past Personality: |DGEN_past_5_Pers - DGEN_past_10_Pers| - - Past Values: |DGEN_past_5_Val - DGEN_past_10_Val| - - Future Preferences: |DGEN_fut_5_Pref - DGEN_fut_10_Pref| - - Future Personality: |DGEN_fut_5_Pers - DGEN_fut_10_Pers| - - Future Values: |DGEN_fut_5_Val - DGEN_fut_10_Val| - - Result: Always positive values representing magnitude of difference - Missing values in either source column result in NA - -SPECIAL NOTES: - - Variable names use "X" prefix because R automatically adds it to column - names starting with numbers (5_10 becomes X5_10) - - This script depends on Script 03 being run first - - Measures interval effects within time direction (past vs future) - - Parallel to Script 06's 5.10past and 5.10fut variables but for DGEN scores - - -================================================================================ -SCRIPT 09: dataP 09 - interval x direction means.r -================================================================================ - -PURPOSE: - Calculates comprehensive mean scores by averaging item-level differences - across intervals and directions. Creates both narrow-scope means (single - time interval) and broad-scope global means (combining multiple intervals). 
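-
-  EXAMPLE (illustrative R sketch of the manual re-check described under
-  QUALITY ASSURANCE below; "dat" and the spelled-out column names are
-  assumptions):
-    cols <- paste0("NPast_5_",
-                   c("pref_read", "pref_music", "pref_TV", "pref_nap",
-                     "pref_travel", "pers_extravert", "pers_critical",
-                     "pers_dependable", "pers_anxious", "pers_complex",
-                     "val_obey", "val_trad", "val_opinion",
-                     "val_performance", "val_justice"))
-    recomputed <- rowMeans(dat[1:5, cols], na.rm = TRUE)
-    # TRUE if the stored means match the manual recalculation
-    print(isTRUE(all.equal(unname(recomputed), dat$NPast_5_mean[1:5])))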
- -VARIABLES CREATED: 11 total (6 narrow-scope + 5 global-scope) - -SOURCE COLUMNS: - All 90 difference variables created in Script 06: - - NPast_5_[domain]_[item] (15 variables) - - NPast_10_[domain]_[item] (15 variables) - - NFut_5_[domain]_[item] (15 variables) - - NFut_10_[domain]_[item] (15 variables) - - X5.10past_[domain]_[item] (15 variables) - - X5.10fut_[domain]_[item] (15 variables) - -TARGET VARIABLES: - - Narrow-Scope Means (15 source items each): - - NPast_5_mean (mean across all 15 NPast_5 items) - - NPast_10_mean (mean across all 15 NPast_10 items) - - NFut_5_mean (mean across all 15 NFut_5 items) - - NFut_10_mean (mean across all 15 NFut_10 items) - - X5.10past_mean (mean across all 15 X5.10past items) - - X5.10fut_mean (mean across all 15 X5.10fut items) - - Global-Scope Means (30 source items each): - - NPast_global_mean (NPast_5 + NPast_10: all past intervals) - - NFut_global_mean (NFut_5 + NFut_10: all future intervals) - - X5.10_global_mean (X5.10past + X5.10fut: all 5-vs-10 intervals) - - N5_global_mean (NPast_5 + NFut_5: all 5-year intervals) - - N10_global_mean (NPast_10 + NFut_10: all 10-year intervals) - -TRANSFORMATION LOGIC: - - Narrow-Scope Means (15 items each): - Each mean averages all 15 difference items within one time interval - - Example for NPast_5_mean: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel, - NPast_5_pers_extravert, NPast_5_pers_critical, - NPast_5_pers_dependable, NPast_5_pers_anxious, - NPast_5_pers_complex, - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice) - - Global-Scope Means (30 items each): - Each mean averages 30 difference items across two related intervals - - Example for NPast_global_mean: - = mean(all 15 NPast_5 items + all 15 NPast_10 items) - Represents overall perceived change from present to any past timepoint - - Example for N5_global_mean: - = mean(all 15 NPast_5 items + all 15 NFut_5 items) - Represents overall perceived change at 5-year interval regardless of - direction - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF INTERVAL × DIRECTION MEANS: - - Narrow-scope means: Single-interval summaries across all domains and items - - Global-scope means: Cross-interval summaries for testing: - * Direction effects (past vs future) - * Interval effects (5-year vs 10-year) - * Combined temporal distance effects - - Enables comprehensive analysis of temporal self-perception patterns - - Reduces item-level and domain-level noise through broad aggregation - -QUALITY ASSURANCE: - - Script includes automated QA checks for first 5 rows - - Manually recalculates each mean and verifies against stored values - - Prints TRUE/FALSE match status for each variable - - Ensures calculation accuracy before further analysis - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - All means are averages of absolute difference scores (non-negative) - - Global means provide the broadest temporal self-perception summaries - - Naming convention uses "global" for 30-item means, no suffix for 15-item - - -================================================================================ -SCRIPT 10: dataP 10 - DGEN mean vars.r -================================================================================ - -PURPOSE: - Calculates mean DGEN scores by averaging across different time combinations. - Creates means for Past, Future, and interval-based (5-year, 10-year) groupings. 
- -VARIABLES CREATED: 6 total - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - Direction-Based Means (2 variables): - - DGEN_past_mean (mean of past_5_mean and past_10_mean) - - DGEN_fut_mean (mean of fut_5_mean and fut_10_mean) - - Interval-Based Means (2 variables): - - DGEN_5_mean (mean of past_5_mean and fut_5_mean) - - DGEN_10_mean (mean of past_10_mean and fut_10_mean) - - Domain-Based Means (2 variables): - - DGEN_pref_mean (mean across all 4 time periods for Preferences) - - DGEN_pers_mean (mean across all 4 time periods for Personality) - -TRANSFORMATION LOGIC: - Direction-based: - - DGEN_past_mean = mean(DGEN_past_5_mean, DGEN_past_10_mean) - - DGEN_fut_mean = mean(DGEN_fut_5_mean, DGEN_fut_10_mean) - - Interval-based: - - DGEN_5_mean = mean(DGEN_past_5_mean, DGEN_fut_5_mean) - - DGEN_10_mean = mean(DGEN_past_10_mean, DGEN_fut_10_mean) - - Domain-based: - - DGEN_pref_mean = mean across all 4 Pref scores - - DGEN_pers_mean = mean across all 4 Pers scores - - NA values excluded from calculation (na.rm = TRUE) - - -================================================================================ -SCRIPT 11: dataP 11 - CORRECT ehi vars.r -================================================================================ - -PURPOSE: - Creates Enduring Hedonic Impact (EHI) variables by calculating differences - between Past and Future responses for each item across different time intervals. - Formula: NPast - NFut (positive values indicate greater past-present change) - -VARIABLES CREATED: 45 total (15 items × 3 time intervals) - -SOURCE COLUMNS: - 5-year intervals: - - NPast_5_pref_read through NPast_5_val_justice (15 columns) - - NFut_5_pref_read through NFut_5_val_justice (15 columns) - - 10-year intervals: - - NPast_10_pref_read through NPast_10_val_justice (15 columns) - - NFut_10_pref_read through NFut_10_val_justice (15 columns) - - 5-10 year change: - - X5.10past_pref_read through X5.10past_val_justice (15 columns) - - X5.10fut_pref_read through X5.10fut_val_justice (15 columns) - -TARGET VARIABLES: - 5-Year EHI Variables (15 variables): - - ehi5_pref_read, ehi5_pref_music, ehi5_pref_TV, ehi5_pref_nap, - ehi5_pref_travel - - ehi5_pers_extravert, ehi5_pers_critical, ehi5_pers_dependable, - ehi5_pers_anxious, ehi5_pers_complex - - ehi5_val_obey, ehi5_val_trad, ehi5_val_opinion, ehi5_val_performance, - ehi5_val_justice - - 10-Year EHI Variables (15 variables): - - ehi10_pref_read, ehi10_pref_music, ehi10_pref_TV, ehi10_pref_nap, - ehi10_pref_travel - - ehi10_pers_extravert, ehi10_pers_critical, ehi10_pers_dependable, - ehi10_pers_anxious, ehi10_pers_complex - - ehi10_val_obey, ehi10_val_trad, ehi10_val_opinion, ehi10_val_performance, - ehi10_val_justice - - 5-10 Year Change EHI Variables (15 variables): - - ehi5.10_pref_read, ehi5.10_pref_music, ehi5.10_pref_TV, ehi5.10_pref_nap, - ehi5.10_pref_travel - - ehi5.10_pers_extravert, ehi5.10_pers_critical, ehi5.10_pers_dependable, - ehi5.10_pers_anxious, ehi5.10_pers_complex - - ehi5.10_val_obey, ehi5.10_val_trad, ehi5.10_val_opinion, - ehi5.10_val_performance, ehi5.10_val_justice - -TRANSFORMATION LOGIC: - Formula: NPast - NFut - - All calculations use signed differences: - - ehi5_[item] = NPast_5_[item] - NFut_5_[item] - - ehi10_[item] = NPast_10_[item] - NFut_10_[item] - - ehi5.10_[item] = X5.10past_[item] - 
X5.10fut_[item] - - Result: Positive = greater past change, Negative = greater future change - Missing values in either source column result in NA - -QUALITY ASSURANCE: - - Comprehensive QA checks for all 45 variables across all rows - - First 5 rows displayed with detailed calculations showing source values, - computed differences, and stored values - - Pass/Fail status for each variable reported - - -================================================================================ -SCRIPT 12: dataP 12 - CORRECT DGEN ehi vars.r -================================================================================ - -PURPOSE: - Creates domain-general EHI variables by calculating differences between Past - and Future DGEN responses. These are the domain-general parallel to Script 11's - domain-specific EHI variables. - -VARIABLES CREATED: 6 total (3 domains × 2 time intervals) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - 5-Year DGEN EHI (3 variables): - - ehiDGEN_5_Pref - - ehiDGEN_5_Pers - - ehiDGEN_5_Val - - 10-Year DGEN EHI (3 variables): - - ehiDGEN_10_Pref - - ehiDGEN_10_Pers - - ehiDGEN_10_Val - -TRANSFORMATION LOGIC: - Formula: DGEN_past - DGEN_fut - - All calculations use signed differences: - - ehiDGEN_5_Pref = DGEN_past_5_Pref - DGEN_fut_5_Pref - - ehiDGEN_5_Pers = DGEN_past_5_Pers - DGEN_fut_5_Pers - - ehiDGEN_5_Val = DGEN_past_5_Val - DGEN_fut_5_Val - - ehiDGEN_10_Pref = DGEN_past_10_Pref - DGEN_fut_10_Pref - - ehiDGEN_10_Pers = DGEN_past_10_Pers - DGEN_fut_10_Pers - - ehiDGEN_10_Val = DGEN_past_10_Val - DGEN_fut_10_Val - - Result: Positive = greater past change, Negative = greater future change - -QUALITY ASSURANCE: - - QA checks for all 6 variables across all rows - - First 5 rows displayed with detailed calculations - - Pass/Fail status for each variable reported - - -================================================================================ -SCRIPT 13: datap 13 - ehi domain specific means.r -================================================================================ - -PURPOSE: - Calculates domain-level mean EHI scores by averaging the 5 items within each - domain (Preferences, Personality, Values) for each time interval. 
- -VARIABLES CREATED: 9 total (3 domains × 3 time intervals) - -SOURCE COLUMNS: - - ehi5_pref_read through ehi5_val_justice (15 columns) - - ehi10_pref_read through ehi10_val_justice (15 columns) - - ehi5.10_pref_read through ehi5.10_val_justice (15 columns) - -TARGET VARIABLES: - 5-Year Domain Means (3 variables): - - ehi5_pref_MEAN (mean of 5 preference items) - - ehi5_pers_MEAN (mean of 5 personality items) - - ehi5_val_MEAN (mean of 5 values items) - - 10-Year Domain Means (3 variables): - - ehi10_pref_MEAN - - ehi10_pers_MEAN - - ehi10_val_MEAN - - 5-10 Year Change Domain Means (3 variables): - - ehi5.10_pref_MEAN - - ehi5.10_pers_MEAN - - ehi5.10_val_MEAN - -TRANSFORMATION LOGIC: - Each domain mean = average of 5 items within that domain - - Example for ehi5_pref_MEAN: - = mean(ehi5_pref_read, ehi5_pref_music, ehi5_pref_TV, - ehi5_pref_nap, ehi5_pref_travel) - - NA values excluded from calculation (na.rm = TRUE) - -QUALITY ASSURANCE: - - Comprehensive QA for all 9 variables across all rows - - First 5 rows displayed for multiple domain means - - Pass/Fail status for each variable - - -================================================================================ -SCRIPT 14: datap 14 - all ehi global means.r -================================================================================ - -PURPOSE: - Calculates global EHI means by averaging domain-level means. Creates the - highest-level summary scores for EHI across both domain-general and - domain-specific measures. - -VARIABLES CREATED: 5 total - -SOURCE COLUMNS: - - ehiDGEN_5_Pref, ehiDGEN_5_Pers, ehiDGEN_5_Val - - ehiDGEN_10_Pref, ehiDGEN_10_Pers, ehiDGEN_10_Val - - ehi5_pref_MEAN, ehi5_pers_MEAN, ehi5_val_MEAN - - ehi10_pref_MEAN, ehi10_pers_MEAN, ehi10_val_MEAN - - ehi5.10_pref_MEAN, ehi5.10_pers_MEAN, ehi5.10_val_MEAN - -TARGET VARIABLES: - DGEN Global Means (2 variables): - - ehiDGEN_5_mean (mean of 3 DGEN domains for 5-year) - - ehiDGEN_10_mean (mean of 3 DGEN domains for 10-year) - - Domain-Specific Global Means (3 variables): - - ehi5_global_mean (mean of 3 domain means for 5-year) - - ehi10_global_mean (mean of 3 domain means for 10-year) - - ehi5.10_global_mean (mean of 3 domain means for 5-10 change) - -TRANSFORMATION LOGIC: - Each global mean = average of 3 domain-level scores - - Example for ehiDGEN_5_mean: - = mean(ehiDGEN_5_Pref, ehiDGEN_5_Pers, ehiDGEN_5_Val) - - Example for ehi5_global_mean: - = mean(ehi5_pref_MEAN, ehi5_pers_MEAN, ehi5_val_MEAN) - - NA values excluded from calculation (na.rm = TRUE) - -QUALITY ASSURANCE: - - QA for all 5 global means across all rows - - First 5 rows displayed with detailed calculations - - Values shown with 5 decimal precision - - Pass/Fail status for each variable - - -================================================================================ -SUMMARY OF ALL CREATED VARIABLES -================================================================================ - -Total Variables Created: 285 - -By Script: - - Script 01: 60 variables (past/future recoded items) - - Script 02: 15 variables (present recoded items) - - Script 03: 12 variables (DGEN domain scores) - - Script 04: 4 variables (DGEN time period means) - - Script 05: 3 variables (AOT & CRT scales) - - Script 06: 90 variables (time interval differences) - - Script 07: 18 variables (domain means for differences) - - Script 08: 6 variables (DGEN 5-vs-10 differences) - - Script 09: 11 variables (interval × direction means) - - Script 10: 6 variables (DGEN combined means) - - Script 11: 45 variables (domain-specific EHI 
scores) - - Script 12: 6 variables (DGEN EHI scores) - - Script 13: 9 variables (EHI domain means) - - Script 14: 5 variables (EHI global means) - -By Category: - - Time Period Items (75 total): - * Present: 15 items - * Past 5: 15 items - * Past 10: 15 items - * Future 5: 15 items - * Future 10: 15 items - - - DGEN Variables (28 total): - * Domain scores: 12 (3 domains × 4 time periods) - * Time period means: 4 (1 per time period) - * 5-vs-10 differences: 6 (3 domains × 2 directions) - * Combined means: 6 (2 direction-based, 2 interval-based, 2 domain-based) - - - Cognitive Scales (3 total): - * AOT total - * CRT correct - * CRT intuitive - - - Time Differences (90 total): - * NPast_5: 15 differences - * NPast_10: 15 differences - * NFut_5: 15 differences - * NFut_10: 15 differences - * 5.10past: 15 differences - * 5.10fut: 15 differences - - - Domain Means for Differences (18 total): - * NPast_5: 3 domain means - * NPast_10: 3 domain means - * NFut_5: 3 domain means - * NFut_10: 3 domain means - * 5.10past: 3 domain means - * 5.10fut: 3 domain means - - - Interval × Direction Means (11 total): - * Narrow-scope means: 6 (NPast_5, NPast_10, NFut_5, NFut_10, - X5.10past, X5.10fut) - * Global-scope means: 5 (NPast_global, NFut_global, X5.10_global, - N5_global, N10_global) - - - EHI Variables (65 total): - * Domain-specific EHI: 45 (15 items × 3 time intervals) - * DGEN EHI: 6 (3 domains × 2 time intervals) - * Domain means: 9 (3 domains × 3 time intervals) - * Global means: 5 (2 DGEN + 3 domain-specific) - - -================================================================================ -DATA PROCESSING NOTES -================================================================================ - -1. PROCESSING ORDER: - Scripts MUST be run in numerical order (01 → 14) as later scripts depend - on variables created by earlier scripts. - - Key Dependencies: - - Script 03 required before Scripts 04, 08, 10, 12 (DGEN scores) - - Script 04 required before Script 10 (DGEN time period means) - - Script 06 required before Scripts 07, 09, 11 (time interval differences) - - Script 11 required before Script 13 (domain-specific EHI items) - - Script 12 required before Script 14 (DGEN EHI scores) - - Script 13 required before Script 14 (EHI domain means) - -2. SURVEY VERSION HANDLING: - - Two survey versions (01 and 02) were used - - Scripts 01 and 03 combine these versions - - Preference given to version 01 when both exist - -3. MISSING DATA: - - Empty cells and NA values are preserved throughout processing - - Calculations use na.rm=TRUE to exclude missing values from means - - Difference calculations result in NA if either source value is missing - -4. QUALITY ASSURANCE: - - Each script includes QA checks with random row verification - - Manual calculation checks confirm proper transformations - - Column existence checks prevent errors from missing source data - - Script 09 includes comprehensive QA with first 5 rows for all 11 variables - -5. FILE SAVING: - - Most scripts save directly to eohi2.csv - - Scripts 04, 06, and 07 have commented-out write commands for review - - Scripts 08 and 09 save directly to eohi2.csv - - Each script overwrites existing target columns if present - -6.
SPECIAL NAMING CONVENTIONS: - - "pref_tv" vs "pref_TV" inconsistency maintained from source data - - DGEN variables use underscores (DGEN_past_5_Pref) - - Difference variables use descriptive prefixes (NPast_5_, 5.10past_) - - "X" prefix added to variables starting with numbers (X5.10past_mean) - - Global means use "_global_" to distinguish from narrow-scope means - - -================================================================================ -ITEM REFERENCE GUIDE -================================================================================ - -15 Core Items (Used across all time periods): - -PREFERENCES (5 items): - 1. pref_read - Reading preferences - 2. pref_music - Music preferences - 3. pref_TV/tv - TV watching preferences (note case variation) - 4. pref_nap - Napping preferences - 5. pref_travel - Travel preferences - -PERSONALITY (5 items): - 6. pers_extravert - Extraverted personality - 7. pers_critical - Critical thinking personality - 8. pers_dependable - Dependable personality - 9. pers_anxious - Anxious personality - 10. pers_complex - Complex personality - -VALUES (5 items): - 11. val_obey - Value of obedience - 12. val_trad - Value of tradition - 13. val_opinion - Value of expressing opinions - 14. val_performance - Value of performance - 15. val_justice - Value of justice - - -================================================================================ -END OF DOCUMENTATION -================================================================================ -Last Updated: October 8, 2025 - diff --git a/.history/eohi2/README_Variable_Creation_20251008171520.txt b/.history/eohi2/README_Variable_Creation_20251008171520.txt deleted file mode 100644 index 756cdf7..0000000 --- a/.history/eohi2/README_Variable_Creation_20251008171520.txt +++ /dev/null @@ -1,934 +0,0 @@ -================================================================================ -EOHI2 DATA PROCESSING PIPELINE - VARIABLE CREATION DOCUMENTATION -================================================================================ - -This README documents the complete data processing pipeline for eohi2.csv. -All processing scripts should be run in the order listed below. - -Source File: eohi2.csv -Processing Scripts: dataP 01 through dataP 14 - -================================================================================ -SCRIPT 01: dataP 01 - recode and combine past & future vars.r -================================================================================ - -PURPOSE: - Combines responses from two survey versions (01 and 02) and recodes Likert - scale text responses to numeric values for past and future time periods.
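
ILLUSTRATIVE SKETCH (not the script itself):
  A minimal R sketch of this combine-then-recode step for a single item pair,
  under two assumptions: the "X" prefix on the raw column names reflects
  read.csv()'s default name checking (columns starting with a digit get an
  "X" prepended), and the recode map follows the TRANSFORMATION LOGIC listed
  below. The real dataP 01 loops over all 60 item pairs.

    likert_map <- c("Strongly Disagree"          = -3,
                    "Disagree"                   = -2,
                    "Somewhat Disagree"          = -1,
                    "Neither Agree nor Disagree" =  0,
                    "Somewhat Agree"             =  1,
                    "Agree"                      =  2,
                    "Strongly Agree"             =  3)

    df <- read.csv("eohi2.csv", stringsAsFactors = FALSE)

    # Combine versions: prefer version 01 (Set A); fall back to version 02 (Set B)
    raw <- ifelse(!is.na(df$X01past5PrefItem_1) & df$X01past5PrefItem_1 != "",
                  df$X01past5PrefItem_1,
                  df$X02past5PrefItem_1)

    # Recode text to numeric; unmatched or empty responses become NA
    df$past_5_pref_read <- unname(likert_map[raw])
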
- -VARIABLES CREATED: 60 total (15 items × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefItem_1 through 01fut10ValItem_5 (60 columns) - - Set B: 02past5PrefItem_1 through 02fut10ValItem_5 (60 columns) - -TARGET VARIABLES: - Past 5 Years (15 variables): - - past_5_pref_read, past_5_pref_music, past_5_pref_TV, past_5_pref_nap, - past_5_pref_travel - - past_5_pers_extravert, past_5_pers_critical, past_5_pers_dependable, - past_5_pers_anxious, past_5_pers_complex - - past_5_val_obey, past_5_val_trad, past_5_val_opinion, - past_5_val_performance, past_5_val_justice - - Past 10 Years (15 variables): - - past_10_pref_read, past_10_pref_music, past_10_pref_TV, past_10_pref_nap, - past_10_pref_travel - - past_10_pers_extravert, past_10_pers_critical, past_10_pers_dependable, - past_10_pers_anxious, past_10_pers_complex - - past_10_val_obey, past_10_val_trad, past_10_val_opinion, - past_10_val_performance, past_10_val_justice - - Future 5 Years (15 variables): - - fut_5_pref_read, fut_5_pref_music, fut_5_pref_TV, fut_5_pref_nap, - fut_5_pref_travel - - fut_5_pers_extravert, fut_5_pers_critical, fut_5_pers_dependable, - fut_5_pers_anxious, fut_5_pers_complex - - fut_5_val_obey, fut_5_val_trad, fut_5_val_opinion, - fut_5_val_performance, fut_5_val_justice - - Future 10 Years (15 variables): - - fut_10_pref_read, fut_10_pref_music, fut_10_pref_TV, fut_10_pref_nap, - fut_10_pref_travel - - fut_10_pers_extravert, fut_10_pers_critical, fut_10_pers_dependable, - fut_10_pers_anxious, fut_10_pers_complex - - fut_10_val_obey, fut_10_val_trad, fut_10_val_opinion, - fut_10_val_performance, fut_10_val_justice - -TRANSFORMATION LOGIC: - Step 1: Combine responses from Set A (01) and Set B (02) - - If Set A has a value, use Set A - - If Set A is empty, use Set B - - Step 2: Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -ITEM DOMAINS: - - Preferences (pref): Reading, Music, TV, Nap, Travel - - Personality (pers): Extravert, Critical, Dependable, Anxious, Complex - - Values (val): Obey, Tradition, Opinion, Performance, Justice - - -================================================================================ -SCRIPT 02: dataP 02 - recode present VARS.r -================================================================================ - -PURPOSE: - Recodes present-time Likert scale text responses to numeric values. - -VARIABLES CREATED: 15 total - -SOURCE COLUMNS: - - prePrefItem_1 through prePrefItem_5 (5 columns) - - prePersItem_1 through prePersItem_5 (5 columns) - - preValItem_1 through preValItem_5 (5 columns) - -TARGET VARIABLES: - Present Time (15 variables): - - present_pref_read, present_pref_music, present_pref_tv, present_pref_nap, - present_pref_travel - - present_pers_extravert, present_pers_critical, present_pers_dependable, - present_pers_anxious, present_pers_complex - - present_val_obey, present_val_trad, present_val_opinion, - present_val_performance, present_val_justice - -TRANSFORMATION LOGIC: - Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -SPECIAL NOTE: - Present time uses "present_pref_tv" (lowercase) while past/future use - "past_5_pref_TV" (uppercase). 
This is intentional and preserved from the - original data structure. - - -================================================================================ -SCRIPT 03: dataP 03 - recode DGEN vars.r -================================================================================ - -PURPOSE: - Combines DGEN (domain general) responses from two survey versions (01 and 02). - These are single-item measures for each domain/time combination. - NO RECODING - just copies numeric values as-is. - -VARIABLES CREATED: 12 total (3 domains × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefDGEN_1, 01past5PersDGEN_1, 01past5ValDGEN_1, etc. - - Set B: 02past5PrefDGEN_1, 02past5PersDGEN_1, 02past5ValDGEN_1, etc. - -TARGET VARIABLES: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TRANSFORMATION LOGIC: - - If Set A (01) has a value, use Set A - - If Set A is empty, use Set B (02) - - NO RECODING: Values are copied directly as numeric - -SPECIAL NOTES: - - Future columns in raw data use "_8" suffix for Pref/Pers items - - Future Val columns use "ValuesDGEN" spelling in Set A, "ValDGEN" in Set B - - -================================================================================ -SCRIPT 04: dataP 04 - DGEN means.r -================================================================================ - -PURPOSE: - Calculates mean DGEN scores by averaging the three domain scores (Preferences, - Personality, Values) for each time period. - -VARIABLES CREATED: 4 total (1 per time period) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - - DGEN_past_5_mean - - DGEN_past_10_mean - - DGEN_fut_5_mean - - DGEN_fut_10_mean - -TRANSFORMATION LOGIC: - Each mean = (Pref + Pers + Val) / 3 - - NA values are excluded from calculation (na.rm = TRUE) - - -================================================================================ -SCRIPT 05: dataP 05 - recode scales VARS.r -================================================================================ - -PURPOSE: - Processes two cognitive scales: - 1. AOT (Actively Open-minded Thinking): 8-item scale with reverse coding - 2. CRT (Cognitive Reflection Test): 3-item test with correct/intuitive scoring - -VARIABLES CREATED: 3 total - -SOURCE COLUMNS: - AOT Scale: - - aot_1, aot_2, aot_3, aot_4, aot_5, aot_6, aot_7, aot_8 - - CRT Test: - - crt_1, crt_2, crt_3 - -TARGET VARIABLES: - - aot_total (mean of 8 items with reverse coding) - - crt_correct (proportion of correct answers) - - crt_int (proportion of intuitive/incorrect answers) - -TRANSFORMATION LOGIC: - - AOT Scale (aot_total): - 1. Items 4, 5, 6, 7 are reverse coded by multiplying by -1 - 2. Calculate mean of all 8 items (with reverse coding applied) - 3. Original source values are NOT modified in the dataframe - 4. 
NA values excluded from calculation (na.rm = TRUE) - - CRT Correct (crt_correct): - Correct answers: - - crt_1: "5 cents" - - crt_2: "5 minutes" - - crt_3: "47 days" - Calculation: (Number of correct answers) / (Number of non-missing answers) - - CRT Intuitive (crt_int): - Intuitive (common incorrect) answers: - - crt_1: "10 cents" - - crt_2: "100 minutes" - - crt_3: "24 days" - Calculation: (Number of intuitive answers) / (Number of non-missing answers) - -SPECIAL NOTES: - - CRT scoring is case-insensitive and trims whitespace - - Both CRT scores are proportions (0.00 to 1.00) - - Empty/missing CRT responses are excluded from denominator - - -================================================================================ -SCRIPT 06: dataP 06 - time interval differences.r -================================================================================ - -PURPOSE: - Calculates absolute differences between time intervals to measure perceived - change across time periods for all 15 items. - -VARIABLES CREATED: 90 total (6 difference types × 15 items) - -SOURCE COLUMNS: - - present_pref_read through present_val_justice (15 columns) - - past_5_pref_read through past_5_val_justice (15 columns) - - past_10_pref_read through past_10_val_justice (15 columns) - - fut_5_pref_read through fut_5_val_justice (15 columns) - - fut_10_pref_read through fut_10_val_justice (15 columns) - -TARGET VARIABLES (by difference type): - - NPast_5 (Present vs Past 5 years) - 15 variables: - Formula: |present - past_5| - - NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, NPast_5_pref_nap, - NPast_5_pref_travel - - NPast_5_pers_extravert, NPast_5_pers_critical, NPast_5_pers_dependable, - NPast_5_pers_anxious, NPast_5_pers_complex - - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice - - NPast_10 (Present vs Past 10 years) - 15 variables: - Formula: |present - past_10| - - NPast_10_pref_read, NPast_10_pref_music, NPast_10_pref_TV, - NPast_10_pref_nap, NPast_10_pref_travel - - NPast_10_pers_extravert, NPast_10_pers_critical, NPast_10_pers_dependable, - NPast_10_pers_anxious, NPast_10_pers_complex - - NPast_10_val_obey, NPast_10_val_trad, NPast_10_val_opinion, - NPast_10_val_performance, NPast_10_val_justice - - NFut_5 (Present vs Future 5 years) - 15 variables: - Formula: |present - fut_5| - - NFut_5_pref_read, NFut_5_pref_music, NFut_5_pref_TV, NFut_5_pref_nap, - NFut_5_pref_travel - - NFut_5_pers_extravert, NFut_5_pers_critical, NFut_5_pers_dependable, - NFut_5_pers_anxious, NFut_5_pers_complex - - NFut_5_val_obey, NFut_5_val_trad, NFut_5_val_opinion, - NFut_5_val_performance, NFut_5_val_justice - - NFut_10 (Present vs Future 10 years) - 15 variables: - Formula: |present - fut_10| - - NFut_10_pref_read, NFut_10_pref_music, NFut_10_pref_TV, NFut_10_pref_nap, - NFut_10_pref_travel - - NFut_10_pers_extravert, NFut_10_pers_critical, NFut_10_pers_dependable, - NFut_10_pers_anxious, NFut_10_pers_complex - - NFut_10_val_obey, NFut_10_val_trad, NFut_10_val_opinion, - NFut_10_val_performance, NFut_10_val_justice - - 5.10past (Past 5 vs Past 10 years) - 15 variables: - Formula: |past_5 - past_10| - - 5.10past_pref_read, 5.10past_pref_music, 5.10past_pref_TV, - 5.10past_pref_nap, 5.10past_pref_travel - - 5.10past_pers_extravert, 5.10past_pers_critical, 5.10past_pers_dependable, - 5.10past_pers_anxious, 5.10past_pers_complex - - 5.10past_val_obey, 5.10past_val_trad, 5.10past_val_opinion, - 5.10past_val_performance, 5.10past_val_justice - - 5.10fut (Future 5 vs Future 
10 years) - 15 variables: - Formula: |fut_5 - fut_10| - - 5.10fut_pref_read, 5.10fut_pref_music, 5.10fut_pref_TV, 5.10fut_pref_nap, - 5.10fut_pref_travel - - 5.10fut_pers_extravert, 5.10fut_pers_critical, 5.10fut_pers_dependable, - 5.10fut_pers_anxious, 5.10fut_pers_complex - - 5.10fut_val_obey, 5.10fut_val_trad, 5.10fut_val_opinion, - 5.10fut_val_performance, 5.10fut_val_justice - -TRANSFORMATION LOGIC: - All calculations use absolute differences: - - NPast_5: |present_[item] - past_5_[item]| - - NPast_10: |present_[item] - past_10_[item]| - - NFut_5: |present_[item] - fut_5_[item]| - - NFut_10: |present_[item] - fut_10_[item]| - - 5.10past: |past_5_[item] - past_10_[item]| - - 5.10fut: |fut_5_[item] - fut_10_[item]| - - Result: Always positive values representing magnitude of change - Missing values in either source column result in NA - -SPECIAL NOTES: - - Present time uses "pref_tv" (lowercase) while past/future use "pref_TV" - (uppercase), so script handles this naming inconsistency - - All values are absolute differences (non-negative) - - -================================================================================ -SCRIPT 07: dataP 07 - domain means.r -================================================================================ - -PURPOSE: - Calculates domain-level means by averaging the 5 items within each domain - (Preferences, Personality, Values) for each of the 6 time interval difference - types. - -VARIABLES CREATED: 18 total (6 time intervals × 3 domains) - -SOURCE COLUMNS: - - NPast_5_pref_read through NPast_5_val_justice (15 columns) - - NPast_10_pref_read through NPast_10_val_justice (15 columns) - - NFut_5_pref_read through NFut_5_val_justice (15 columns) - - NFut_10_pref_read through NFut_10_val_justice (15 columns) - - 5.10past_pref_read through 5.10past_val_justice (15 columns) - - 5.10fut_pref_read through 5.10fut_val_justice (15 columns) - Total: 90 difference columns (created in Script 06) - -TARGET VARIABLES: - NPast_5 Domain Means (3 variables): - - NPast_5_pref_MEAN (mean of 5 preference items) - - NPast_5_pers_MEAN (mean of 5 personality items) - - NPast_5_val_MEAN (mean of 5 values items) - - NPast_10 Domain Means (3 variables): - - NPast_10_pref_MEAN - - NPast_10_pers_MEAN - - NPast_10_val_MEAN - - NFut_5 Domain Means (3 variables): - - NFut_5_pref_MEAN - - NFut_5_pers_MEAN - - NFut_5_val_MEAN - - NFut_10 Domain Means (3 variables): - - NFut_10_pref_MEAN - - NFut_10_pers_MEAN - - NFut_10_val_MEAN - - 5.10past Domain Means (3 variables): - - 5.10past_pref_MEAN - - 5.10past_pers_MEAN - - 5.10past_val_MEAN - - 5.10fut Domain Means (3 variables): - - 5.10fut_pref_MEAN - - 5.10fut_pers_MEAN - - 5.10fut_val_MEAN - -TRANSFORMATION LOGIC: - Each domain mean = average of 5 items within that domain - - Example for NPast_5_pref_MEAN: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel) - - Example for NFut_10_pers_MEAN: - = mean(NFut_10_pers_extravert, NFut_10_pers_critical, - NFut_10_pers_dependable, NFut_10_pers_anxious, - NFut_10_pers_complex) - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF DOMAIN MEANS: - - Provides higher-level summary of perceived change by domain - - Reduces item-level noise by aggregating across related items - - Enables domain-level comparisons across time intervals - - Parallel to Script 04 (DGEN means) but for difference scores instead of - raw DGEN ratings - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - Creates domain-level aggregates 
of absolute difference scores - - All means are averages of non-negative values (absolute differences) - - -================================================================================ -SCRIPT 08: dataP 08 - DGEN 510 vars.r -================================================================================ - -PURPOSE: - Calculates absolute differences between 5-year and 10-year DGEN ratings for - both Past and Future time directions. These variables measure the perceived - difference in domain-general change between the two time intervals. - -VARIABLES CREATED: 6 total (3 domains × 2 time directions) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - Total: 12 DGEN columns (created in Script 03) - -TARGET VARIABLES: - Past Direction (3 variables): - - X5_10DGEN_past_pref (|DGEN_past_5_Pref - DGEN_past_10_Pref|) - - X5_10DGEN_past_pers (|DGEN_past_5_Pers - DGEN_past_10_Pers|) - - X5_10DGEN_past_val (|DGEN_past_5_Val - DGEN_past_10_Val|) - - Future Direction (3 variables): - - X5_10DGEN_fut_pref (|DGEN_fut_5_Pref - DGEN_fut_10_Pref|) - - X5_10DGEN_fut_pers (|DGEN_fut_5_Pers - DGEN_fut_10_Pers|) - - X5_10DGEN_fut_val (|DGEN_fut_5_Val - DGEN_fut_10_Val|) - -TRANSFORMATION LOGIC: - Formula: |DGEN_5 - DGEN_10| - - All calculations use absolute differences: - - Past Preferences: |DGEN_past_5_Pref - DGEN_past_10_Pref| - - Past Personality: |DGEN_past_5_Pers - DGEN_past_10_Pers| - - Past Values: |DGEN_past_5_Val - DGEN_past_10_Val| - - Future Preferences: |DGEN_fut_5_Pref - DGEN_fut_10_Pref| - - Future Personality: |DGEN_fut_5_Pers - DGEN_fut_10_Pers| - - Future Values: |DGEN_fut_5_Val - DGEN_fut_10_Val| - - Result: Always positive values representing magnitude of difference - Missing values in either source column result in NA - -SPECIAL NOTES: - - Variable names use "X" prefix because R automatically adds it to column - names starting with numbers (5_10 becomes X5_10) - - This script depends on Script 03 being run first - - Measures interval effects within time direction (past vs future) - - Parallel to Script 06's 5.10past and 5.10fut variables but for DGEN scores - - -================================================================================ -SCRIPT 09: dataP 09 - interval x direction means.r -================================================================================ - -PURPOSE: - Calculates comprehensive mean scores by averaging item-level differences - across intervals and directions. Creates both narrow-scope means (single - time interval) and broad-scope global means (combining multiple intervals). 
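
ILLUSTRATIVE SKETCH (not the script itself):
  A sketch of the two aggregation levels, assuming the Script 06 difference
  columns already exist in df; row_mean_of() is a hypothetical helper, not a
  function from the pipeline.

    # Hypothetical helper: row-wise mean over a set of columns
    # (a row with no valid values yields NaN)
    row_mean_of <- function(df, cols) rowMeans(df[, cols], na.rm = TRUE)

    items <- c("pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel",
               "pers_extravert", "pers_critical", "pers_dependable",
               "pers_anxious", "pers_complex",
               "val_obey", "val_trad", "val_opinion",
               "val_performance", "val_justice")

    # Narrow-scope mean: one time interval, all 15 items
    df$NPast_5_mean <- row_mean_of(df, paste0("NPast_5_", items))

    # Global-scope mean: two related intervals, 30 items
    df$NPast_global_mean <- row_mean_of(
      df, c(paste0("NPast_5_", items), paste0("NPast_10_", items)))
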
- -VARIABLES CREATED: 11 total (6 narrow-scope + 5 global-scope) - -SOURCE COLUMNS: - All 90 difference variables created in Script 06: - - NPast_5_[domain]_[item] (15 variables) - - NPast_10_[domain]_[item] (15 variables) - - NFut_5_[domain]_[item] (15 variables) - - NFut_10_[domain]_[item] (15 variables) - - X5.10past_[domain]_[item] (15 variables) - - X5.10fut_[domain]_[item] (15 variables) - -TARGET VARIABLES: - - Narrow-Scope Means (15 source items each): - - NPast_5_mean (mean across all 15 NPast_5 items) - - NPast_10_mean (mean across all 15 NPast_10 items) - - NFut_5_mean (mean across all 15 NFut_5 items) - - NFut_10_mean (mean across all 15 NFut_10 items) - - X5.10past_mean (mean across all 15 X5.10past items) - - X5.10fut_mean (mean across all 15 X5.10fut items) - - Global-Scope Means (30 source items each): - - NPast_global_mean (NPast_5 + NPast_10: all past intervals) - - NFut_global_mean (NFut_5 + NFut_10: all future intervals) - - X5.10_global_mean (X5.10past + X5.10fut: all 5-vs-10 intervals) - - N5_global_mean (NPast_5 + NFut_5: all 5-year intervals) - - N10_global_mean (NPast_10 + NFut_10: all 10-year intervals) - -TRANSFORMATION LOGIC: - - Narrow-Scope Means (15 items each): - Each mean averages all 15 difference items within one time interval - - Example for NPast_5_mean: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel, - NPast_5_pers_extravert, NPast_5_pers_critical, - NPast_5_pers_dependable, NPast_5_pers_anxious, - NPast_5_pers_complex, - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice) - - Global-Scope Means (30 items each): - Each mean averages 30 difference items across two related intervals - - Example for NPast_global_mean: - = mean(all 15 NPast_5 items + all 15 NPast_10 items) - Represents overall perceived change from present to any past timepoint - - Example for N5_global_mean: - = mean(all 15 NPast_5 items + all 15 NFut_5 items) - Represents overall perceived change at 5-year interval regardless of - direction - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF INTERVAL × DIRECTION MEANS: - - Narrow-scope means: Single-interval summaries across all domains and items - - Global-scope means: Cross-interval summaries for testing: - * Direction effects (past vs future) - * Interval effects (5-year vs 10-year) - * Combined temporal distance effects - - Enables comprehensive analysis of temporal self-perception patterns - - Reduces item-level and domain-level noise through broad aggregation - -QUALITY ASSURANCE: - - Script includes automated QA checks for first 5 rows - - Manually recalculates each mean and verifies against stored values - - Prints TRUE/FALSE match status for each variable - - Ensures calculation accuracy before further analysis - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - All means are averages of absolute difference scores (non-negative) - - Global means provide the broadest temporal self-perception summaries - - Naming convention uses "global" for 30-item means, no suffix for 15-item - - -================================================================================ -SCRIPT 10: dataP 10 - DGEN mean vars.r -================================================================================ - -PURPOSE: - Calculates mean DGEN scores by averaging across different time combinations. - Creates means for Past, Future, and interval-based (5-year, 10-year) groupings. 
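
ILLUSTRATIVE SKETCH (not the script itself):
  A compact sketch of the pairwise means, assuming the Script 04 time-period
  means are already in df. (Note that the target list below also includes two
  domain-based means built from the raw DGEN scores; Preferences is shown.)

    # Direction-based: average the two past time-period means
    df$DGEN_past_mean <- rowMeans(df[, c("DGEN_past_5_mean",
                                         "DGEN_past_10_mean")], na.rm = TRUE)

    # Interval-based: average the two 5-year time-period means
    df$DGEN_5_mean <- rowMeans(df[, c("DGEN_past_5_mean",
                                      "DGEN_fut_5_mean")], na.rm = TRUE)

    # Domain-based: a domain's raw DGEN score across all 4 time periods
    df$DGEN_pref_mean <- rowMeans(df[, c("DGEN_past_5_Pref", "DGEN_past_10_Pref",
                                         "DGEN_fut_5_Pref",  "DGEN_fut_10_Pref")],
                                  na.rm = TRUE)
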
- -VARIABLES CREATED: 6 total - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - Direction-Based Means (2 variables): - - DGEN_past_mean (mean of past_5_mean and past_10_mean) - - DGEN_fut_mean (mean of fut_5_mean and fut_10_mean) - - Interval-Based Means (2 variables): - - DGEN_5_mean (mean of past_5_mean and fut_5_mean) - - DGEN_10_mean (mean of past_10_mean and fut_10_mean) - - Domain-Based Means (2 variables): - - DGEN_pref_mean (mean across all 4 time periods for Preferences) - - DGEN_pers_mean (mean across all 4 time periods for Personality) - -TRANSFORMATION LOGIC: - Direction-based: - - DGEN_past_mean = mean(DGEN_past_5_mean, DGEN_past_10_mean) - - DGEN_fut_mean = mean(DGEN_fut_5_mean, DGEN_fut_10_mean) - - Interval-based: - - DGEN_5_mean = mean(DGEN_past_5_mean, DGEN_fut_5_mean) - - DGEN_10_mean = mean(DGEN_past_10_mean, DGEN_fut_10_mean) - - Domain-based: - - DGEN_pref_mean = mean across all 4 Pref scores - - DGEN_pers_mean = mean across all 4 Pers scores - - NA values excluded from calculation (na.rm = TRUE) - - -================================================================================ -SCRIPT 11: dataP 11 - CORRECT ehi vars.r -================================================================================ - -PURPOSE: - Creates Enduring Hedonic Impact (EHI) variables by calculating differences - between Past and Future responses for each item across different time intervals. - Formula: NPast - NFut (positive values indicate greater past-present change) - -VARIABLES CREATED: 45 total (15 items × 3 time intervals) - -SOURCE COLUMNS: - 5-year intervals: - - NPast_5_pref_read through NPast_5_val_justice (15 columns) - - NFut_5_pref_read through NFut_5_val_justice (15 columns) - - 10-year intervals: - - NPast_10_pref_read through NPast_10_val_justice (15 columns) - - NFut_10_pref_read through NFut_10_val_justice (15 columns) - - 5-10 year change: - - X5.10past_pref_read through X5.10past_val_justice (15 columns) - - X5.10fut_pref_read through X5.10fut_val_justice (15 columns) - -TARGET VARIABLES: - 5-Year EHI Variables (15 variables): - - ehi5_pref_read, ehi5_pref_music, ehi5_pref_TV, ehi5_pref_nap, - ehi5_pref_travel - - ehi5_pers_extravert, ehi5_pers_critical, ehi5_pers_dependable, - ehi5_pers_anxious, ehi5_pers_complex - - ehi5_val_obey, ehi5_val_trad, ehi5_val_opinion, ehi5_val_performance, - ehi5_val_justice - - 10-Year EHI Variables (15 variables): - - ehi10_pref_read, ehi10_pref_music, ehi10_pref_TV, ehi10_pref_nap, - ehi10_pref_travel - - ehi10_pers_extravert, ehi10_pers_critical, ehi10_pers_dependable, - ehi10_pers_anxious, ehi10_pers_complex - - ehi10_val_obey, ehi10_val_trad, ehi10_val_opinion, ehi10_val_performance, - ehi10_val_justice - - 5-10 Year Change EHI Variables (15 variables): - - ehi5.10_pref_read, ehi5.10_pref_music, ehi5.10_pref_TV, ehi5.10_pref_nap, - ehi5.10_pref_travel - - ehi5.10_pers_extravert, ehi5.10_pers_critical, ehi5.10_pers_dependable, - ehi5.10_pers_anxious, ehi5.10_pers_complex - - ehi5.10_val_obey, ehi5.10_val_trad, ehi5.10_val_opinion, - ehi5.10_val_performance, ehi5.10_val_justice - -TRANSFORMATION LOGIC: - Formula: NPast - NFut - - All calculations use signed differences: - - ehi5_[item] = NPast_5_[item] - NFut_5_[item] - - ehi10_[item] = NPast_10_[item] - NFut_10_[item] - - ehi5.10_[item] = X5.10past_[item] - 
X5.10fut_[item] - - Result: Positive = greater past change, Negative = greater future change - Missing values in either source column result in NA - -QUALITY ASSURANCE: - - Comprehensive QA checks for all 45 variables across all rows - - First 5 rows displayed with detailed calculations showing source values, - computed differences, and stored values - - Pass/Fail status for each variable reported - - -================================================================================ -SCRIPT 12: dataP 12 - CORRECT DGEN ehi vars.r -================================================================================ - -PURPOSE: - Creates domain-general EHI variables by calculating differences between Past - and Future DGEN responses. These are the domain-general parallel to Script 11's - domain-specific EHI variables. - -VARIABLES CREATED: 6 total (3 domains × 2 time intervals) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - 5-Year DGEN EHI (3 variables): - - ehiDGEN_5_Pref - - ehiDGEN_5_Pers - - ehiDGEN_5_Val - - 10-Year DGEN EHI (3 variables): - - ehiDGEN_10_Pref - - ehiDGEN_10_Pers - - ehiDGEN_10_Val - -TRANSFORMATION LOGIC: - Formula: DGEN_past - DGEN_fut - - All calculations use signed differences: - - ehiDGEN_5_Pref = DGEN_past_5_Pref - DGEN_fut_5_Pref - - ehiDGEN_5_Pers = DGEN_past_5_Pers - DGEN_fut_5_Pers - - ehiDGEN_5_Val = DGEN_past_5_Val - DGEN_fut_5_Val - - ehiDGEN_10_Pref = DGEN_past_10_Pref - DGEN_fut_10_Pref - - ehiDGEN_10_Pers = DGEN_past_10_Pers - DGEN_fut_10_Pers - - ehiDGEN_10_Val = DGEN_past_10_Val - DGEN_fut_10_Val - - Result: Positive = greater past change, Negative = greater future change - -QUALITY ASSURANCE: - - QA checks for all 6 variables across all rows - - First 5 rows displayed with detailed calculations - - Pass/Fail status for each variable reported - - -================================================================================ -SCRIPT 13: datap 13 - ehi domain specific means.r -================================================================================ - -PURPOSE: - Calculates domain-level mean EHI scores by averaging the 5 items within each - domain (Preferences, Personality, Values) for each time interval. 
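
ILLUSTRATIVE SKETCH (not the script itself):
  One of the nine domain means as a minimal sketch, assuming the Script 11
  EHI item columns already exist in df:

    pref_cols <- c("ehi5_pref_read", "ehi5_pref_music", "ehi5_pref_TV",
                   "ehi5_pref_nap",  "ehi5_pref_travel")
    df$ehi5_pref_MEAN <- rowMeans(df[, pref_cols], na.rm = TRUE)
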
- -VARIABLES CREATED: 9 total (3 domains × 3 time intervals) - -SOURCE COLUMNS: - - ehi5_pref_read through ehi5_val_justice (15 columns) - - ehi10_pref_read through ehi10_val_justice (15 columns) - - ehi5.10_pref_read through ehi5.10_val_justice (15 columns) - -TARGET VARIABLES: - 5-Year Domain Means (3 variables): - - ehi5_pref_MEAN (mean of 5 preference items) - - ehi5_pers_MEAN (mean of 5 personality items) - - ehi5_val_MEAN (mean of 5 values items) - - 10-Year Domain Means (3 variables): - - ehi10_pref_MEAN - - ehi10_pers_MEAN - - ehi10_val_MEAN - - 5-10 Year Change Domain Means (3 variables): - - ehi5.10_pref_MEAN - - ehi5.10_pers_MEAN - - ehi5.10_val_MEAN - -TRANSFORMATION LOGIC: - Each domain mean = average of 5 items within that domain - - Example for ehi5_pref_MEAN: - = mean(ehi5_pref_read, ehi5_pref_music, ehi5_pref_TV, - ehi5_pref_nap, ehi5_pref_travel) - - NA values excluded from calculation (na.rm = TRUE) - -QUALITY ASSURANCE: - - Comprehensive QA for all 9 variables across all rows - - First 5 rows displayed for each domain mean - - Pass/Fail status for each variable - - -================================================================================ -SCRIPT 14: datap 14 - all ehi global means.r -================================================================================ - -PURPOSE: - Calculates global EHI means by averaging domain-level means. Creates the - highest-level summary scores for EHI across both domain-general and - domain-specific measures. - -VARIABLES CREATED: 5 total - -SOURCE COLUMNS: - - ehiDGEN_5_Pref, ehiDGEN_5_Pers, ehiDGEN_5_Val - - ehiDGEN_10_Pref, ehiDGEN_10_Pers, ehiDGEN_10_Val - - ehi5_pref_MEAN, ehi5_pers_MEAN, ehi5_val_MEAN - - ehi10_pref_MEAN, ehi10_pers_MEAN, ehi10_val_MEAN - - ehi5.10_pref_MEAN, ehi5.10_pers_MEAN, ehi5.10_val_MEAN - -TARGET VARIABLES: - DGEN Global Means (2 variables): - - ehiDGEN_5_mean (mean of 3 DGEN domains for 5-year) - - ehiDGEN_10_mean (mean of 3 DGEN domains for 10-year) - - Domain-Specific Global Means (3 variables): - - ehi5_global_mean (mean of 3 domain means for 5-year) - - ehi10_global_mean (mean of 3 domain means for 10-year) - - ehi5.10_global_mean (mean of 3 domain means for 5-10 change) - -TRANSFORMATION LOGIC: - Each global mean = average of 3 domain-level scores - - Example for ehiDGEN_5_mean: - = mean(ehiDGEN_5_Pref, ehiDGEN_5_Pers, ehiDGEN_5_Val) - - Example for ehi5_global_mean: - = mean(ehi5_pref_MEAN, ehi5_pers_MEAN, ehi5_val_MEAN) - - NA values excluded from calculation (na.rm = TRUE) - -QUALITY ASSURANCE: - - QA for all 5 global means across all rows - - First 5 rows displayed with detailed calculations - - Values shown to 5-decimal precision - - Pass/Fail status for each variable - - -================================================================================ -SUMMARY OF ALL CREATED VARIABLES -================================================================================ - -Total Variables Created: 290 - -By Script: - - Script 01: 60 variables (past/future recoded items) - - Script 02: 15 variables (present recoded items) - - Script 03: 12 variables (DGEN domain scores) - - Script 04: 4 variables (DGEN time period means) - - Script 05: 3 variables (AOT & CRT scales) - - Script 06: 90 variables (time interval differences) - - Script 07: 18 variables (domain means for differences) - - Script 08: 6 variables (DGEN 5-vs-10 differences) - - Script 09: 11 variables (interval × direction means) - - Script 10: 6 variables (DGEN combined means) - - Script 11: 45 variables (domain-specific EHI
scores) - - Script 12: 6 variables (DGEN EHI scores) - - Script 13: 9 variables (EHI domain means) - - Script 14: 5 variables (EHI global means) - -By Category: - - Time Period Items (75 total): - * Present: 15 items - * Past 5: 15 items - * Past 10: 15 items - * Future 5: 15 items - * Future 10: 15 items - - - DGEN Variables (28 total): - * Domain scores: 12 (3 domains × 4 time periods) - * Time period means: 4 (1 per time period) - * 5-vs-10 differences: 6 (3 domains × 2 directions) - * Combined means: 6 (2 direction-based, 2 interval-based, 2 domain-based) - - - Cognitive Scales (3 total): - * AOT total - * CRT correct - * CRT intuitive - - - Time Differences (90 total): - * NPast_5: 15 differences - * NPast_10: 15 differences - * NFut_5: 15 differences - * NFut_10: 15 differences - * 5.10past: 15 differences - * 5.10fut: 15 differences - - - Domain Means for Differences (18 total): - * NPast_5: 3 domain means - * NPast_10: 3 domain means - * NFut_5: 3 domain means - * NFut_10: 3 domain means - * 5.10past: 3 domain means - * 5.10fut: 3 domain means - - - Interval × Direction Means (11 total): - * Narrow-scope means: 6 (NPast_5, NPast_10, NFut_5, NFut_10, - X5.10past, X5.10fut) - * Global-scope means: 5 (NPast_global, NFut_global, X5.10_global, - N5_global, N10_global) - - - EHI Variables (65 total): - * Domain-specific EHI: 45 (15 items × 3 time intervals) - * DGEN EHI: 6 (3 domains × 2 time intervals) - * Domain means: 9 (3 domains × 3 time intervals) - * Global means: 5 (2 DGEN + 3 domain-specific) - - -================================================================================ -DATA PROCESSING NOTES -================================================================================ - -1. PROCESSING ORDER: - Scripts MUST be run in numerical order (01 → 14) as later scripts depend - on variables created by earlier scripts. - - Key Dependencies: - - Script 03 required before Scripts 04, 08, 10, 12 (DGEN scores) - - Script 04 required before Script 10 (DGEN time period means) - - Script 06 required before Scripts 07, 09, 11 (time interval differences) - - Script 11 required before Script 13 (domain-specific EHI items) - - Script 12 required before Script 14 (DGEN EHI scores) - - Script 13 required before Script 14 (EHI domain means) - -2. SURVEY VERSION HANDLING: - - Two survey versions (01 and 02) were used - - Scripts 01 and 03 combine these versions - - Preference given to version 01 when both exist - -3. MISSING DATA: - - Empty cells and NA values are preserved throughout processing - - Calculations use na.rm=TRUE to exclude missing values from means - - Difference calculations result in NA if either source value is missing - -4. QUALITY ASSURANCE: - - Each script includes QA checks with random row verification - - Manual calculation checks confirm proper transformations - - Column existence checks prevent errors from missing source data - - Script 09 includes comprehensive QA with first 5 rows for all 11 variables - -5. FILE SAVING: - - Most scripts save directly to eohi2.csv - - Scripts 04, 06, and 07 have commented-out write commands for review - - Scripts 08 and 09 save directly to eohi2.csv - - Each script overwrites existing target columns if present - -6.
SPECIAL NAMING CONVENTIONS: - - "pref_tv" vs "pref_TV" inconsistency maintained from source data - - DGEN variables use underscores (DGEN_past_5_Pref) - - Difference variables use descriptive prefixes (NPast_5_, 5.10past_) - - "X" prefix added to variables starting with numbers (X5.10past_mean) - - Global means use "_global_" to distinguish from narrow-scope means - - -================================================================================ -ITEM REFERENCE GUIDE -================================================================================ - -15 Core Items (Used across all time periods): - -PREFERENCES (5 items): - 1. pref_read - Reading preferences - 2. pref_music - Music preferences - 3. pref_TV/tv - TV watching preferences (note case variation) - 4. pref_nap - Napping preferences - 5. pref_travel - Travel preferences - -PERSONALITY (5 items): - 6. pers_extravert - Extraverted personality - 7. pers_critical - Critical thinking personality - 8. pers_dependable - Dependable personality - 9. pers_anxious - Anxious personality - 10. pers_complex - Complex personality - -VALUES (5 items): - 11. val_obey - Value of obedience - 12. val_trad - Value of tradition - 13. val_opinion - Value of expressing opinions - 14. val_performance - Value of performance - 15. val_justice - Value of justice - - -================================================================================ -END OF DOCUMENTATION -================================================================================ -Last Updated: October 8, 2025 - diff --git a/.history/eohi2/README_Variable_Creation_20251008171528.txt b/.history/eohi2/README_Variable_Creation_20251008171528.txt deleted file mode 100644 index 1655c07..0000000 --- a/.history/eohi2/README_Variable_Creation_20251008171528.txt +++ /dev/null @@ -1,936 +0,0 @@ -================================================================================ -EOHI2 DATA PROCESSING PIPELINE - VARIABLE CREATION DOCUMENTATION -================================================================================ - -This README documents the complete data processing pipeline for eohi2.csv. -All processing scripts should be run in the order listed below. - -Source File: eohi2.csv -Processing Scripts: dataP 01 through dataP 14 - -================================================================================ -SCRIPT 01: dataP 01 - recode and combine past & future vars.r -================================================================================ - -PURPOSE: - Combines responses from two survey versions (01 and 02) and recodes Likert - scale text responses to numeric values for past and future time periods.
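
ILLUSTRATIVE SKETCH (not the script itself):
  A minimal R sketch of this combine-then-recode step for a single item pair,
  under two assumptions: the "X" prefix on the raw column names reflects
  read.csv()'s default name checking (columns starting with a digit get an
  "X" prepended), and the recode map follows the TRANSFORMATION LOGIC listed
  below. The real dataP 01 loops over all 60 item pairs.

    likert_map <- c("Strongly Disagree"          = -3,
                    "Disagree"                   = -2,
                    "Somewhat Disagree"          = -1,
                    "Neither Agree nor Disagree" =  0,
                    "Somewhat Agree"             =  1,
                    "Agree"                      =  2,
                    "Strongly Agree"             =  3)

    df <- read.csv("eohi2.csv", stringsAsFactors = FALSE)

    # Combine versions: prefer version 01 (Set A); fall back to version 02 (Set B)
    raw <- ifelse(!is.na(df$X01past5PrefItem_1) & df$X01past5PrefItem_1 != "",
                  df$X01past5PrefItem_1,
                  df$X02past5PrefItem_1)

    # Recode text to numeric; unmatched or empty responses become NA
    df$past_5_pref_read <- unname(likert_map[raw])
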
- -VARIABLES CREATED: 60 total (15 items × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefItem_1 through 01fut10ValItem_5 (60 columns) - - Set B: 02past5PrefItem_1 through 02fut10ValItem_5 (60 columns) - -TARGET VARIABLES: - Past 5 Years (15 variables): - - past_5_pref_read, past_5_pref_music, past_5_pref_TV, past_5_pref_nap, - past_5_pref_travel - - past_5_pers_extravert, past_5_pers_critical, past_5_pers_dependable, - past_5_pers_anxious, past_5_pers_complex - - past_5_val_obey, past_5_val_trad, past_5_val_opinion, - past_5_val_performance, past_5_val_justice - - Past 10 Years (15 variables): - - past_10_pref_read, past_10_pref_music, past_10_pref_TV, past_10_pref_nap, - past_10_pref_travel - - past_10_pers_extravert, past_10_pers_critical, past_10_pers_dependable, - past_10_pers_anxious, past_10_pers_complex - - past_10_val_obey, past_10_val_trad, past_10_val_opinion, - past_10_val_performance, past_10_val_justice - - Future 5 Years (15 variables): - - fut_5_pref_read, fut_5_pref_music, fut_5_pref_TV, fut_5_pref_nap, - fut_5_pref_travel - - fut_5_pers_extravert, fut_5_pers_critical, fut_5_pers_dependable, - fut_5_pers_anxious, fut_5_pers_complex - - fut_5_val_obey, fut_5_val_trad, fut_5_val_opinion, - fut_5_val_performance, fut_5_val_justice - - Future 10 Years (15 variables): - - fut_10_pref_read, fut_10_pref_music, fut_10_pref_TV, fut_10_pref_nap, - fut_10_pref_travel - - fut_10_pers_extravert, fut_10_pers_critical, fut_10_pers_dependable, - fut_10_pers_anxious, fut_10_pers_complex - - fut_10_val_obey, fut_10_val_trad, fut_10_val_opinion, - fut_10_val_performance, fut_10_val_justice - -TRANSFORMATION LOGIC: - Step 1: Combine responses from Set A (01) and Set B (02) - - If Set A has a value, use Set A - - If Set A is empty, use Set B - - Step 2: Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -ITEM DOMAINS: - - Preferences (pref): Reading, Music, TV, Nap, Travel - - Personality (pers): Extravert, Critical, Dependable, Anxious, Complex - - Values (val): Obey, Tradition, Opinion, Performance, Justice - - -================================================================================ -SCRIPT 02: dataP 02 - recode present VARS.r -================================================================================ - -PURPOSE: - Recodes present-time Likert scale text responses to numeric values. - -VARIABLES CREATED: 15 total - -SOURCE COLUMNS: - - prePrefItem_1 through prePrefItem_5 (5 columns) - - prePersItem_1 through prePersItem_5 (5 columns) - - preValItem_1 through preValItem_5 (5 columns) - -TARGET VARIABLES: - Present Time (15 variables): - - present_pref_read, present_pref_music, present_pref_tv, present_pref_nap, - present_pref_travel - - present_pers_extravert, present_pers_critical, present_pers_dependable, - present_pers_anxious, present_pers_complex - - present_val_obey, present_val_trad, present_val_opinion, - present_val_performance, present_val_justice - -TRANSFORMATION LOGIC: - Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -SPECIAL NOTE: - Present time uses "present_pref_tv" (lowercase) while past/future use - "past_5_pref_TV" (uppercase). 
This is intentional and preserved from the - original data structure. - - -================================================================================ -SCRIPT 03: dataP 03 - recode DGEN vars.r -================================================================================ - -PURPOSE: - Combines DGEN (domain general) responses from two survey versions (01 and 02). - These are single-item measures for each domain/time combination. - NO RECODING - just copies numeric values as-is. - -VARIABLES CREATED: 12 total (3 domains × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefDGEN_1, 01past5PersDGEN_1, 01past5ValDGEN_1, etc. - - Set B: 02past5PrefDGEN_1, 02past5PersDGEN_1, 02past5ValDGEN_1, etc. - -TARGET VARIABLES: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TRANSFORMATION LOGIC: - - If Set A (01) has a value, use Set A - - If Set A is empty, use Set B (02) - - NO RECODING: Values are copied directly as numeric - -SPECIAL NOTES: - - Future columns in raw data use "_8" suffix for Pref/Pers items - - Future Val columns use "ValuesDGEN" spelling in Set A, "ValDGEN" in Set B - - -================================================================================ -SCRIPT 04: dataP 04 - DGEN means.r -================================================================================ - -PURPOSE: - Calculates mean DGEN scores by averaging the three domain scores (Preferences, - Personality, Values) for each time period. - -VARIABLES CREATED: 4 total (1 per time period) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - - DGEN_past_5_mean - - DGEN_past_10_mean - - DGEN_fut_5_mean - - DGEN_fut_10_mean - -TRANSFORMATION LOGIC: - Each mean = (Pref + Pers + Val) / 3 - - NA values are excluded from calculation (na.rm = TRUE) - - -================================================================================ -SCRIPT 05: dataP 05 - recode scales VARS.r -================================================================================ - -PURPOSE: - Processes two cognitive scales: - 1. AOT (Actively Open-minded Thinking): 8-item scale with reverse coding - 2. CRT (Cognitive Reflection Test): 3-item test with correct/intuitive scoring - -VARIABLES CREATED: 3 total - -SOURCE COLUMNS: - AOT Scale: - - aot_1, aot_2, aot_3, aot_4, aot_5, aot_6, aot_7, aot_8 - - CRT Test: - - crt_1, crt_2, crt_3 - -TARGET VARIABLES: - - aot_total (mean of 8 items with reverse coding) - - crt_correct (proportion of correct answers) - - crt_int (proportion of intuitive/incorrect answers) - -TRANSFORMATION LOGIC: - - AOT Scale (aot_total): - 1. Items 4, 5, 6, 7 are reverse coded by multiplying by -1 - 2. Calculate mean of all 8 items (with reverse coding applied) - 3. Original source values are NOT modified in the dataframe - 4. 
NA values excluded from calculation (na.rm = TRUE) - - CRT Correct (crt_correct): - Correct answers: - - crt_1: "5 cents" - - crt_2: "5 minutes" - - crt_3: "47 days" - Calculation: (Number of correct answers) / (Number of non-missing answers) - - CRT Intuitive (crt_int): - Intuitive (common incorrect) answers: - - crt_1: "10 cents" - - crt_2: "100 minutes" - - crt_3: "24 days" - Calculation: (Number of intuitive answers) / (Number of non-missing answers) - -SPECIAL NOTES: - - CRT scoring is case-insensitive and trims whitespace - - Both CRT scores are proportions (0.00 to 1.00) - - Empty/missing CRT responses are excluded from denominator - - -================================================================================ -SCRIPT 06: dataP 06 - time interval differences.r -================================================================================ - -PURPOSE: - Calculates absolute differences between time intervals to measure perceived - change across time periods for all 15 items. - -VARIABLES CREATED: 90 total (6 difference types × 15 items) - -SOURCE COLUMNS: - - present_pref_read through present_val_justice (15 columns) - - past_5_pref_read through past_5_val_justice (15 columns) - - past_10_pref_read through past_10_val_justice (15 columns) - - fut_5_pref_read through fut_5_val_justice (15 columns) - - fut_10_pref_read through fut_10_val_justice (15 columns) - -TARGET VARIABLES (by difference type): - - NPast_5 (Present vs Past 5 years) - 15 variables: - Formula: |present - past_5| - - NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, NPast_5_pref_nap, - NPast_5_pref_travel - - NPast_5_pers_extravert, NPast_5_pers_critical, NPast_5_pers_dependable, - NPast_5_pers_anxious, NPast_5_pers_complex - - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice - - NPast_10 (Present vs Past 10 years) - 15 variables: - Formula: |present - past_10| - - NPast_10_pref_read, NPast_10_pref_music, NPast_10_pref_TV, - NPast_10_pref_nap, NPast_10_pref_travel - - NPast_10_pers_extravert, NPast_10_pers_critical, NPast_10_pers_dependable, - NPast_10_pers_anxious, NPast_10_pers_complex - - NPast_10_val_obey, NPast_10_val_trad, NPast_10_val_opinion, - NPast_10_val_performance, NPast_10_val_justice - - NFut_5 (Present vs Future 5 years) - 15 variables: - Formula: |present - fut_5| - - NFut_5_pref_read, NFut_5_pref_music, NFut_5_pref_TV, NFut_5_pref_nap, - NFut_5_pref_travel - - NFut_5_pers_extravert, NFut_5_pers_critical, NFut_5_pers_dependable, - NFut_5_pers_anxious, NFut_5_pers_complex - - NFut_5_val_obey, NFut_5_val_trad, NFut_5_val_opinion, - NFut_5_val_performance, NFut_5_val_justice - - NFut_10 (Present vs Future 10 years) - 15 variables: - Formula: |present - fut_10| - - NFut_10_pref_read, NFut_10_pref_music, NFut_10_pref_TV, NFut_10_pref_nap, - NFut_10_pref_travel - - NFut_10_pers_extravert, NFut_10_pers_critical, NFut_10_pers_dependable, - NFut_10_pers_anxious, NFut_10_pers_complex - - NFut_10_val_obey, NFut_10_val_trad, NFut_10_val_opinion, - NFut_10_val_performance, NFut_10_val_justice - - 5.10past (Past 5 vs Past 10 years) - 15 variables: - Formula: |past_5 - past_10| - - 5.10past_pref_read, 5.10past_pref_music, 5.10past_pref_TV, - 5.10past_pref_nap, 5.10past_pref_travel - - 5.10past_pers_extravert, 5.10past_pers_critical, 5.10past_pers_dependable, - 5.10past_pers_anxious, 5.10past_pers_complex - - 5.10past_val_obey, 5.10past_val_trad, 5.10past_val_opinion, - 5.10past_val_performance, 5.10past_val_justice - - 5.10fut (Future 5 vs Future 
10 years) - 15 variables: - Formula: |fut_5 - fut_10| - - 5.10fut_pref_read, 5.10fut_pref_music, 5.10fut_pref_TV, 5.10fut_pref_nap, - 5.10fut_pref_travel - - 5.10fut_pers_extravert, 5.10fut_pers_critical, 5.10fut_pers_dependable, - 5.10fut_pers_anxious, 5.10fut_pers_complex - - 5.10fut_val_obey, 5.10fut_val_trad, 5.10fut_val_opinion, - 5.10fut_val_performance, 5.10fut_val_justice - -TRANSFORMATION LOGIC: - All calculations use absolute differences: - - NPast_5: |present_[item] - past_5_[item]| - - NPast_10: |present_[item] - past_10_[item]| - - NFut_5: |present_[item] - fut_5_[item]| - - NFut_10: |present_[item] - fut_10_[item]| - - 5.10past: |past_5_[item] - past_10_[item]| - - 5.10fut: |fut_5_[item] - fut_10_[item]| - - Result: Always positive values representing magnitude of change - Missing values in either source column result in NA - -SPECIAL NOTES: - - Present time uses "pref_tv" (lowercase) while past/future use "pref_TV" - (uppercase), so script handles this naming inconsistency - - All values are absolute differences (non-negative) - - -================================================================================ -SCRIPT 07: dataP 07 - domain means.r -================================================================================ - -PURPOSE: - Calculates domain-level means by averaging the 5 items within each domain - (Preferences, Personality, Values) for each of the 6 time interval difference - types. - -VARIABLES CREATED: 18 total (6 time intervals × 3 domains) - -SOURCE COLUMNS: - - NPast_5_pref_read through NPast_5_val_justice (15 columns) - - NPast_10_pref_read through NPast_10_val_justice (15 columns) - - NFut_5_pref_read through NFut_5_val_justice (15 columns) - - NFut_10_pref_read through NFut_10_val_justice (15 columns) - - 5.10past_pref_read through 5.10past_val_justice (15 columns) - - 5.10fut_pref_read through 5.10fut_val_justice (15 columns) - Total: 90 difference columns (created in Script 06) - -TARGET VARIABLES: - NPast_5 Domain Means (3 variables): - - NPast_5_pref_MEAN (mean of 5 preference items) - - NPast_5_pers_MEAN (mean of 5 personality items) - - NPast_5_val_MEAN (mean of 5 values items) - - NPast_10 Domain Means (3 variables): - - NPast_10_pref_MEAN - - NPast_10_pers_MEAN - - NPast_10_val_MEAN - - NFut_5 Domain Means (3 variables): - - NFut_5_pref_MEAN - - NFut_5_pers_MEAN - - NFut_5_val_MEAN - - NFut_10 Domain Means (3 variables): - - NFut_10_pref_MEAN - - NFut_10_pers_MEAN - - NFut_10_val_MEAN - - 5.10past Domain Means (3 variables): - - 5.10past_pref_MEAN - - 5.10past_pers_MEAN - - 5.10past_val_MEAN - - 5.10fut Domain Means (3 variables): - - 5.10fut_pref_MEAN - - 5.10fut_pers_MEAN - - 5.10fut_val_MEAN - -TRANSFORMATION LOGIC: - Each domain mean = average of 5 items within that domain - - Example for NPast_5_pref_MEAN: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel) - - Example for NFut_10_pers_MEAN: - = mean(NFut_10_pers_extravert, NFut_10_pers_critical, - NFut_10_pers_dependable, NFut_10_pers_anxious, - NFut_10_pers_complex) - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF DOMAIN MEANS: - - Provides higher-level summary of perceived change by domain - - Reduces item-level noise by aggregating across related items - - Enables domain-level comparisons across time intervals - - Parallel to Script 04 (DGEN means) but for difference scores instead of - raw DGEN ratings - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - Creates domain-level aggregates 
of absolute difference scores - - All means are averages of non-negative values (absolute differences) - - -================================================================================ -SCRIPT 08: dataP 08 - DGEN 510 vars.r -================================================================================ - -PURPOSE: - Calculates absolute differences between 5-year and 10-year DGEN ratings for - both Past and Future time directions. These variables measure the perceived - difference in domain-general change between the two time intervals. - -VARIABLES CREATED: 6 total (3 domains × 2 time directions) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - Total: 12 DGEN columns (created in Script 03) - -TARGET VARIABLES: - Past Direction (3 variables): - - X5_10DGEN_past_pref (|DGEN_past_5_Pref - DGEN_past_10_Pref|) - - X5_10DGEN_past_pers (|DGEN_past_5_Pers - DGEN_past_10_Pers|) - - X5_10DGEN_past_val (|DGEN_past_5_Val - DGEN_past_10_Val|) - - Future Direction (3 variables): - - X5_10DGEN_fut_pref (|DGEN_fut_5_Pref - DGEN_fut_10_Pref|) - - X5_10DGEN_fut_pers (|DGEN_fut_5_Pers - DGEN_fut_10_Pers|) - - X5_10DGEN_fut_val (|DGEN_fut_5_Val - DGEN_fut_10_Val|) - -TRANSFORMATION LOGIC: - Formula: |DGEN_5 - DGEN_10| - - All calculations use absolute differences: - - Past Preferences: |DGEN_past_5_Pref - DGEN_past_10_Pref| - - Past Personality: |DGEN_past_5_Pers - DGEN_past_10_Pers| - - Past Values: |DGEN_past_5_Val - DGEN_past_10_Val| - - Future Preferences: |DGEN_fut_5_Pref - DGEN_fut_10_Pref| - - Future Personality: |DGEN_fut_5_Pers - DGEN_fut_10_Pers| - - Future Values: |DGEN_fut_5_Val - DGEN_fut_10_Val| - - Result: Always positive values representing magnitude of difference - Missing values in either source column result in NA - -SPECIAL NOTES: - - Variable names use "X" prefix because R automatically adds it to column - names starting with numbers (5_10 becomes X5_10) - - This script depends on Script 03 being run first - - Measures interval effects within time direction (past vs future) - - Parallel to Script 06's 5.10past and 5.10fut variables but for DGEN scores - - -================================================================================ -SCRIPT 09: dataP 09 - interval x direction means.r -================================================================================ - -PURPOSE: - Calculates comprehensive mean scores by averaging item-level differences - across intervals and directions. Creates both narrow-scope means (single - time interval) and broad-scope global means (combining multiple intervals). 
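
ILLUSTRATIVE SKETCH (not the script itself):
  A sketch of the two aggregation levels, assuming the Script 06 difference
  columns already exist in df; row_mean_of() is a hypothetical helper, not a
  function from the pipeline.

    # Hypothetical helper: row-wise mean over a set of columns
    # (a row with no valid values yields NaN)
    row_mean_of <- function(df, cols) rowMeans(df[, cols], na.rm = TRUE)

    items <- c("pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel",
               "pers_extravert", "pers_critical", "pers_dependable",
               "pers_anxious", "pers_complex",
               "val_obey", "val_trad", "val_opinion",
               "val_performance", "val_justice")

    # Narrow-scope mean: one time interval, all 15 items
    df$NPast_5_mean <- row_mean_of(df, paste0("NPast_5_", items))

    # Global-scope mean: two related intervals, 30 items
    df$NPast_global_mean <- row_mean_of(
      df, c(paste0("NPast_5_", items), paste0("NPast_10_", items)))
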
- -VARIABLES CREATED: 11 total (6 narrow-scope + 5 global-scope) - -SOURCE COLUMNS: - All 90 difference variables created in Script 06: - - NPast_5_[domain]_[item] (15 variables) - - NPast_10_[domain]_[item] (15 variables) - - NFut_5_[domain]_[item] (15 variables) - - NFut_10_[domain]_[item] (15 variables) - - X5.10past_[domain]_[item] (15 variables) - - X5.10fut_[domain]_[item] (15 variables) - -TARGET VARIABLES: - - Narrow-Scope Means (15 source items each): - - NPast_5_mean (mean across all 15 NPast_5 items) - - NPast_10_mean (mean across all 15 NPast_10 items) - - NFut_5_mean (mean across all 15 NFut_5 items) - - NFut_10_mean (mean across all 15 NFut_10 items) - - X5.10past_mean (mean across all 15 X5.10past items) - - X5.10fut_mean (mean across all 15 X5.10fut items) - - Global-Scope Means (30 source items each): - - NPast_global_mean (NPast_5 + NPast_10: all past intervals) - - NFut_global_mean (NFut_5 + NFut_10: all future intervals) - - X5.10_global_mean (X5.10past + X5.10fut: all 5-vs-10 intervals) - - N5_global_mean (NPast_5 + NFut_5: all 5-year intervals) - - N10_global_mean (NPast_10 + NFut_10: all 10-year intervals) - -TRANSFORMATION LOGIC: - - Narrow-Scope Means (15 items each): - Each mean averages all 15 difference items within one time interval - - Example for NPast_5_mean: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel, - NPast_5_pers_extravert, NPast_5_pers_critical, - NPast_5_pers_dependable, NPast_5_pers_anxious, - NPast_5_pers_complex, - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice) - - Global-Scope Means (30 items each): - Each mean averages 30 difference items across two related intervals - - Example for NPast_global_mean: - = mean(all 15 NPast_5 items + all 15 NPast_10 items) - Represents overall perceived change from present to any past timepoint - - Example for N5_global_mean: - = mean(all 15 NPast_5 items + all 15 NFut_5 items) - Represents overall perceived change at 5-year interval regardless of - direction - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF INTERVAL × DIRECTION MEANS: - - Narrow-scope means: Single-interval summaries across all domains and items - - Global-scope means: Cross-interval summaries for testing: - * Direction effects (past vs future) - * Interval effects (5-year vs 10-year) - * Combined temporal distance effects - - Enables comprehensive analysis of temporal self-perception patterns - - Reduces item-level and domain-level noise through broad aggregation - -QUALITY ASSURANCE: - - Script includes automated QA checks for first 5 rows - - Manually recalculates each mean and verifies against stored values - - Prints TRUE/FALSE match status for each variable - - Ensures calculation accuracy before further analysis - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - All means are averages of absolute difference scores (non-negative) - - Global means provide the broadest temporal self-perception summaries - - Naming convention uses "global" for 30-item means, no suffix for 15-item - - -================================================================================ -SCRIPT 10: dataP 10 - DGEN mean vars.r -================================================================================ - -PURPOSE: - Calculates mean DGEN scores by averaging across different time combinations. - Creates means for Past, Future, and interval-based (5-year, 10-year) groupings. 
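-
-  As a preview of the combinations detailed below, a minimal sketch under
-  the assumption that the Script 04 time period means already exist in df
-  (illustrative, not the script's verbatim code):
-
-    # Direction-based: average the two interval means within one direction
-    df$DGEN_past_mean <- rowMeans(df[c("DGEN_past_5_mean",
-                                       "DGEN_past_10_mean")], na.rm = TRUE)
-    # Interval-based: average across directions at a fixed interval
-    df$DGEN_5_mean <- rowMeans(df[c("DGEN_past_5_mean",
-                                    "DGEN_fut_5_mean")], na.rm = TRUE)
-    # Domain-based: average one domain across all four time periods
-    df$DGEN_pref_mean <- rowMeans(df[c("DGEN_past_5_Pref", "DGEN_past_10_Pref",
-                                       "DGEN_fut_5_Pref", "DGEN_fut_10_Pref")],
-                                  na.rm = TRUE)
-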
- -VARIABLES CREATED: 6 total - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - Direction-Based Means (2 variables): - - DGEN_past_mean (mean of past_5_mean and past_10_mean) - - DGEN_fut_mean (mean of fut_5_mean and fut_10_mean) - - Interval-Based Means (2 variables): - - DGEN_5_mean (mean of past_5_mean and fut_5_mean) - - DGEN_10_mean (mean of past_10_mean and fut_10_mean) - - Domain-Based Means (2 variables): - - DGEN_pref_mean (mean across all 4 time periods for Preferences) - - DGEN_pers_mean (mean across all 4 time periods for Personality) - -TRANSFORMATION LOGIC: - Direction-based: - - DGEN_past_mean = mean(DGEN_past_5_mean, DGEN_past_10_mean) - - DGEN_fut_mean = mean(DGEN_fut_5_mean, DGEN_fut_10_mean) - - Interval-based: - - DGEN_5_mean = mean(DGEN_past_5_mean, DGEN_fut_5_mean) - - DGEN_10_mean = mean(DGEN_past_10_mean, DGEN_fut_10_mean) - - Domain-based: - - DGEN_pref_mean = mean across all 4 Pref scores - - DGEN_pers_mean = mean across all 4 Pers scores - - NA values excluded from calculation (na.rm = TRUE) - - -================================================================================ -SCRIPT 11: dataP 11 - CORRECT ehi vars.r -================================================================================ - -PURPOSE: - Creates Enduring Hedonic Impact (EHI) variables by calculating differences - between Past and Future responses for each item across different time intervals. - Formula: NPast - NFut (positive values indicate greater past-present change) - -VARIABLES CREATED: 45 total (15 items × 3 time intervals) - -SOURCE COLUMNS: - 5-year intervals: - - NPast_5_pref_read through NPast_5_val_justice (15 columns) - - NFut_5_pref_read through NFut_5_val_justice (15 columns) - - 10-year intervals: - - NPast_10_pref_read through NPast_10_val_justice (15 columns) - - NFut_10_pref_read through NFut_10_val_justice (15 columns) - - 5-10 year change: - - X5.10past_pref_read through X5.10past_val_justice (15 columns) - - X5.10fut_pref_read through X5.10fut_val_justice (15 columns) - -TARGET VARIABLES: - 5-Year EHI Variables (15 variables): - - ehi5_pref_read, ehi5_pref_music, ehi5_pref_TV, ehi5_pref_nap, - ehi5_pref_travel - - ehi5_pers_extravert, ehi5_pers_critical, ehi5_pers_dependable, - ehi5_pers_anxious, ehi5_pers_complex - - ehi5_val_obey, ehi5_val_trad, ehi5_val_opinion, ehi5_val_performance, - ehi5_val_justice - - 10-Year EHI Variables (15 variables): - - ehi10_pref_read, ehi10_pref_music, ehi10_pref_TV, ehi10_pref_nap, - ehi10_pref_travel - - ehi10_pers_extravert, ehi10_pers_critical, ehi10_pers_dependable, - ehi10_pers_anxious, ehi10_pers_complex - - ehi10_val_obey, ehi10_val_trad, ehi10_val_opinion, ehi10_val_performance, - ehi10_val_justice - - 5-10 Year Change EHI Variables (15 variables): - - ehi5.10_pref_read, ehi5.10_pref_music, ehi5.10_pref_TV, ehi5.10_pref_nap, - ehi5.10_pref_travel - - ehi5.10_pers_extravert, ehi5.10_pers_critical, ehi5.10_pers_dependable, - ehi5.10_pers_anxious, ehi5.10_pers_complex - - ehi5.10_val_obey, ehi5.10_val_trad, ehi5.10_val_opinion, - ehi5.10_val_performance, ehi5.10_val_justice - -TRANSFORMATION LOGIC: - Formula: NPast - NFut - - All calculations use signed differences: - - ehi5_[item] = NPast_5_[item] - NFut_5_[item] - - ehi10_[item] = NPast_10_[item] - NFut_10_[item] - - ehi5.10_[item] = X5.10past_[item] - 
X5.10fut_[item] - - Result: Positive = greater past change, Negative = greater future change - Missing values in either source column result in NA - -QUALITY ASSURANCE: - - Comprehensive QA checks for all 45 variables across all rows - - First 5 rows displayed with detailed calculations showing source values, - computed differences, and stored values - - Pass/Fail status for each variable reported - - -================================================================================ -SCRIPT 12: dataP 12 - CORRECT DGEN ehi vars.r -================================================================================ - -PURPOSE: - Creates domain-general EHI variables by calculating differences between Past - and Future DGEN responses. These are the domain-general parallel to Script 11's - domain-specific EHI variables. - -VARIABLES CREATED: 6 total (3 domains × 2 time intervals) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - 5-Year DGEN EHI (3 variables): - - ehiDGEN_5_Pref - - ehiDGEN_5_Pers - - ehiDGEN_5_Val - - 10-Year DGEN EHI (3 variables): - - ehiDGEN_10_Pref - - ehiDGEN_10_Pers - - ehiDGEN_10_Val - -TRANSFORMATION LOGIC: - Formula: DGEN_past - DGEN_fut - - All calculations use signed differences: - - ehiDGEN_5_Pref = DGEN_past_5_Pref - DGEN_fut_5_Pref - - ehiDGEN_5_Pers = DGEN_past_5_Pers - DGEN_fut_5_Pers - - ehiDGEN_5_Val = DGEN_past_5_Val - DGEN_fut_5_Val - - ehiDGEN_10_Pref = DGEN_past_10_Pref - DGEN_fut_10_Pref - - ehiDGEN_10_Pers = DGEN_past_10_Pers - DGEN_fut_10_Pers - - ehiDGEN_10_Val = DGEN_past_10_Val - DGEN_fut_10_Val - - Result: Positive = greater past change, Negative = greater future change - -QUALITY ASSURANCE: - - QA checks for all 6 variables across all rows - - First 5 rows displayed with detailed calculations - - Pass/Fail status for each variable reported - - -================================================================================ -SCRIPT 13: datap 13 - ehi domain specific means.r -================================================================================ - -PURPOSE: - Calculates domain-level mean EHI scores by averaging the 5 items within each - domain (Preferences, Personality, Values) for each time interval. 
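-
-  The signed-difference step from Scripts 11-12 and the averaging step
-  described here can be sketched as follows (a hedged illustration, assuming
-  df holds the columns created by the earlier scripts; not the scripts'
-  verbatim code):
-
-    # Script 11 logic: signed difference, past change minus future change;
-    # plain subtraction yields NA whenever either source value is NA
-    df$ehi5_pref_read <- df$NPast_5_pref_read - df$NFut_5_pref_read
-
-    # Script 13 logic: domain mean of the five preference EHI items
-    df$ehi5_pref_MEAN <- rowMeans(df[c("ehi5_pref_read", "ehi5_pref_music",
-                                       "ehi5_pref_TV", "ehi5_pref_nap",
-                                       "ehi5_pref_travel")], na.rm = TRUE)
-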
- -VARIABLES CREATED: 9 total (3 domains × 3 time intervals) - -SOURCE COLUMNS: - - ehi5_pref_read through ehi5_val_justice (15 columns) - - ehi10_pref_read through ehi10_val_justice (15 columns) - - ehi5.10_pref_read through ehi5.10_val_justice (15 columns) - -TARGET VARIABLES: - 5-Year Domain Means (3 variables): - - ehi5_pref_MEAN (mean of 5 preference items) - - ehi5_pers_MEAN (mean of 5 personality items) - - ehi5_val_MEAN (mean of 5 values items) - - 10-Year Domain Means (3 variables): - - ehi10_pref_MEAN - - ehi10_pers_MEAN - - ehi10_val_MEAN - - 5-10 Year Change Domain Means (3 variables): - - ehi5.10_pref_MEAN - - ehi5.10_pers_MEAN - - ehi5.10_val_MEAN - -TRANSFORMATION LOGIC: - Each domain mean = average of 5 items within that domain - - Example for ehi5_pref_MEAN: - = mean(ehi5_pref_read, ehi5_pref_music, ehi5_pref_TV, - ehi5_pref_nap, ehi5_pref_travel) - - NA values excluded from calculation (na.rm = TRUE) - -QUALITY ASSURANCE: - - Comprehensive QA for all 9 variables across all rows - - First 5 rows displayed for multiple domain means - - Pass/Fail status for each variable - -
-================================================================================ -SCRIPT 14: datap 14 - all ehi global means.r -================================================================================ - -PURPOSE: - Calculates global EHI means by averaging domain-level means. Creates the - highest-level summary scores for EHI across both domain-general and - domain-specific measures. - -VARIABLES CREATED: 5 total - -SOURCE COLUMNS: - - ehiDGEN_5_Pref, ehiDGEN_5_Pers, ehiDGEN_5_Val - - ehiDGEN_10_Pref, ehiDGEN_10_Pers, ehiDGEN_10_Val - - ehi5_pref_MEAN, ehi5_pers_MEAN, ehi5_val_MEAN - - ehi10_pref_MEAN, ehi10_pers_MEAN, ehi10_val_MEAN - - ehi5.10_pref_MEAN, ehi5.10_pers_MEAN, ehi5.10_val_MEAN - -TARGET VARIABLES: - DGEN Global Means (2 variables): - - ehiDGEN_5_mean (mean of 3 DGEN domains for 5-year) - - ehiDGEN_10_mean (mean of 3 DGEN domains for 10-year) - - Domain-Specific Global Means (3 variables): - - ehi5_global_mean (mean of 3 domain means for 5-year) - - ehi10_global_mean (mean of 3 domain means for 10-year) - - ehi5.10_global_mean (mean of 3 domain means for 5-10 change) - -TRANSFORMATION LOGIC: - Each global mean = average of 3 domain-level scores - - Example for ehiDGEN_5_mean: - = mean(ehiDGEN_5_Pref, ehiDGEN_5_Pers, ehiDGEN_5_Val) - - Example for ehi5_global_mean: - = mean(ehi5_pref_MEAN, ehi5_pers_MEAN, ehi5_val_MEAN) - - NA values excluded from calculation (na.rm = TRUE) - -QUALITY ASSURANCE: - - QA for all 5 global means across all rows - - First 5 rows displayed with detailed calculations - - Values shown with 5 decimal precision - - Pass/Fail status for each variable - -
-================================================================================ -SUMMARY OF ALL CREATED VARIABLES -================================================================================ - -Total Variables Created: 290 - -By Script: - - Script 01: 60 variables (past/future recoded items) - - Script 02: 15 variables (present recoded items) - - Script 03: 12 variables (DGEN domain scores) - - Script 04: 4 variables (DGEN time period means) - - Script 05: 3 variables (AOT & CRT scales) - - Script 06: 90 variables (time interval differences) - - Script 07: 18 variables (domain means for differences) - - Script 08: 6 variables (DGEN 5-vs-10 differences) - - Script 09: 11 variables (interval × direction means) - - Script 10: 6 variables (DGEN combined means) - - Script 11: 45 variables (domain-specific EHI scores) - - Script 12: 6 variables (DGEN EHI scores) - - Script 13: 9 variables (EHI domain means) - - Script 14: 5 variables (EHI global means) - -By Category: - - Time Period Items (75 total): - * Present: 15 items - * Past 5: 15 items - * Past 10: 15 items - * Future 5: 15 items - * Future 10: 15 items - - - DGEN Variables (28 total): - * Domain scores: 12 (3 domains × 4 time periods) - * Time period means: 4 (1 per time period) - * 5-vs-10 differences: 6 (3 domains × 2 directions) - * Combined means: 6 (past, future, interval-based, domain-based) - - - Cognitive Scales (3 total): - * AOT total - * CRT correct - * CRT intuitive - - - Time Differences (90 total): - * NPast_5: 15 differences - * NPast_10: 15 differences - * NFut_5: 15 differences - * NFut_10: 15 differences - * 5.10past: 15 differences - * 5.10fut: 15 differences - - - Domain Means for Differences (18 total): - * NPast_5: 3 domain means - * NPast_10: 3 domain means - * NFut_5: 3 domain means - * NFut_10: 3 domain means - * 5.10past: 3 domain means - * 5.10fut: 3 domain means - - - Interval × Direction Means (11 total): - * Narrow-scope means: 6 (NPast_5, NPast_10, NFut_5, NFut_10, - X5.10past, X5.10fut) - * Global-scope means: 5 (NPast_global, NFut_global, X5.10_global, - N5_global, N10_global) - - - EHI Variables (65 total): - * Domain-specific EHI: 45 (15 items × 3 time intervals) - * DGEN EHI: 6 (3 domains × 2 time intervals) - * Domain means: 9 (3 domains × 3 time intervals) - * Global means: 5 (2 DGEN + 3 domain-specific) - -
-================================================================================ -DATA PROCESSING NOTES -================================================================================ - -1. PROCESSING ORDER: - Scripts MUST be run in numerical order (01 → 14) as later scripts depend - on variables created by earlier scripts. - - Key Dependencies: - - Script 03 required before Scripts 04, 08, 10, and 12 (DGEN scores) - - Script 04 required before Script 10 (DGEN time period means) - - Script 06 required before Scripts 07, 09, and 11 (time interval differences) - - Script 11 required before Script 13 (domain-specific EHI items) - - Script 12 required before Script 14 (DGEN EHI scores) - - Script 13 required before Script 14 (EHI domain means) - -2. SURVEY VERSION HANDLING: - - Two survey versions (01 and 02) were used - - Scripts 01 and 03 combine these versions - - Preference given to version 01 when both exist - -3. MISSING DATA: - - Empty cells and NA values are preserved throughout processing - - Calculations use na.rm=TRUE to exclude missing values from means - - Difference calculations result in NA if either source value is missing - -4. QUALITY ASSURANCE: - - Each script includes QA checks with row verification - - Manual calculation checks confirm proper transformations - - Column existence checks prevent errors from missing source data - - Scripts 09-14 include comprehensive QA with first 5 rows displayed - - All EHI scripts (11-14) verify calculations against stored values - - Pass/Fail status reported for all variables in QA-enabled scripts - -5. FILE SAVING: - - Most scripts save directly to eohi2.csv - - Scripts 04, 06, and 07 have commented-out write commands for review - - Scripts 08 and 09 save directly to eohi2.csv - - Each script overwrites existing target columns if present - -6. SPECIAL NAMING CONVENTIONS: - - "pref_tv" vs "pref_TV" inconsistency maintained from source data - - DGEN variables use underscores (DGEN_past_5_Pref) - - Difference variables use descriptive prefixes (NPast_5_, 5.10past_) - - "X" prefix added to variables starting with numbers (X5.10past_mean) - - Global means use "_global_" to distinguish from narrow-scope means - -
-================================================================================ -ITEM REFERENCE GUIDE -================================================================================ - -15 Core Items (Used across all time periods): - -PREFERENCES (5 items): - 1. pref_read - Reading preferences - 2. pref_music - Music preferences - 3. pref_TV/tv - TV watching preferences (note case variation) - 4. pref_nap - Napping preferences - 5. pref_travel - Travel preferences - -PERSONALITY (5 items): - 6. pers_extravert - Extraverted personality - 7. pers_critical - Critical thinking personality - 8. pers_dependable - Dependable personality - 9. pers_anxious - Anxious personality - 10. pers_complex - Complex personality - -VALUES (5 items): - 11. val_obey - Value of obedience - 12. val_trad - Value of tradition - 13. val_opinion - Value of expressing opinions - 14. val_performance - Value of performance - 15. val_justice - Value of justice - - -================================================================================ -END OF DOCUMENTATION -================================================================================ -Last Updated: October 8, 2025 -
diff --git a/.history/eohi2/README_Variable_Creation_20251008171541.txt b/.history/eohi2/README_Variable_Creation_20251008171541.txt deleted file mode 100644 index da6d9d1..0000000 --- a/.history/eohi2/README_Variable_Creation_20251008171541.txt +++ /dev/null @@ -1,971 +0,0 @@ -================================================================================ -EOHI2 DATA PROCESSING PIPELINE - VARIABLE CREATION DOCUMENTATION -================================================================================ - -This README documents the complete data processing pipeline for eohi2.csv. -All processing scripts should be run in the order listed below. - -Source File: eohi2.csv -Processing Scripts: dataP 01 through dataP 14 - -================================================================================ -SCRIPT 01: dataP 01 - recode and combine past & future vars.r -================================================================================ - -PURPOSE: - Combines responses from two survey versions (01 and 02) and recodes Likert - scale text responses to numeric values for past and future time periods.
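-
-  The combine-then-recode logic can be sketched in R as follows (a minimal
-  illustration for a single item, not the script itself; it assumes the raw
-  columns kept their original names, e.g. via read.csv(..., check.names =
-  FALSE)):
-
-    likert_map <- c("Strongly Disagree" = -3, "Disagree" = -2,
-                    "Somewhat Disagree" = -1,
-                    "Neither Agree nor Disagree" = 0,
-                    "Somewhat Agree" = 1, "Agree" = 2, "Strongly Agree" = 3)
-
-    # Step 1: prefer the version 01 response; fall back to version 02
-    a <- df[["01past5PrefItem_1"]]
-    b <- df[["02past5PrefItem_1"]]
-    raw <- ifelse(!is.na(a) & a != "", a, b)
-
-    # Step 2: named-vector lookup; unmatched or empty responses become NA
-    df$past_5_pref_read <- unname(likert_map[raw])
-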
- -VARIABLES CREATED: 60 total (15 items × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefItem_1 through 01fut10ValItem_5 (60 columns) - - Set B: 02past5PrefItem_1 through 02fut10ValItem_5 (60 columns) - -TARGET VARIABLES: - Past 5 Years (15 variables): - - past_5_pref_read, past_5_pref_music, past_5_pref_TV, past_5_pref_nap, - past_5_pref_travel - - past_5_pers_extravert, past_5_pers_critical, past_5_pers_dependable, - past_5_pers_anxious, past_5_pers_complex - - past_5_val_obey, past_5_val_trad, past_5_val_opinion, - past_5_val_performance, past_5_val_justice - - Past 10 Years (15 variables): - - past_10_pref_read, past_10_pref_music, past_10_pref_TV, past_10_pref_nap, - past_10_pref_travel - - past_10_pers_extravert, past_10_pers_critical, past_10_pers_dependable, - past_10_pers_anxious, past_10_pers_complex - - past_10_val_obey, past_10_val_trad, past_10_val_opinion, - past_10_val_performance, past_10_val_justice - - Future 5 Years (15 variables): - - fut_5_pref_read, fut_5_pref_music, fut_5_pref_TV, fut_5_pref_nap, - fut_5_pref_travel - - fut_5_pers_extravert, fut_5_pers_critical, fut_5_pers_dependable, - fut_5_pers_anxious, fut_5_pers_complex - - fut_5_val_obey, fut_5_val_trad, fut_5_val_opinion, - fut_5_val_performance, fut_5_val_justice - - Future 10 Years (15 variables): - - fut_10_pref_read, fut_10_pref_music, fut_10_pref_TV, fut_10_pref_nap, - fut_10_pref_travel - - fut_10_pers_extravert, fut_10_pers_critical, fut_10_pers_dependable, - fut_10_pers_anxious, fut_10_pers_complex - - fut_10_val_obey, fut_10_val_trad, fut_10_val_opinion, - fut_10_val_performance, fut_10_val_justice - -TRANSFORMATION LOGIC: - Step 1: Combine responses from Set A (01) and Set B (02) - - If Set A has a value, use Set A - - If Set A is empty, use Set B - - Step 2: Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -ITEM DOMAINS: - - Preferences (pref): Reading, Music, TV, Nap, Travel - - Personality (pers): Extravert, Critical, Dependable, Anxious, Complex - - Values (val): Obey, Tradition, Opinion, Performance, Justice - - -================================================================================ -SCRIPT 02: dataP 02 - recode present VARS.r -================================================================================ - -PURPOSE: - Recodes present-time Likert scale text responses to numeric values. - -VARIABLES CREATED: 15 total - -SOURCE COLUMNS: - - prePrefItem_1 through prePrefItem_5 (5 columns) - - prePersItem_1 through prePersItem_5 (5 columns) - - preValItem_1 through preValItem_5 (5 columns) - -TARGET VARIABLES: - Present Time (15 variables): - - present_pref_read, present_pref_music, present_pref_tv, present_pref_nap, - present_pref_travel - - present_pers_extravert, present_pers_critical, present_pers_dependable, - present_pers_anxious, present_pers_complex - - present_val_obey, present_val_trad, present_val_opinion, - present_val_performance, present_val_justice - -TRANSFORMATION LOGIC: - Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -SPECIAL NOTE: - Present time uses "present_pref_tv" (lowercase) while past/future use - "past_5_pref_TV" (uppercase). 
This is intentional and preserved from the - original data structure. - - -================================================================================ -SCRIPT 03: dataP 03 - recode DGEN vars.r -================================================================================ - -PURPOSE: - Combines DGEN (domain general) responses from two survey versions (01 and 02). - These are single-item measures for each domain/time combination. - NO RECODING - just copies numeric values as-is. - -VARIABLES CREATED: 12 total (3 domains × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefDGEN_1, 01past5PersDGEN_1, 01past5ValDGEN_1, etc. - - Set B: 02past5PrefDGEN_1, 02past5PersDGEN_1, 02past5ValDGEN_1, etc. - -TARGET VARIABLES: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TRANSFORMATION LOGIC: - - If Set A (01) has a value, use Set A - - If Set A is empty, use Set B (02) - - NO RECODING: Values are copied directly as numeric - -SPECIAL NOTES: - - Future columns in raw data use "_8" suffix for Pref/Pers items - - Future Val columns use "ValuesDGEN" spelling in Set A, "ValDGEN" in Set B - - -================================================================================ -SCRIPT 04: dataP 04 - DGEN means.r -================================================================================ - -PURPOSE: - Calculates mean DGEN scores by averaging the three domain scores (Preferences, - Personality, Values) for each time period. - -VARIABLES CREATED: 4 total (1 per time period) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - - DGEN_past_5_mean - - DGEN_past_10_mean - - DGEN_fut_5_mean - - DGEN_fut_10_mean - -TRANSFORMATION LOGIC: - Each mean = (Pref + Pers + Val) / 3 - - NA values are excluded from calculation (na.rm = TRUE) - - -================================================================================ -SCRIPT 05: dataP 05 - recode scales VARS.r -================================================================================ - -PURPOSE: - Processes two cognitive scales: - 1. AOT (Actively Open-minded Thinking): 8-item scale with reverse coding - 2. CRT (Cognitive Reflection Test): 3-item test with correct/intuitive scoring - -VARIABLES CREATED: 3 total - -SOURCE COLUMNS: - AOT Scale: - - aot_1, aot_2, aot_3, aot_4, aot_5, aot_6, aot_7, aot_8 - - CRT Test: - - crt_1, crt_2, crt_3 - -TARGET VARIABLES: - - aot_total (mean of 8 items with reverse coding) - - crt_correct (proportion of correct answers) - - crt_int (proportion of intuitive/incorrect answers) - -TRANSFORMATION LOGIC: - - AOT Scale (aot_total): - 1. Items 4, 5, 6, 7 are reverse coded by multiplying by -1 - 2. Calculate mean of all 8 items (with reverse coding applied) - 3. Original source values are NOT modified in the dataframe - 4. 
NA values excluded from calculation (na.rm = TRUE) - - CRT Correct (crt_correct): - Correct answers: - - crt_1: "5 cents" - - crt_2: "5 minutes" - - crt_3: "47 days" - Calculation: (Number of correct answers) / (Number of non-missing answers) - - CRT Intuitive (crt_int): - Intuitive (common incorrect) answers: - - crt_1: "10 cents" - - crt_2: "100 minutes" - - crt_3: "24 days" - Calculation: (Number of intuitive answers) / (Number of non-missing answers) - -SPECIAL NOTES: - - CRT scoring is case-insensitive and trims whitespace - - Both CRT scores are proportions (0.00 to 1.00) - - Empty/missing CRT responses are excluded from denominator - - -================================================================================ -SCRIPT 06: dataP 06 - time interval differences.r -================================================================================ - -PURPOSE: - Calculates absolute differences between time intervals to measure perceived - change across time periods for all 15 items. - -VARIABLES CREATED: 90 total (6 difference types × 15 items) - -SOURCE COLUMNS: - - present_pref_read through present_val_justice (15 columns) - - past_5_pref_read through past_5_val_justice (15 columns) - - past_10_pref_read through past_10_val_justice (15 columns) - - fut_5_pref_read through fut_5_val_justice (15 columns) - - fut_10_pref_read through fut_10_val_justice (15 columns) - -TARGET VARIABLES (by difference type): - - NPast_5 (Present vs Past 5 years) - 15 variables: - Formula: |present - past_5| - - NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, NPast_5_pref_nap, - NPast_5_pref_travel - - NPast_5_pers_extravert, NPast_5_pers_critical, NPast_5_pers_dependable, - NPast_5_pers_anxious, NPast_5_pers_complex - - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice - - NPast_10 (Present vs Past 10 years) - 15 variables: - Formula: |present - past_10| - - NPast_10_pref_read, NPast_10_pref_music, NPast_10_pref_TV, - NPast_10_pref_nap, NPast_10_pref_travel - - NPast_10_pers_extravert, NPast_10_pers_critical, NPast_10_pers_dependable, - NPast_10_pers_anxious, NPast_10_pers_complex - - NPast_10_val_obey, NPast_10_val_trad, NPast_10_val_opinion, - NPast_10_val_performance, NPast_10_val_justice - - NFut_5 (Present vs Future 5 years) - 15 variables: - Formula: |present - fut_5| - - NFut_5_pref_read, NFut_5_pref_music, NFut_5_pref_TV, NFut_5_pref_nap, - NFut_5_pref_travel - - NFut_5_pers_extravert, NFut_5_pers_critical, NFut_5_pers_dependable, - NFut_5_pers_anxious, NFut_5_pers_complex - - NFut_5_val_obey, NFut_5_val_trad, NFut_5_val_opinion, - NFut_5_val_performance, NFut_5_val_justice - - NFut_10 (Present vs Future 10 years) - 15 variables: - Formula: |present - fut_10| - - NFut_10_pref_read, NFut_10_pref_music, NFut_10_pref_TV, NFut_10_pref_nap, - NFut_10_pref_travel - - NFut_10_pers_extravert, NFut_10_pers_critical, NFut_10_pers_dependable, - NFut_10_pers_anxious, NFut_10_pers_complex - - NFut_10_val_obey, NFut_10_val_trad, NFut_10_val_opinion, - NFut_10_val_performance, NFut_10_val_justice - - 5.10past (Past 5 vs Past 10 years) - 15 variables: - Formula: |past_5 - past_10| - - 5.10past_pref_read, 5.10past_pref_music, 5.10past_pref_TV, - 5.10past_pref_nap, 5.10past_pref_travel - - 5.10past_pers_extravert, 5.10past_pers_critical, 5.10past_pers_dependable, - 5.10past_pers_anxious, 5.10past_pers_complex - - 5.10past_val_obey, 5.10past_val_trad, 5.10past_val_opinion, - 5.10past_val_performance, 5.10past_val_justice - - 5.10fut (Future 5 vs Future 
10 years) - 15 variables: - Formula: |fut_5 - fut_10| - - 5.10fut_pref_read, 5.10fut_pref_music, 5.10fut_pref_TV, 5.10fut_pref_nap, - 5.10fut_pref_travel - - 5.10fut_pers_extravert, 5.10fut_pers_critical, 5.10fut_pers_dependable, - 5.10fut_pers_anxious, 5.10fut_pers_complex - - 5.10fut_val_obey, 5.10fut_val_trad, 5.10fut_val_opinion, - 5.10fut_val_performance, 5.10fut_val_justice - -TRANSFORMATION LOGIC: - All calculations use absolute differences: - - NPast_5: |present_[item] - past_5_[item]| - - NPast_10: |present_[item] - past_10_[item]| - - NFut_5: |present_[item] - fut_5_[item]| - - NFut_10: |present_[item] - fut_10_[item]| - - 5.10past: |past_5_[item] - past_10_[item]| - - 5.10fut: |fut_5_[item] - fut_10_[item]| - - Result: Non-negative values representing the magnitude of change - Missing values in either source column result in NA - -SPECIAL NOTES: - - Present time uses "pref_tv" (lowercase) while past/future use "pref_TV" - (uppercase), so the script handles this naming inconsistency - - All values are absolute differences (non-negative) - -
-================================================================================ -SCRIPT 07: dataP 07 - domain means.r -================================================================================ - -PURPOSE: - Calculates domain-level means by averaging the 5 items within each domain - (Preferences, Personality, Values) for each of the 6 time interval difference - types. - -VARIABLES CREATED: 18 total (6 time intervals × 3 domains) - -SOURCE COLUMNS: - - NPast_5_pref_read through NPast_5_val_justice (15 columns) - - NPast_10_pref_read through NPast_10_val_justice (15 columns) - - NFut_5_pref_read through NFut_5_val_justice (15 columns) - - NFut_10_pref_read through NFut_10_val_justice (15 columns) - - 5.10past_pref_read through 5.10past_val_justice (15 columns) - - 5.10fut_pref_read through 5.10fut_val_justice (15 columns) - Total: 90 difference columns (created in Script 06) - -TARGET VARIABLES: - NPast_5 Domain Means (3 variables): - - NPast_5_pref_MEAN (mean of 5 preference items) - - NPast_5_pers_MEAN (mean of 5 personality items) - - NPast_5_val_MEAN (mean of 5 values items) - - NPast_10 Domain Means (3 variables): - - NPast_10_pref_MEAN - - NPast_10_pers_MEAN - - NPast_10_val_MEAN - - NFut_5 Domain Means (3 variables): - - NFut_5_pref_MEAN - - NFut_5_pers_MEAN - - NFut_5_val_MEAN - - NFut_10 Domain Means (3 variables): - - NFut_10_pref_MEAN - - NFut_10_pers_MEAN - - NFut_10_val_MEAN - - 5.10past Domain Means (3 variables): - - 5.10past_pref_MEAN - - 5.10past_pers_MEAN - - 5.10past_val_MEAN - - 5.10fut Domain Means (3 variables): - - 5.10fut_pref_MEAN - - 5.10fut_pers_MEAN - - 5.10fut_val_MEAN - -TRANSFORMATION LOGIC: - Each domain mean = average of 5 items within that domain - - Example for NPast_5_pref_MEAN: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel) - - Example for NFut_10_pers_MEAN: - = mean(NFut_10_pers_extravert, NFut_10_pers_critical, - NFut_10_pers_dependable, NFut_10_pers_anxious, - NFut_10_pers_complex) - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF DOMAIN MEANS: - - Provides higher-level summary of perceived change by domain - - Reduces item-level noise by aggregating across related items - - Enables domain-level comparisons across time intervals - - Parallel to Script 04 (DGEN means) but for difference scores instead of - raw DGEN ratings - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - Creates domain-level aggregates of absolute difference scores - - All means are averages of non-negative values (absolute differences) - -
-================================================================================ -SCRIPT 08: dataP 08 - DGEN 510 vars.r -================================================================================ - -PURPOSE: - Calculates absolute differences between 5-year and 10-year DGEN ratings for - both Past and Future time directions. These variables measure the perceived - difference in domain-general change between the two time intervals. - -VARIABLES CREATED: 6 total (3 domains × 2 time directions) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - Total: 12 DGEN columns (created in Script 03) - -TARGET VARIABLES: - Past Direction (3 variables): - - X5_10DGEN_past_pref (|DGEN_past_5_Pref - DGEN_past_10_Pref|) - - X5_10DGEN_past_pers (|DGEN_past_5_Pers - DGEN_past_10_Pers|) - - X5_10DGEN_past_val (|DGEN_past_5_Val - DGEN_past_10_Val|) - - Future Direction (3 variables): - - X5_10DGEN_fut_pref (|DGEN_fut_5_Pref - DGEN_fut_10_Pref|) - - X5_10DGEN_fut_pers (|DGEN_fut_5_Pers - DGEN_fut_10_Pers|) - - X5_10DGEN_fut_val (|DGEN_fut_5_Val - DGEN_fut_10_Val|) - -TRANSFORMATION LOGIC: - Formula: |DGEN_5 - DGEN_10| - - All calculations use absolute differences: - - Past Preferences: |DGEN_past_5_Pref - DGEN_past_10_Pref| - - Past Personality: |DGEN_past_5_Pers - DGEN_past_10_Pers| - - Past Values: |DGEN_past_5_Val - DGEN_past_10_Val| - - Future Preferences: |DGEN_fut_5_Pref - DGEN_fut_10_Pref| - - Future Personality: |DGEN_fut_5_Pers - DGEN_fut_10_Pers| - - Future Values: |DGEN_fut_5_Val - DGEN_fut_10_Val| - - Result: Non-negative values representing the magnitude of difference - Missing values in either source column result in NA - -SPECIAL NOTES: - - Variable names use "X" prefix because R automatically adds it to column - names starting with numbers (5_10 becomes X5_10) - - This script depends on Script 03 being run first - - Measures interval effects within time direction (past vs future) - - Parallel to Script 06's 5.10past and 5.10fut variables but for DGEN scores - -
-================================================================================ -SCRIPT 09: dataP 09 - interval x direction means.r -================================================================================ - -PURPOSE: - Calculates comprehensive mean scores by averaging item-level differences - across intervals and directions. Creates both narrow-scope means (single - time interval) and broad-scope global means (combining multiple intervals).
- -VARIABLES CREATED: 11 total (6 narrow-scope + 5 global-scope) - -SOURCE COLUMNS: - All 90 difference variables created in Script 06: - - NPast_5_[domain]_[item] (15 variables) - - NPast_10_[domain]_[item] (15 variables) - - NFut_5_[domain]_[item] (15 variables) - - NFut_10_[domain]_[item] (15 variables) - - X5.10past_[domain]_[item] (15 variables) - - X5.10fut_[domain]_[item] (15 variables) - -TARGET VARIABLES: - - Narrow-Scope Means (15 source items each): - - NPast_5_mean (mean across all 15 NPast_5 items) - - NPast_10_mean (mean across all 15 NPast_10 items) - - NFut_5_mean (mean across all 15 NFut_5 items) - - NFut_10_mean (mean across all 15 NFut_10 items) - - X5.10past_mean (mean across all 15 X5.10past items) - - X5.10fut_mean (mean across all 15 X5.10fut items) - - Global-Scope Means (30 source items each): - - NPast_global_mean (NPast_5 + NPast_10: all past intervals) - - NFut_global_mean (NFut_5 + NFut_10: all future intervals) - - X5.10_global_mean (X5.10past + X5.10fut: all 5-vs-10 intervals) - - N5_global_mean (NPast_5 + NFut_5: all 5-year intervals) - - N10_global_mean (NPast_10 + NFut_10: all 10-year intervals) - -TRANSFORMATION LOGIC: - - Narrow-Scope Means (15 items each): - Each mean averages all 15 difference items within one time interval - - Example for NPast_5_mean: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel, - NPast_5_pers_extravert, NPast_5_pers_critical, - NPast_5_pers_dependable, NPast_5_pers_anxious, - NPast_5_pers_complex, - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice) - - Global-Scope Means (30 items each): - Each mean averages 30 difference items across two related intervals - - Example for NPast_global_mean: - = mean(all 15 NPast_5 items + all 15 NPast_10 items) - Represents overall perceived change from present to any past timepoint - - Example for N5_global_mean: - = mean(all 15 NPast_5 items + all 15 NFut_5 items) - Represents overall perceived change at 5-year interval regardless of - direction - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF INTERVAL × DIRECTION MEANS: - - Narrow-scope means: Single-interval summaries across all domains and items - - Global-scope means: Cross-interval summaries for testing: - * Direction effects (past vs future) - * Interval effects (5-year vs 10-year) - * Combined temporal distance effects - - Enables comprehensive analysis of temporal self-perception patterns - - Reduces item-level and domain-level noise through broad aggregation - -QUALITY ASSURANCE: - - Script includes automated QA checks for first 5 rows - - Manually recalculates each mean and verifies against stored values - - Prints TRUE/FALSE match status for each variable - - Ensures calculation accuracy before further analysis - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - All means are averages of absolute difference scores (non-negative) - - Global means provide the broadest temporal self-perception summaries - - Naming convention uses "global" for 30-item means, no suffix for 15-item - - -================================================================================ -SCRIPT 10: dataP 10 - DGEN mean vars.r -================================================================================ - -PURPOSE: - Calculates mean DGEN scores by averaging across different time combinations. - Creates means for Past, Future, and interval-based (5-year, 10-year) groupings. 
- -VARIABLES CREATED: 6 total - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - Direction-Based Means (2 variables): - - DGEN_past_mean (mean of past_5_mean and past_10_mean) - - DGEN_fut_mean (mean of fut_5_mean and fut_10_mean) - - Interval-Based Means (2 variables): - - DGEN_5_mean (mean of past_5_mean and fut_5_mean) - - DGEN_10_mean (mean of past_10_mean and fut_10_mean) - - Domain-Based Means (2 variables): - - DGEN_pref_mean (mean across all 4 time periods for Preferences) - - DGEN_pers_mean (mean across all 4 time periods for Personality) - -TRANSFORMATION LOGIC: - Direction-based: - - DGEN_past_mean = mean(DGEN_past_5_mean, DGEN_past_10_mean) - - DGEN_fut_mean = mean(DGEN_fut_5_mean, DGEN_fut_10_mean) - - Interval-based: - - DGEN_5_mean = mean(DGEN_past_5_mean, DGEN_fut_5_mean) - - DGEN_10_mean = mean(DGEN_past_10_mean, DGEN_fut_10_mean) - - Domain-based: - - DGEN_pref_mean = mean across all 4 Pref scores - - DGEN_pers_mean = mean across all 4 Pers scores - - NA values excluded from calculation (na.rm = TRUE) - - -================================================================================ -SCRIPT 11: dataP 11 - CORRECT ehi vars.r -================================================================================ - -PURPOSE: - Creates Enduring Hedonic Impact (EHI) variables by calculating differences - between Past and Future responses for each item across different time intervals. - Formula: NPast - NFut (positive values indicate greater past-present change) - -VARIABLES CREATED: 45 total (15 items × 3 time intervals) - -SOURCE COLUMNS: - 5-year intervals: - - NPast_5_pref_read through NPast_5_val_justice (15 columns) - - NFut_5_pref_read through NFut_5_val_justice (15 columns) - - 10-year intervals: - - NPast_10_pref_read through NPast_10_val_justice (15 columns) - - NFut_10_pref_read through NFut_10_val_justice (15 columns) - - 5-10 year change: - - X5.10past_pref_read through X5.10past_val_justice (15 columns) - - X5.10fut_pref_read through X5.10fut_val_justice (15 columns) - -TARGET VARIABLES: - 5-Year EHI Variables (15 variables): - - ehi5_pref_read, ehi5_pref_music, ehi5_pref_TV, ehi5_pref_nap, - ehi5_pref_travel - - ehi5_pers_extravert, ehi5_pers_critical, ehi5_pers_dependable, - ehi5_pers_anxious, ehi5_pers_complex - - ehi5_val_obey, ehi5_val_trad, ehi5_val_opinion, ehi5_val_performance, - ehi5_val_justice - - 10-Year EHI Variables (15 variables): - - ehi10_pref_read, ehi10_pref_music, ehi10_pref_TV, ehi10_pref_nap, - ehi10_pref_travel - - ehi10_pers_extravert, ehi10_pers_critical, ehi10_pers_dependable, - ehi10_pers_anxious, ehi10_pers_complex - - ehi10_val_obey, ehi10_val_trad, ehi10_val_opinion, ehi10_val_performance, - ehi10_val_justice - - 5-10 Year Change EHI Variables (15 variables): - - ehi5.10_pref_read, ehi5.10_pref_music, ehi5.10_pref_TV, ehi5.10_pref_nap, - ehi5.10_pref_travel - - ehi5.10_pers_extravert, ehi5.10_pers_critical, ehi5.10_pers_dependable, - ehi5.10_pers_anxious, ehi5.10_pers_complex - - ehi5.10_val_obey, ehi5.10_val_trad, ehi5.10_val_opinion, - ehi5.10_val_performance, ehi5.10_val_justice - -TRANSFORMATION LOGIC: - Formula: NPast - NFut - - All calculations use signed differences: - - ehi5_[item] = NPast_5_[item] - NFut_5_[item] - - ehi10_[item] = NPast_10_[item] - NFut_10_[item] - - ehi5.10_[item] = X5.10past_[item] - 
X5.10fut_[item] - - Result: Positive = greater past change, Negative = greater future change - Missing values in either source column result in NA - -QUALITY ASSURANCE: - - Comprehensive QA checks for all 45 variables across all rows - - First 5 rows displayed with detailed calculations showing source values, - computed differences, and stored values - - Pass/Fail status for each variable reported - - -================================================================================ -SCRIPT 12: dataP 12 - CORRECT DGEN ehi vars.r -================================================================================ - -PURPOSE: - Creates domain-general EHI variables by calculating differences between Past - and Future DGEN responses. These are the domain-general parallel to Script 11's - domain-specific EHI variables. - -VARIABLES CREATED: 6 total (3 domains × 2 time intervals) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - 5-Year DGEN EHI (3 variables): - - ehiDGEN_5_Pref - - ehiDGEN_5_Pers - - ehiDGEN_5_Val - - 10-Year DGEN EHI (3 variables): - - ehiDGEN_10_Pref - - ehiDGEN_10_Pers - - ehiDGEN_10_Val - -TRANSFORMATION LOGIC: - Formula: DGEN_past - DGEN_fut - - All calculations use signed differences: - - ehiDGEN_5_Pref = DGEN_past_5_Pref - DGEN_fut_5_Pref - - ehiDGEN_5_Pers = DGEN_past_5_Pers - DGEN_fut_5_Pers - - ehiDGEN_5_Val = DGEN_past_5_Val - DGEN_fut_5_Val - - ehiDGEN_10_Pref = DGEN_past_10_Pref - DGEN_fut_10_Pref - - ehiDGEN_10_Pers = DGEN_past_10_Pers - DGEN_fut_10_Pers - - ehiDGEN_10_Val = DGEN_past_10_Val - DGEN_fut_10_Val - - Result: Positive = greater past change, Negative = greater future change - -QUALITY ASSURANCE: - - QA checks for all 6 variables across all rows - - First 5 rows displayed with detailed calculations - - Pass/Fail status for each variable reported - - -================================================================================ -SCRIPT 13: datap 13 - ehi domain specific means.r -================================================================================ - -PURPOSE: - Calculates domain-level mean EHI scores by averaging the 5 items within each - domain (Preferences, Personality, Values) for each time interval. 
- -VARIABLES CREATED: 9 total (3 domains × 3 time intervals) - -SOURCE COLUMNS: - - ehi5_pref_read through ehi5_val_justice (15 columns) - - ehi10_pref_read through ehi10_val_justice (15 columns) - - ehi5.10_pref_read through ehi5.10_val_justice (15 columns) - -TARGET VARIABLES: - 5-Year Domain Means (3 variables): - - ehi5_pref_MEAN (mean of 5 preference items) - - ehi5_pers_MEAN (mean of 5 personality items) - - ehi5_val_MEAN (mean of 5 values items) - - 10-Year Domain Means (3 variables): - - ehi10_pref_MEAN - - ehi10_pers_MEAN - - ehi10_val_MEAN - - 5-10 Year Change Domain Means (3 variables): - - ehi5.10_pref_MEAN - - ehi5.10_pers_MEAN - - ehi5.10_val_MEAN - -TRANSFORMATION LOGIC: - Each domain mean = average of 5 items within that domain - - Example for ehi5_pref_MEAN: - = mean(ehi5_pref_read, ehi5_pref_music, ehi5_pref_TV, - ehi5_pref_nap, ehi5_pref_travel) - - NA values excluded from calculation (na.rm = TRUE) - -QUALITY ASSURANCE: - - Comprehensive QA for all 9 variables across all rows - - First 5 rows displayed for multiple domain means - - Pass/Fail status for each variable - -
-================================================================================ -SCRIPT 14: datap 14 - all ehi global means.r -================================================================================ - -PURPOSE: - Calculates global EHI means by averaging domain-level means. Creates the - highest-level summary scores for EHI across both domain-general and - domain-specific measures. - -VARIABLES CREATED: 5 total - -SOURCE COLUMNS: - - ehiDGEN_5_Pref, ehiDGEN_5_Pers, ehiDGEN_5_Val - - ehiDGEN_10_Pref, ehiDGEN_10_Pers, ehiDGEN_10_Val - - ehi5_pref_MEAN, ehi5_pers_MEAN, ehi5_val_MEAN - - ehi10_pref_MEAN, ehi10_pers_MEAN, ehi10_val_MEAN - - ehi5.10_pref_MEAN, ehi5.10_pers_MEAN, ehi5.10_val_MEAN - -TARGET VARIABLES: - DGEN Global Means (2 variables): - - ehiDGEN_5_mean (mean of 3 DGEN domains for 5-year) - - ehiDGEN_10_mean (mean of 3 DGEN domains for 10-year) - - Domain-Specific Global Means (3 variables): - - ehi5_global_mean (mean of 3 domain means for 5-year) - - ehi10_global_mean (mean of 3 domain means for 10-year) - - ehi5.10_global_mean (mean of 3 domain means for 5-10 change) - -TRANSFORMATION LOGIC: - Each global mean = average of 3 domain-level scores - - Example for ehiDGEN_5_mean: - = mean(ehiDGEN_5_Pref, ehiDGEN_5_Pers, ehiDGEN_5_Val) - - Example for ehi5_global_mean: - = mean(ehi5_pref_MEAN, ehi5_pers_MEAN, ehi5_val_MEAN) - - NA values excluded from calculation (na.rm = TRUE) - -QUALITY ASSURANCE: - - QA for all 5 global means across all rows - - First 5 rows displayed with detailed calculations - - Values shown with 5 decimal precision - - Pass/Fail status for each variable - -
-================================================================================ -SUMMARY OF ALL CREATED VARIABLES -================================================================================ - -Total Variables Created: 290 - -By Script: - - Script 01: 60 variables (past/future recoded items) - - Script 02: 15 variables (present recoded items) - - Script 03: 12 variables (DGEN domain scores) - - Script 04: 4 variables (DGEN time period means) - - Script 05: 3 variables (AOT & CRT scales) - - Script 06: 90 variables (time interval differences) - - Script 07: 18 variables (domain means for differences) - - Script 08: 6 variables (DGEN 5-vs-10 differences) - - Script 09: 11 variables (interval × direction means) - - Script 10: 6 variables (DGEN combined means) - - Script 11: 45 variables (domain-specific EHI scores) - - Script 12: 6 variables (DGEN EHI scores) - - Script 13: 9 variables (EHI domain means) - - Script 14: 5 variables (EHI global means) - -By Category: - - Time Period Items (75 total): - * Present: 15 items - * Past 5: 15 items - * Past 10: 15 items - * Future 5: 15 items - * Future 10: 15 items - - - DGEN Variables (28 total): - * Domain scores: 12 (3 domains × 4 time periods) - * Time period means: 4 (1 per time period) - * 5-vs-10 differences: 6 (3 domains × 2 directions) - * Combined means: 6 (past, future, interval-based, domain-based) - - - Cognitive Scales (3 total): - * AOT total - * CRT correct - * CRT intuitive - - - Time Differences (90 total): - * NPast_5: 15 differences - * NPast_10: 15 differences - * NFut_5: 15 differences - * NFut_10: 15 differences - * 5.10past: 15 differences - * 5.10fut: 15 differences - - - Domain Means for Differences (18 total): - * NPast_5: 3 domain means - * NPast_10: 3 domain means - * NFut_5: 3 domain means - * NFut_10: 3 domain means - * 5.10past: 3 domain means - * 5.10fut: 3 domain means - - - Interval × Direction Means (11 total): - * Narrow-scope means: 6 (NPast_5, NPast_10, NFut_5, NFut_10, - X5.10past, X5.10fut) - * Global-scope means: 5 (NPast_global, NFut_global, X5.10_global, - N5_global, N10_global) - - - EHI Variables (65 total): - * Domain-specific EHI: 45 (15 items × 3 time intervals) - * DGEN EHI: 6 (3 domains × 2 time intervals) - * Domain means: 9 (3 domains × 3 time intervals) - * Global means: 5 (2 DGEN + 3 domain-specific) - -
-================================================================================ -DATA PROCESSING NOTES -================================================================================ - -1. PROCESSING ORDER: - Scripts MUST be run in numerical order (01 → 14) as later scripts depend - on variables created by earlier scripts. - - Key Dependencies: - - Script 03 required before Scripts 04, 08, 10, and 12 (DGEN scores) - - Script 04 required before Script 10 (DGEN time period means) - - Script 06 required before Scripts 07, 09, and 11 (time interval differences) - - Script 11 required before Script 13 (domain-specific EHI items) - - Script 12 required before Script 14 (DGEN EHI scores) - - Script 13 required before Script 14 (EHI domain means) - -2. SURVEY VERSION HANDLING: - - Two survey versions (01 and 02) were used - - Scripts 01 and 03 combine these versions - - Preference given to version 01 when both exist - -3. MISSING DATA: - - Empty cells and NA values are preserved throughout processing - - Calculations use na.rm=TRUE to exclude missing values from means - - Difference calculations result in NA if either source value is missing - -4. QUALITY ASSURANCE: - - Each script includes QA checks with row verification - - Manual calculation checks confirm proper transformations - - Column existence checks prevent errors from missing source data - - Scripts 09-14 include comprehensive QA with first 5 rows displayed - - All EHI scripts (11-14) verify calculations against stored values - - Pass/Fail status reported for all variables in QA-enabled scripts - -5. FILE SAVING: - - Most scripts save directly to eohi2.csv - - Scripts 04, 06, and 07 have commented-out write commands for review - - Scripts 08 and 09 save directly to eohi2.csv - - Each script overwrites existing target columns if present - -6.
SPECIAL NAMING CONVENTIONS: - - "pref_tv" vs "pref_TV" inconsistency maintained from source data - - DGEN variables use underscores (DGEN_past_5_Pref) - - Difference variables use descriptive prefixes (NPast_5_, 5.10past_) - - "X" prefix added to variables starting with numbers (X5.10past_mean) - - Global means use "_global_" to distinguish from narrow-scope means - - -================================================================================ -ITEM REFERENCE GUIDE -================================================================================ - -15 Core Items (Used across all time periods): - -PREFERENCES (5 items): - 1. pref_read - Reading preferences - 2. pref_music - Music preferences - 3. pref_TV/tv - TV watching preferences (note case variation) - 4. pref_nap - Napping preferences - 5. pref_travel - Travel preferences - -PERSONALITY (5 items): - 6. pers_extravert - Extraverted personality - 7. pers_critical - Critical thinking personality - 8. pers_dependable - Dependable personality - 9. pers_anxious - Anxious personality - 10. pers_complex - Complex personality - -VALUES (5 items): - 11. val_obey - Value of obedience - 12. val_trad - Value of tradition - 13. val_opinion - Value of expressing opinions - 14. val_performance - Value of performance - 15. val_justice - Value of justice - - -================================================================================ -EHI CONCEPT AND INTERPRETATION -================================================================================ - -ENDURING HEDONIC IMPACT (EHI): - EHI measures the asymmetry between perceived past and future change in - psychological attributes. The concept is based on the premise that people - may perceive their past and future selves differently, even when considering - equivalent time distances. 
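-
-  For example, if a participant's NPast_5 score on an item is 2 and the
-  matching NFut_5 score is 1, that item's EHI is 2 - 1 = 1, a past asymmetry.
-  A coarse classification could be sketched as below (the +/-0.5 band and the
-  ehi5_asymmetry name are illustrative assumptions, not part of the pipeline;
-  the labels follow the interpretation guide further down):
-
-    # Bin the 5-year global EHI mean into the three asymmetry labels;
-    # the 0.5 cut-points are arbitrary and chosen only for illustration
-    df$ehi5_asymmetry <- cut(df$ehi5_global_mean,
-                             breaks = c(-Inf, -0.5, 0.5, Inf),
-                             labels = c("future asymmetry",
-                                        "temporal symmetry",
-                                        "past asymmetry"))
-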
- -KEY EHI VARIABLES: - - Domain-Specific EHI (Scripts 11, 13, 14): - Calculated from item-level differences between past and future responses - Formula: NPast - NFut - * Positive values: Greater perceived change from past to present - * Negative values: Greater perceived change from present to future - * Zero: Symmetric perception of past and future change - - - Domain-General EHI (Scripts 12, 14): - Calculated from DGEN single-item responses - Formula: DGEN_past - DGEN_fut - * Measures broader temporal self-perception without item-level detail - -HIERARCHICAL STRUCTURE: - Level 1: Item-level EHI (45 domain-specific, 6 DGEN) - Level 2: Domain means (9 domain-specific, combining 5 items each) - Level 3: Global means (5 highest-level summaries) - -INTERPRETATION: - - EHI > 0: "Past asymmetry" - Person perceives greater change from past - - EHI < 0: "Future asymmetry" - Person perceives greater change to future - - EHI ≈ 0: "Temporal symmetry" - Balanced perception of past/future change - -
-================================================================================ -END OF DOCUMENTATION -================================================================================ -Last Updated: October 8, 2025 -Processing Pipeline: Scripts 01-14 -
diff --git a/.history/eohi2/README_Variable_Creation_20251008171604.txt b/.history/eohi2/README_Variable_Creation_20251008171604.txt deleted file mode 100644 index da6d9d1..0000000 --- a/.history/eohi2/README_Variable_Creation_20251008171604.txt +++ /dev/null @@ -1,971 +0,0 @@ -================================================================================ -EOHI2 DATA PROCESSING PIPELINE - VARIABLE CREATION DOCUMENTATION -================================================================================ - -This README documents the complete data processing pipeline for eohi2.csv. -All processing scripts should be run in the order listed below. - -Source File: eohi2.csv -Processing Scripts: dataP 01 through dataP 14 - -================================================================================ -SCRIPT 01: dataP 01 - recode and combine past & future vars.r -================================================================================ - -PURPOSE: - Combines responses from two survey versions (01 and 02) and recodes Likert - scale text responses to numeric values for past and future time periods.
- -VARIABLES CREATED: 60 total (15 items × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefItem_1 through 01fut10ValItem_5 (60 columns) - - Set B: 02past5PrefItem_1 through 02fut10ValItem_5 (60 columns) - -TARGET VARIABLES: - Past 5 Years (15 variables): - - past_5_pref_read, past_5_pref_music, past_5_pref_TV, past_5_pref_nap, - past_5_pref_travel - - past_5_pers_extravert, past_5_pers_critical, past_5_pers_dependable, - past_5_pers_anxious, past_5_pers_complex - - past_5_val_obey, past_5_val_trad, past_5_val_opinion, - past_5_val_performance, past_5_val_justice - - Past 10 Years (15 variables): - - past_10_pref_read, past_10_pref_music, past_10_pref_TV, past_10_pref_nap, - past_10_pref_travel - - past_10_pers_extravert, past_10_pers_critical, past_10_pers_dependable, - past_10_pers_anxious, past_10_pers_complex - - past_10_val_obey, past_10_val_trad, past_10_val_opinion, - past_10_val_performance, past_10_val_justice - - Future 5 Years (15 variables): - - fut_5_pref_read, fut_5_pref_music, fut_5_pref_TV, fut_5_pref_nap, - fut_5_pref_travel - - fut_5_pers_extravert, fut_5_pers_critical, fut_5_pers_dependable, - fut_5_pers_anxious, fut_5_pers_complex - - fut_5_val_obey, fut_5_val_trad, fut_5_val_opinion, - fut_5_val_performance, fut_5_val_justice - - Future 10 Years (15 variables): - - fut_10_pref_read, fut_10_pref_music, fut_10_pref_TV, fut_10_pref_nap, - fut_10_pref_travel - - fut_10_pers_extravert, fut_10_pers_critical, fut_10_pers_dependable, - fut_10_pers_anxious, fut_10_pers_complex - - fut_10_val_obey, fut_10_val_trad, fut_10_val_opinion, - fut_10_val_performance, fut_10_val_justice - -TRANSFORMATION LOGIC: - Step 1: Combine responses from Set A (01) and Set B (02) - - If Set A has a value, use Set A - - If Set A is empty, use Set B - - Step 2: Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -ITEM DOMAINS: - - Preferences (pref): Reading, Music, TV, Nap, Travel - - Personality (pers): Extravert, Critical, Dependable, Anxious, Complex - - Values (val): Obey, Tradition, Opinion, Performance, Justice - - -================================================================================ -SCRIPT 02: dataP 02 - recode present VARS.r -================================================================================ - -PURPOSE: - Recodes present-time Likert scale text responses to numeric values. - -VARIABLES CREATED: 15 total - -SOURCE COLUMNS: - - prePrefItem_1 through prePrefItem_5 (5 columns) - - prePersItem_1 through prePersItem_5 (5 columns) - - preValItem_1 through preValItem_5 (5 columns) - -TARGET VARIABLES: - Present Time (15 variables): - - present_pref_read, present_pref_music, present_pref_tv, present_pref_nap, - present_pref_travel - - present_pers_extravert, present_pers_critical, present_pers_dependable, - present_pers_anxious, present_pers_complex - - present_val_obey, present_val_trad, present_val_opinion, - present_val_performance, present_val_justice - -TRANSFORMATION LOGIC: - Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -SPECIAL NOTE: - Present time uses "present_pref_tv" (lowercase) while past/future use - "past_5_pref_TV" (uppercase). 
This is intentional and preserved from the - original data structure. - - -================================================================================ -SCRIPT 03: dataP 03 - recode DGEN vars.r -================================================================================ - -PURPOSE: - Combines DGEN (domain general) responses from two survey versions (01 and 02). - These are single-item measures for each domain/time combination. - NO RECODING - just copies numeric values as-is. - -VARIABLES CREATED: 12 total (3 domains × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefDGEN_1, 01past5PersDGEN_1, 01past5ValDGEN_1, etc. - - Set B: 02past5PrefDGEN_1, 02past5PersDGEN_1, 02past5ValDGEN_1, etc. - -TARGET VARIABLES: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TRANSFORMATION LOGIC: - - If Set A (01) has a value, use Set A - - If Set A is empty, use Set B (02) - - NO RECODING: Values are copied directly as numeric - -SPECIAL NOTES: - - Future columns in raw data use "_8" suffix for Pref/Pers items - - Future Val columns use "ValuesDGEN" spelling in Set A, "ValDGEN" in Set B - - -================================================================================ -SCRIPT 04: dataP 04 - DGEN means.r -================================================================================ - -PURPOSE: - Calculates mean DGEN scores by averaging the three domain scores (Preferences, - Personality, Values) for each time period. - -VARIABLES CREATED: 4 total (1 per time period) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - - DGEN_past_5_mean - - DGEN_past_10_mean - - DGEN_fut_5_mean - - DGEN_fut_10_mean - -TRANSFORMATION LOGIC: - Each mean = (Pref + Pers + Val) / 3 - - NA values are excluded from calculation (na.rm = TRUE) - - -================================================================================ -SCRIPT 05: dataP 05 - recode scales VARS.r -================================================================================ - -PURPOSE: - Processes two cognitive scales: - 1. AOT (Actively Open-minded Thinking): 8-item scale with reverse coding - 2. CRT (Cognitive Reflection Test): 3-item test with correct/intuitive scoring - -VARIABLES CREATED: 3 total - -SOURCE COLUMNS: - AOT Scale: - - aot_1, aot_2, aot_3, aot_4, aot_5, aot_6, aot_7, aot_8 - - CRT Test: - - crt_1, crt_2, crt_3 - -TARGET VARIABLES: - - aot_total (mean of 8 items with reverse coding) - - crt_correct (proportion of correct answers) - - crt_int (proportion of intuitive/incorrect answers) - -TRANSFORMATION LOGIC: - - AOT Scale (aot_total): - 1. Items 4, 5, 6, 7 are reverse coded by multiplying by -1 - 2. Calculate mean of all 8 items (with reverse coding applied) - 3. Original source values are NOT modified in the dataframe - 4. 
NA values excluded from calculation (na.rm = TRUE) - - CRT Correct (crt_correct): - Correct answers: - - crt_1: "5 cents" - - crt_2: "5 minutes" - - crt_3: "47 days" - Calculation: (Number of correct answers) / (Number of non-missing answers) - - CRT Intuitive (crt_int): - Intuitive (common incorrect) answers: - - crt_1: "10 cents" - - crt_2: "100 minutes" - - crt_3: "24 days" - Calculation: (Number of intuitive answers) / (Number of non-missing answers) - -SPECIAL NOTES: - - CRT scoring is case-insensitive and trims whitespace - - Both CRT scores are proportions (0.00 to 1.00) - - Empty/missing CRT responses are excluded from denominator - - -================================================================================ -SCRIPT 06: dataP 06 - time interval differences.r -================================================================================ - -PURPOSE: - Calculates absolute differences between time intervals to measure perceived - change across time periods for all 15 items. - -VARIABLES CREATED: 90 total (6 difference types × 15 items) - -SOURCE COLUMNS: - - present_pref_read through present_val_justice (15 columns) - - past_5_pref_read through past_5_val_justice (15 columns) - - past_10_pref_read through past_10_val_justice (15 columns) - - fut_5_pref_read through fut_5_val_justice (15 columns) - - fut_10_pref_read through fut_10_val_justice (15 columns) - -TARGET VARIABLES (by difference type): - - NPast_5 (Present vs Past 5 years) - 15 variables: - Formula: |present - past_5| - - NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, NPast_5_pref_nap, - NPast_5_pref_travel - - NPast_5_pers_extravert, NPast_5_pers_critical, NPast_5_pers_dependable, - NPast_5_pers_anxious, NPast_5_pers_complex - - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice - - NPast_10 (Present vs Past 10 years) - 15 variables: - Formula: |present - past_10| - - NPast_10_pref_read, NPast_10_pref_music, NPast_10_pref_TV, - NPast_10_pref_nap, NPast_10_pref_travel - - NPast_10_pers_extravert, NPast_10_pers_critical, NPast_10_pers_dependable, - NPast_10_pers_anxious, NPast_10_pers_complex - - NPast_10_val_obey, NPast_10_val_trad, NPast_10_val_opinion, - NPast_10_val_performance, NPast_10_val_justice - - NFut_5 (Present vs Future 5 years) - 15 variables: - Formula: |present - fut_5| - - NFut_5_pref_read, NFut_5_pref_music, NFut_5_pref_TV, NFut_5_pref_nap, - NFut_5_pref_travel - - NFut_5_pers_extravert, NFut_5_pers_critical, NFut_5_pers_dependable, - NFut_5_pers_anxious, NFut_5_pers_complex - - NFut_5_val_obey, NFut_5_val_trad, NFut_5_val_opinion, - NFut_5_val_performance, NFut_5_val_justice - - NFut_10 (Present vs Future 10 years) - 15 variables: - Formula: |present - fut_10| - - NFut_10_pref_read, NFut_10_pref_music, NFut_10_pref_TV, NFut_10_pref_nap, - NFut_10_pref_travel - - NFut_10_pers_extravert, NFut_10_pers_critical, NFut_10_pers_dependable, - NFut_10_pers_anxious, NFut_10_pers_complex - - NFut_10_val_obey, NFut_10_val_trad, NFut_10_val_opinion, - NFut_10_val_performance, NFut_10_val_justice - - 5.10past (Past 5 vs Past 10 years) - 15 variables: - Formula: |past_5 - past_10| - - 5.10past_pref_read, 5.10past_pref_music, 5.10past_pref_TV, - 5.10past_pref_nap, 5.10past_pref_travel - - 5.10past_pers_extravert, 5.10past_pers_critical, 5.10past_pers_dependable, - 5.10past_pers_anxious, 5.10past_pers_complex - - 5.10past_val_obey, 5.10past_val_trad, 5.10past_val_opinion, - 5.10past_val_performance, 5.10past_val_justice - - 5.10fut (Future 5 vs Future 
10 years) - 15 variables: - Formula: |fut_5 - fut_10| - - 5.10fut_pref_read, 5.10fut_pref_music, 5.10fut_pref_TV, 5.10fut_pref_nap, - 5.10fut_pref_travel - - 5.10fut_pers_extravert, 5.10fut_pers_critical, 5.10fut_pers_dependable, - 5.10fut_pers_anxious, 5.10fut_pers_complex - - 5.10fut_val_obey, 5.10fut_val_trad, 5.10fut_val_opinion, - 5.10fut_val_performance, 5.10fut_val_justice - -TRANSFORMATION LOGIC: - All calculations use absolute differences: - - NPast_5: |present_[item] - past_5_[item]| - - NPast_10: |present_[item] - past_10_[item]| - - NFut_5: |present_[item] - fut_5_[item]| - - NFut_10: |present_[item] - fut_10_[item]| - - 5.10past: |past_5_[item] - past_10_[item]| - - 5.10fut: |fut_5_[item] - fut_10_[item]| - - Result: Always positive values representing magnitude of change - Missing values in either source column result in NA - -SPECIAL NOTES: - - Present time uses "pref_tv" (lowercase) while past/future use "pref_TV" - (uppercase), so script handles this naming inconsistency - - All values are absolute differences (non-negative) - - -================================================================================ -SCRIPT 07: dataP 07 - domain means.r -================================================================================ - -PURPOSE: - Calculates domain-level means by averaging the 5 items within each domain - (Preferences, Personality, Values) for each of the 6 time interval difference - types. - -VARIABLES CREATED: 18 total (6 time intervals × 3 domains) - -SOURCE COLUMNS: - - NPast_5_pref_read through NPast_5_val_justice (15 columns) - - NPast_10_pref_read through NPast_10_val_justice (15 columns) - - NFut_5_pref_read through NFut_5_val_justice (15 columns) - - NFut_10_pref_read through NFut_10_val_justice (15 columns) - - 5.10past_pref_read through 5.10past_val_justice (15 columns) - - 5.10fut_pref_read through 5.10fut_val_justice (15 columns) - Total: 90 difference columns (created in Script 06) - -TARGET VARIABLES: - NPast_5 Domain Means (3 variables): - - NPast_5_pref_MEAN (mean of 5 preference items) - - NPast_5_pers_MEAN (mean of 5 personality items) - - NPast_5_val_MEAN (mean of 5 values items) - - NPast_10 Domain Means (3 variables): - - NPast_10_pref_MEAN - - NPast_10_pers_MEAN - - NPast_10_val_MEAN - - NFut_5 Domain Means (3 variables): - - NFut_5_pref_MEAN - - NFut_5_pers_MEAN - - NFut_5_val_MEAN - - NFut_10 Domain Means (3 variables): - - NFut_10_pref_MEAN - - NFut_10_pers_MEAN - - NFut_10_val_MEAN - - 5.10past Domain Means (3 variables): - - 5.10past_pref_MEAN - - 5.10past_pers_MEAN - - 5.10past_val_MEAN - - 5.10fut Domain Means (3 variables): - - 5.10fut_pref_MEAN - - 5.10fut_pers_MEAN - - 5.10fut_val_MEAN - -TRANSFORMATION LOGIC: - Each domain mean = average of 5 items within that domain - - Example for NPast_5_pref_MEAN: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel) - - Example for NFut_10_pers_MEAN: - = mean(NFut_10_pers_extravert, NFut_10_pers_critical, - NFut_10_pers_dependable, NFut_10_pers_anxious, - NFut_10_pers_complex) - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF DOMAIN MEANS: - - Provides higher-level summary of perceived change by domain - - Reduces item-level noise by aggregating across related items - - Enables domain-level comparisons across time intervals - - Parallel to Script 04 (DGEN means) but for difference scores instead of - raw DGEN ratings - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - Creates domain-level aggregates 
of absolute difference scores - - All means are averages of non-negative values (absolute differences) - - -================================================================================ -SCRIPT 08: dataP 08 - DGEN 510 vars.r -================================================================================ - -PURPOSE: - Calculates absolute differences between 5-year and 10-year DGEN ratings for - both Past and Future time directions. These variables measure the perceived - difference in domain-general change between the two time intervals. - -VARIABLES CREATED: 6 total (3 domains × 2 time directions) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - Total: 12 DGEN columns (created in Script 03) - -TARGET VARIABLES: - Past Direction (3 variables): - - X5_10DGEN_past_pref (|DGEN_past_5_Pref - DGEN_past_10_Pref|) - - X5_10DGEN_past_pers (|DGEN_past_5_Pers - DGEN_past_10_Pers|) - - X5_10DGEN_past_val (|DGEN_past_5_Val - DGEN_past_10_Val|) - - Future Direction (3 variables): - - X5_10DGEN_fut_pref (|DGEN_fut_5_Pref - DGEN_fut_10_Pref|) - - X5_10DGEN_fut_pers (|DGEN_fut_5_Pers - DGEN_fut_10_Pers|) - - X5_10DGEN_fut_val (|DGEN_fut_5_Val - DGEN_fut_10_Val|) - -TRANSFORMATION LOGIC: - Formula: |DGEN_5 - DGEN_10| - - All calculations use absolute differences: - - Past Preferences: |DGEN_past_5_Pref - DGEN_past_10_Pref| - - Past Personality: |DGEN_past_5_Pers - DGEN_past_10_Pers| - - Past Values: |DGEN_past_5_Val - DGEN_past_10_Val| - - Future Preferences: |DGEN_fut_5_Pref - DGEN_fut_10_Pref| - - Future Personality: |DGEN_fut_5_Pers - DGEN_fut_10_Pers| - - Future Values: |DGEN_fut_5_Val - DGEN_fut_10_Val| - - Result: Always positive values representing magnitude of difference - Missing values in either source column result in NA - -SPECIAL NOTES: - - Variable names use "X" prefix because R automatically adds it to column - names starting with numbers (5_10 becomes X5_10) - - This script depends on Script 03 being run first - - Measures interval effects within time direction (past vs future) - - Parallel to Script 06's 5.10past and 5.10fut variables but for DGEN scores - - -================================================================================ -SCRIPT 09: dataP 09 - interval x direction means.r -================================================================================ - -PURPOSE: - Calculates comprehensive mean scores by averaging item-level differences - across intervals and directions. Creates both narrow-scope means (single - time interval) and broad-scope global means (combining multiple intervals). 
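-
-  ILLUSTRATIVE SKETCH (not the verbatim dataP 09 script):
-    One way the narrow- and global-scope means described above could be
-    written in R, assuming the Script 06 difference columns are in df; the
-    pick_items helper is hypothetical and deliberately drops the Script 07
-    *_MEAN aggregates so only the 15 item columns are pooled:
-
-      pick_items <- function(prefix) {
-        cols <- grep(paste0("^", prefix), names(df), value = TRUE)
-        cols[!grepl("MEAN|mean", cols)]    # keep only the item columns
-      }
-      npast5  <- pick_items("NPast_5_")
-      npast10 <- pick_items("NPast_10_")
-      df$NPast_5_mean      <- rowMeans(df[npast5], na.rm = TRUE)
-      # Global mean pools both past intervals (30 items total)
-      df$NPast_global_mean <- rowMeans(df[c(npast5, npast10)], na.rm = TRUE)
-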
- -VARIABLES CREATED: 11 total (6 narrow-scope + 5 global-scope) - -SOURCE COLUMNS: - All 90 difference variables created in Script 06: - - NPast_5_[domain]_[item] (15 variables) - - NPast_10_[domain]_[item] (15 variables) - - NFut_5_[domain]_[item] (15 variables) - - NFut_10_[domain]_[item] (15 variables) - - X5.10past_[domain]_[item] (15 variables) - - X5.10fut_[domain]_[item] (15 variables) - -TARGET VARIABLES: - - Narrow-Scope Means (15 source items each): - - NPast_5_mean (mean across all 15 NPast_5 items) - - NPast_10_mean (mean across all 15 NPast_10 items) - - NFut_5_mean (mean across all 15 NFut_5 items) - - NFut_10_mean (mean across all 15 NFut_10 items) - - X5.10past_mean (mean across all 15 X5.10past items) - - X5.10fut_mean (mean across all 15 X5.10fut items) - - Global-Scope Means (30 source items each): - - NPast_global_mean (NPast_5 + NPast_10: all past intervals) - - NFut_global_mean (NFut_5 + NFut_10: all future intervals) - - X5.10_global_mean (X5.10past + X5.10fut: all 5-vs-10 intervals) - - N5_global_mean (NPast_5 + NFut_5: all 5-year intervals) - - N10_global_mean (NPast_10 + NFut_10: all 10-year intervals) - -TRANSFORMATION LOGIC: - - Narrow-Scope Means (15 items each): - Each mean averages all 15 difference items within one time interval - - Example for NPast_5_mean: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel, - NPast_5_pers_extravert, NPast_5_pers_critical, - NPast_5_pers_dependable, NPast_5_pers_anxious, - NPast_5_pers_complex, - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice) - - Global-Scope Means (30 items each): - Each mean averages 30 difference items across two related intervals - - Example for NPast_global_mean: - = mean(all 15 NPast_5 items + all 15 NPast_10 items) - Represents overall perceived change from present to any past timepoint - - Example for N5_global_mean: - = mean(all 15 NPast_5 items + all 15 NFut_5 items) - Represents overall perceived change at 5-year interval regardless of - direction - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF INTERVAL × DIRECTION MEANS: - - Narrow-scope means: Single-interval summaries across all domains and items - - Global-scope means: Cross-interval summaries for testing: - * Direction effects (past vs future) - * Interval effects (5-year vs 10-year) - * Combined temporal distance effects - - Enables comprehensive analysis of temporal self-perception patterns - - Reduces item-level and domain-level noise through broad aggregation - -QUALITY ASSURANCE: - - Script includes automated QA checks for first 5 rows - - Manually recalculates each mean and verifies against stored values - - Prints TRUE/FALSE match status for each variable - - Ensures calculation accuracy before further analysis - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - All means are averages of absolute difference scores (non-negative) - - Global means provide the broadest temporal self-perception summaries - - Naming convention uses "global" for 30-item means, no suffix for 15-item - - -================================================================================ -SCRIPT 10: dataP 10 - DGEN mean vars.r -================================================================================ - -PURPOSE: - Calculates mean DGEN scores by averaging across different time combinations. - Creates means for Past, Future, and interval-based (5-year, 10-year) groupings. 
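-
-  ILLUSTRATIVE SKETCH (not the verbatim dataP 10 script):
-    A possible R phrasing of the combined DGEN means, assuming the Script 03
-    domain scores and Script 04 time-period means already exist in df:
-
-      # Direction-based: average the two past interval means
-      df$DGEN_past_mean <- rowMeans(
-        df[c("DGEN_past_5_mean", "DGEN_past_10_mean")], na.rm = TRUE)
-      # Interval-based: average the two 5-year means across directions
-      df$DGEN_5_mean <- rowMeans(
-        df[c("DGEN_past_5_mean", "DGEN_fut_5_mean")], na.rm = TRUE)
-      # Domain-based: average one domain's raw scores over all 4 periods
-      df$DGEN_pref_mean <- rowMeans(
-        df[c("DGEN_past_5_Pref", "DGEN_past_10_Pref",
-             "DGEN_fut_5_Pref", "DGEN_fut_10_Pref")], na.rm = TRUE)
-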
- -VARIABLES CREATED: 6 total - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - Direction-Based Means (2 variables): - - DGEN_past_mean (mean of past_5_mean and past_10_mean) - - DGEN_fut_mean (mean of fut_5_mean and fut_10_mean) - - Interval-Based Means (2 variables): - - DGEN_5_mean (mean of past_5_mean and fut_5_mean) - - DGEN_10_mean (mean of past_10_mean and fut_10_mean) - - Domain-Based Means (2 variables): - - DGEN_pref_mean (mean across all 4 time periods for Preferences) - - DGEN_pers_mean (mean across all 4 time periods for Personality) - -TRANSFORMATION LOGIC: - Direction-based: - - DGEN_past_mean = mean(DGEN_past_5_mean, DGEN_past_10_mean) - - DGEN_fut_mean = mean(DGEN_fut_5_mean, DGEN_fut_10_mean) - - Interval-based: - - DGEN_5_mean = mean(DGEN_past_5_mean, DGEN_fut_5_mean) - - DGEN_10_mean = mean(DGEN_past_10_mean, DGEN_fut_10_mean) - - Domain-based: - - DGEN_pref_mean = mean across all 4 Pref scores - - DGEN_pers_mean = mean across all 4 Pers scores - - NA values excluded from calculation (na.rm = TRUE) - - -================================================================================ -SCRIPT 11: dataP 11 - CORRECT ehi vars.r -================================================================================ - -PURPOSE: - Creates Enduring Hedonic Impact (EHI) variables by calculating differences - between Past and Future responses for each item across different time intervals. - Formula: NPast - NFut (positive values indicate greater past-present change) - -VARIABLES CREATED: 45 total (15 items × 3 time intervals) - -SOURCE COLUMNS: - 5-year intervals: - - NPast_5_pref_read through NPast_5_val_justice (15 columns) - - NFut_5_pref_read through NFut_5_val_justice (15 columns) - - 10-year intervals: - - NPast_10_pref_read through NPast_10_val_justice (15 columns) - - NFut_10_pref_read through NFut_10_val_justice (15 columns) - - 5-10 year change: - - X5.10past_pref_read through X5.10past_val_justice (15 columns) - - X5.10fut_pref_read through X5.10fut_val_justice (15 columns) - -TARGET VARIABLES: - 5-Year EHI Variables (15 variables): - - ehi5_pref_read, ehi5_pref_music, ehi5_pref_TV, ehi5_pref_nap, - ehi5_pref_travel - - ehi5_pers_extravert, ehi5_pers_critical, ehi5_pers_dependable, - ehi5_pers_anxious, ehi5_pers_complex - - ehi5_val_obey, ehi5_val_trad, ehi5_val_opinion, ehi5_val_performance, - ehi5_val_justice - - 10-Year EHI Variables (15 variables): - - ehi10_pref_read, ehi10_pref_music, ehi10_pref_TV, ehi10_pref_nap, - ehi10_pref_travel - - ehi10_pers_extravert, ehi10_pers_critical, ehi10_pers_dependable, - ehi10_pers_anxious, ehi10_pers_complex - - ehi10_val_obey, ehi10_val_trad, ehi10_val_opinion, ehi10_val_performance, - ehi10_val_justice - - 5-10 Year Change EHI Variables (15 variables): - - ehi5.10_pref_read, ehi5.10_pref_music, ehi5.10_pref_TV, ehi5.10_pref_nap, - ehi5.10_pref_travel - - ehi5.10_pers_extravert, ehi5.10_pers_critical, ehi5.10_pers_dependable, - ehi5.10_pers_anxious, ehi5.10_pers_complex - - ehi5.10_val_obey, ehi5.10_val_trad, ehi5.10_val_opinion, - ehi5.10_val_performance, ehi5.10_val_justice - -TRANSFORMATION LOGIC: - Formula: NPast - NFut - - All calculations use signed differences: - - ehi5_[item] = NPast_5_[item] - NFut_5_[item] - - ehi10_[item] = NPast_10_[item] - NFut_10_[item] - - ehi5.10_[item] = X5.10past_[item] - 
X5.10fut_[item] - - Result: Positive = greater past change, Negative = greater future change - Missing values in either source column result in NA - -QUALITY ASSURANCE: - - Comprehensive QA checks for all 45 variables across all rows - - First 5 rows displayed with detailed calculations showing source values, - computed differences, and stored values - - Pass/Fail status for each variable reported - - -================================================================================ -SCRIPT 12: dataP 12 - CORRECT DGEN ehi vars.r -================================================================================ - -PURPOSE: - Creates domain-general EHI variables by calculating differences between Past - and Future DGEN responses. These are the domain-general parallel to Script 11's - domain-specific EHI variables. - -VARIABLES CREATED: 6 total (3 domains × 2 time intervals) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - 5-Year DGEN EHI (3 variables): - - ehiDGEN_5_Pref - - ehiDGEN_5_Pers - - ehiDGEN_5_Val - - 10-Year DGEN EHI (3 variables): - - ehiDGEN_10_Pref - - ehiDGEN_10_Pers - - ehiDGEN_10_Val - -TRANSFORMATION LOGIC: - Formula: DGEN_past - DGEN_fut - - All calculations use signed differences: - - ehiDGEN_5_Pref = DGEN_past_5_Pref - DGEN_fut_5_Pref - - ehiDGEN_5_Pers = DGEN_past_5_Pers - DGEN_fut_5_Pers - - ehiDGEN_5_Val = DGEN_past_5_Val - DGEN_fut_5_Val - - ehiDGEN_10_Pref = DGEN_past_10_Pref - DGEN_fut_10_Pref - - ehiDGEN_10_Pers = DGEN_past_10_Pers - DGEN_fut_10_Pers - - ehiDGEN_10_Val = DGEN_past_10_Val - DGEN_fut_10_Val - - Result: Positive = greater past change, Negative = greater future change - -QUALITY ASSURANCE: - - QA checks for all 6 variables across all rows - - First 5 rows displayed with detailed calculations - - Pass/Fail status for each variable reported - - -================================================================================ -SCRIPT 13: datap 13 - ehi domain specific means.r -================================================================================ - -PURPOSE: - Calculates domain-level mean EHI scores by averaging the 5 items within each - domain (Preferences, Personality, Values) for each time interval. 
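-
-  ILLUSTRATIVE SKETCH (not the verbatim datap 13 script):
-    The domain-mean rule, written out in R for one domain under the
-    assumption that the Script 11 EHI item columns are already in df:
-
-      ehi5_pref_items <- c("ehi5_pref_read", "ehi5_pref_music",
-                           "ehi5_pref_TV", "ehi5_pref_nap",
-                           "ehi5_pref_travel")
-      df$ehi5_pref_MEAN <- rowMeans(df[ehi5_pref_items], na.rm = TRUE)
-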
- -VARIABLES CREATED: 9 total (3 domains × 3 time intervals) - -SOURCE COLUMNS: - - ehi5_pref_read through ehi5_val_justice (15 columns) - - ehi10_pref_read through ehi10_val_justice (15 columns) - - ehi5.10_pref_read through ehi5.10_val_justice (15 columns) - -TARGET VARIABLES: - 5-Year Domain Means (3 variables): - - ehi5_pref_MEAN (mean of 5 preference items) - - ehi5_pers_MEAN (mean of 5 personality items) - - ehi5_val_MEAN (mean of 5 values items) - - 10-Year Domain Means (3 variables): - - ehi10_pref_MEAN - - ehi10_pers_MEAN - - ehi10_val_MEAN - - 5-10 Year Change Domain Means (3 variables): - - ehi5.10_pref_MEAN - - ehi5.10_pers_MEAN - - ehi5.10_val_MEAN - -TRANSFORMATION LOGIC: - Each domain mean = average of 5 items within that domain - - Example for ehi5_pref_MEAN: - = mean(ehi5_pref_read, ehi5_pref_music, ehi5_pref_TV, - ehi5_pref_nap, ehi5_pref_travel) - - NA values excluded from calculation (na.rm = TRUE) - -QUALITY ASSURANCE: - - Comprehensive QA for all 9 variables across all rows - - First 5 rows displayed for multiple domain means - - Pass/Fail status for each variable - - -================================================================================ -SCRIPT 14: datap 14 - all ehi global means.r -================================================================================ - -PURPOSE: - Calculates global EHI means by averaging domain-level means. Creates the - highest-level summary scores for EHI across both domain-general and - domain-specific measures. - -VARIABLES CREATED: 5 total - -SOURCE COLUMNS: - - ehiDGEN_5_Pref, ehiDGEN_5_Pers, ehiDGEN_5_Val - - ehiDGEN_10_Pref, ehiDGEN_10_Pers, ehiDGEN_10_Val - - ehi5_pref_MEAN, ehi5_pers_MEAN, ehi5_val_MEAN - - ehi10_pref_MEAN, ehi10_pers_MEAN, ehi10_val_MEAN - - ehi5.10_pref_MEAN, ehi5.10_pers_MEAN, ehi5.10_val_MEAN - -TARGET VARIABLES: - DGEN Global Means (2 variables): - - ehiDGEN_5_mean (mean of 3 DGEN domains for 5-year) - - ehiDGEN_10_mean (mean of 3 DGEN domains for 10-year) - - Domain-Specific Global Means (3 variables): - - ehi5_global_mean (mean of 3 domain means for 5-year) - - ehi10_global_mean (mean of 3 domain means for 10-year) - - ehi5.10_global_mean (mean of 3 domain means for 5-10 change) - -TRANSFORMATION LOGIC: - Each global mean = average of 3 domain-level scores - - Example for ehiDGEN_5_mean: - = mean(ehiDGEN_5_Pref, ehiDGEN_5_Pers, ehiDGEN_5_Val) - - Example for ehi5_global_mean: - = mean(ehi5_pref_MEAN, ehi5_pers_MEAN, ehi5_val_MEAN) - - NA values excluded from calculation (na.rm = TRUE) - -QUALITY ASSURANCE: - - QA for all 5 global means across all rows - - First 5 rows displayed with detailed calculations - - Values shown with 5 decimal precision - - Pass/Fail status for each variable - - -================================================================================ -SUMMARY OF ALL CREATED VARIABLES -================================================================================ - -Total Variables Created: 290 - -By Script: - - Script 01: 60 variables (past/future recoded items) - - Script 02: 15 variables (present recoded items) - - Script 03: 12 variables (DGEN domain scores) - - Script 04: 4 variables (DGEN time period means) - - Script 05: 3 variables (AOT & CRT scales) - - Script 06: 90 variables (time interval differences) - - Script 07: 18 variables (domain means for differences) - - Script 08: 6 variables (DGEN 5-vs-10 differences) - - Script 09: 11 variables (interval × direction means) - - Script 10: 6 variables (DGEN combined means) - - Script 11: 45 variables (domain-specific EHI
scores) - - Script 12: 6 variables (DGEN EHI scores) - - Script 13: 9 variables (EHI domain means) - - Script 14: 5 variables (EHI global means) - -By Category: - - Time Period Items (75 total): - * Present: 15 items - * Past 5: 15 items - * Past 10: 15 items - * Future 5: 15 items - * Future 10: 15 items - - - DGEN Variables (28 total): - * Domain scores: 12 (3 domains × 4 time periods) - * Time period means: 4 (1 per time period) - * 5-vs-10 differences: 6 (3 domains × 2 directions) - * Combined means: 6 (past, future, interval-based, domain-based) - - - Cognitive Scales (3 total): - * AOT total - * CRT correct - * CRT intuitive - - - Time Differences (90 total): - * NPast_5: 15 differences - * NPast_10: 15 differences - * NFut_5: 15 differences - * NFut_10: 15 differences - * 5.10past: 15 differences - * 5.10fut: 15 differences - - - Domain Means for Differences (18 total): - * NPast_5: 3 domain means - * NPast_10: 3 domain means - * NFut_5: 3 domain means - * NFut_10: 3 domain means - * 5.10past: 3 domain means - * 5.10fut: 3 domain means - - - Interval × Direction Means (11 total): - * Narrow-scope means: 6 (NPast_5, NPast_10, NFut_5, NFut_10, - X5.10past, X5.10fut) - * Global-scope means: 5 (NPast_global, NFut_global, X5.10_global, - N5_global, N10_global) - - - EHI Variables (65 total): - * Domain-specific EHI: 45 (15 items × 3 time intervals) - * DGEN EHI: 6 (3 domains × 2 time intervals) - * Domain means: 9 (3 domains × 3 time intervals) - * Global means: 5 (2 DGEN + 3 domain-specific) - - -================================================================================ -DATA PROCESSING NOTES -================================================================================ - -1. PROCESSING ORDER: - Scripts MUST be run in numerical order (01 → 14) as later scripts depend - on variables created by earlier scripts. - - Key Dependencies: - - Script 03 required before Script 04, 08, 10, 12 (DGEN scores) - - Script 04 required before Script 10 (DGEN time period means) - - Script 06 required before Script 07, 09, 11 (time interval differences) - - Script 11 required before Script 13 (domain-specific EHI items) - - Script 12 required before Script 14 (DGEN EHI scores) - - Script 13 required before Script 14 (EHI domain means) - -2. SURVEY VERSION HANDLING: - - Two survey versions (01 and 02) were used - - Scripts 01 and 03 combine these versions - - Preference given to version 01 when both exist - -3. MISSING DATA: - - Empty cells and NA values are preserved throughout processing - - Calculations use na.rm=TRUE to exclude missing values from means - - Difference calculations result in NA if either source value is missing - -4. QUALITY ASSURANCE: - - Each script includes QA checks with row verification - - Manual calculation checks confirm proper transformations - - Column existence checks prevent errors from missing source data - - Scripts 09-14 include comprehensive QA with first 5 rows displayed - - All EHI scripts (11-14) verify calculations against stored values - - Pass/Fail status reported for all variables in QA-enabled scripts - -5. FILE SAVING: - - Most scripts save directly to eohi2.csv - - Scripts 04, 06, and 07 have commented-out write commands for review - - Scripts 08 and 09 save directly to eohi2.csv - - Each script overwrites existing target columns if present - -6.
SPECIAL NAMING CONVENTIONS: - - "pref_tv" vs "pref_TV" inconsistency maintained from source data - - DGEN variables use underscores (DGEN_past_5_Pref) - - Difference variables use descriptive prefixes (NPast_5_, 5.10past_) - - "X" prefix added to variables starting with numbers (X5.10past_mean) - - Global means use "_global_" to distinguish from narrow-scope means - - -================================================================================ -ITEM REFERENCE GUIDE -================================================================================ - -15 Core Items (Used across all time periods): - -PREFERENCES (5 items): - 1. pref_read - Reading preferences - 2. pref_music - Music preferences - 3. pref_TV/tv - TV watching preferences (note case variation) - 4. pref_nap - Napping preferences - 5. pref_travel - Travel preferences - -PERSONALITY (5 items): - 6. pers_extravert - Extraverted personality - 7. pers_critical - Critical thinking personality - 8. pers_dependable - Dependable personality - 9. pers_anxious - Anxious personality - 10. pers_complex - Complex personality - -VALUES (5 items): - 11. val_obey - Value of obedience - 12. val_trad - Value of tradition - 13. val_opinion - Value of expressing opinions - 14. val_performance - Value of performance - 15. val_justice - Value of justice - - -================================================================================ -EHI CONCEPT AND INTERPRETATION -================================================================================ - -ENDURING HEDONIC IMPACT (EHI): - EHI measures the asymmetry between perceived past and future change in - psychological attributes. The concept is based on the premise that people - may perceive their past and future selves differently, even when considering - equivalent time distances. 
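-
-  WORKED EXAMPLE (illustrative numbers, not study data):
-    If a respondent has NPast_5_pref_read = 4 and NFut_5_pref_read = 1,
-    then ehi5_pref_read = 4 - 1 = 3, a past asymmetry: more perceived
-    change looking back than looking ahead. A hedged R sketch of that
-    classification, with the ±0.5 cutoff chosen purely for illustration:
-
-      asymmetry <- cut(df$ehi5_global_mean,
-                       breaks = c(-Inf, -0.5, 0.5, Inf),
-                       labels = c("future asymmetry", "temporal symmetry",
-                                  "past asymmetry"))
-      table(asymmetry, useNA = "ifany")
-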
- -KEY EHI VARIABLES: - - Domain-Specific EHI (Scripts 11, 13, 14): - Calculated from item-level differences between past and future responses - Formula: NPast - NFut - * Positive values: Greater perceived change from past to present - * Negative values: Greater perceived change from present to future - * Zero: Symmetric perception of past and future change - - - Domain-General EHI (Scripts 12, 14): - Calculated from DGEN single-item responses - Formula: DGEN_past - DGEN_fut - * Measures broader temporal self-perception without item-level detail - -HIERARCHICAL STRUCTURE: - Level 1: Item-level EHI (45 domain-specific, 6 DGEN) - Level 2: Domain means (9 domain-specific, combining 5 items each) - Level 3: Global means (5 highest-level summaries) - -INTERPRETATION: - - EHI > 0: "Past asymmetry" - Person perceives greater change from past - - EHI < 0: "Future asymmetry" - Person perceives greater change to future - - EHI ≈ 0: "Temporal symmetry" - Balanced perception of past/future change - - -================================================================================ -END OF DOCUMENTATION -================================================================================ -Last Updated: October 8, 2025 -Processing Pipeline: Scripts 01-14 - diff --git a/.history/eohi2/README_Variable_Creation_20251008171626.txt b/.history/eohi2/README_Variable_Creation_20251008171626.txt deleted file mode 100644 index da6d9d1..0000000 --- a/.history/eohi2/README_Variable_Creation_20251008171626.txt +++ /dev/null @@ -1,971 +0,0 @@ -================================================================================ -EOHI2 DATA PROCESSING PIPELINE - VARIABLE CREATION DOCUMENTATION -================================================================================ - -This README documents the complete data processing pipeline for eohi2.csv. -All processing scripts should be run in the order listed below. - -Source File: eohi2.csv -Processing Scripts: dataP 01 through dataP 14 - -================================================================================ -SCRIPT 01: dataP 01 - recode and combine past & future vars.r -================================================================================ - -PURPOSE: - Combines responses from two survey versions (01 and 02) and recodes Likert - scale text responses to numeric values for past and future time periods.
- -VARIABLES CREATED: 60 total (15 items × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefItem_1 through 01fut10ValItem_5 (60 columns) - - Set B: 02past5PrefItem_1 through 02fut10ValItem_5 (60 columns) - -TARGET VARIABLES: - Past 5 Years (15 variables): - - past_5_pref_read, past_5_pref_music, past_5_pref_TV, past_5_pref_nap, - past_5_pref_travel - - past_5_pers_extravert, past_5_pers_critical, past_5_pers_dependable, - past_5_pers_anxious, past_5_pers_complex - - past_5_val_obey, past_5_val_trad, past_5_val_opinion, - past_5_val_performance, past_5_val_justice - - Past 10 Years (15 variables): - - past_10_pref_read, past_10_pref_music, past_10_pref_TV, past_10_pref_nap, - past_10_pref_travel - - past_10_pers_extravert, past_10_pers_critical, past_10_pers_dependable, - past_10_pers_anxious, past_10_pers_complex - - past_10_val_obey, past_10_val_trad, past_10_val_opinion, - past_10_val_performance, past_10_val_justice - - Future 5 Years (15 variables): - - fut_5_pref_read, fut_5_pref_music, fut_5_pref_TV, fut_5_pref_nap, - fut_5_pref_travel - - fut_5_pers_extravert, fut_5_pers_critical, fut_5_pers_dependable, - fut_5_pers_anxious, fut_5_pers_complex - - fut_5_val_obey, fut_5_val_trad, fut_5_val_opinion, - fut_5_val_performance, fut_5_val_justice - - Future 10 Years (15 variables): - - fut_10_pref_read, fut_10_pref_music, fut_10_pref_TV, fut_10_pref_nap, - fut_10_pref_travel - - fut_10_pers_extravert, fut_10_pers_critical, fut_10_pers_dependable, - fut_10_pers_anxious, fut_10_pers_complex - - fut_10_val_obey, fut_10_val_trad, fut_10_val_opinion, - fut_10_val_performance, fut_10_val_justice - -TRANSFORMATION LOGIC: - Step 1: Combine responses from Set A (01) and Set B (02) - - If Set A has a value, use Set A - - If Set A is empty, use Set B - - Step 2: Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -ITEM DOMAINS: - - Preferences (pref): Reading, Music, TV, Nap, Travel - - Personality (pers): Extravert, Critical, Dependable, Anxious, Complex - - Values (val): Obey, Tradition, Opinion, Performance, Justice - - -================================================================================ -SCRIPT 02: dataP 02 - recode present VARS.r -================================================================================ - -PURPOSE: - Recodes present-time Likert scale text responses to numeric values. - -VARIABLES CREATED: 15 total - -SOURCE COLUMNS: - - prePrefItem_1 through prePrefItem_5 (5 columns) - - prePersItem_1 through prePersItem_5 (5 columns) - - preValItem_1 through preValItem_5 (5 columns) - -TARGET VARIABLES: - Present Time (15 variables): - - present_pref_read, present_pref_music, present_pref_tv, present_pref_nap, - present_pref_travel - - present_pers_extravert, present_pers_critical, present_pers_dependable, - present_pers_anxious, present_pers_complex - - present_val_obey, present_val_trad, present_val_opinion, - present_val_performance, present_val_justice - -TRANSFORMATION LOGIC: - Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -SPECIAL NOTE: - Present time uses "present_pref_tv" (lowercase) while past/future use - "past_5_pref_TV" (uppercase). 
This is intentional and preserved from the - original data structure. - - -================================================================================ -SCRIPT 03: dataP 03 - recode DGEN vars.r -================================================================================ - -PURPOSE: - Combines DGEN (domain general) responses from two survey versions (01 and 02). - These are single-item measures for each domain/time combination. - NO RECODING - just copies numeric values as-is. - -VARIABLES CREATED: 12 total (3 domains × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefDGEN_1, 01past5PersDGEN_1, 01past5ValDGEN_1, etc. - - Set B: 02past5PrefDGEN_1, 02past5PersDGEN_1, 02past5ValDGEN_1, etc. - -TARGET VARIABLES: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TRANSFORMATION LOGIC: - - If Set A (01) has a value, use Set A - - If Set A is empty, use Set B (02) - - NO RECODING: Values are copied directly as numeric - -SPECIAL NOTES: - - Future columns in raw data use "_8" suffix for Pref/Pers items - - Future Val columns use "ValuesDGEN" spelling in Set A, "ValDGEN" in Set B - - -================================================================================ -SCRIPT 04: dataP 04 - DGEN means.r -================================================================================ - -PURPOSE: - Calculates mean DGEN scores by averaging the three domain scores (Preferences, - Personality, Values) for each time period. - -VARIABLES CREATED: 4 total (1 per time period) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - - DGEN_past_5_mean - - DGEN_past_10_mean - - DGEN_fut_5_mean - - DGEN_fut_10_mean - -TRANSFORMATION LOGIC: - Each mean = (Pref + Pers + Val) / 3 - - NA values are excluded from calculation (na.rm = TRUE) - - -================================================================================ -SCRIPT 05: dataP 05 - recode scales VARS.r -================================================================================ - -PURPOSE: - Processes two cognitive scales: - 1. AOT (Actively Open-minded Thinking): 8-item scale with reverse coding - 2. CRT (Cognitive Reflection Test): 3-item test with correct/intuitive scoring - -VARIABLES CREATED: 3 total - -SOURCE COLUMNS: - AOT Scale: - - aot_1, aot_2, aot_3, aot_4, aot_5, aot_6, aot_7, aot_8 - - CRT Test: - - crt_1, crt_2, crt_3 - -TARGET VARIABLES: - - aot_total (mean of 8 items with reverse coding) - - crt_correct (proportion of correct answers) - - crt_int (proportion of intuitive/incorrect answers) - -TRANSFORMATION LOGIC: - - AOT Scale (aot_total): - 1. Items 4, 5, 6, 7 are reverse coded by multiplying by -1 - 2. Calculate mean of all 8 items (with reverse coding applied) - 3. Original source values are NOT modified in the dataframe - 4. 
NA values excluded from calculation (na.rm = TRUE) - - CRT Correct (crt_correct): - Correct answers: - - crt_1: "5 cents" - - crt_2: "5 minutes" - - crt_3: "47 days" - Calculation: (Number of correct answers) / (Number of non-missing answers) - - CRT Intuitive (crt_int): - Intuitive (common incorrect) answers: - - crt_1: "10 cents" - - crt_2: "100 minutes" - - crt_3: "24 days" - Calculation: (Number of intuitive answers) / (Number of non-missing answers) - -SPECIAL NOTES: - - CRT scoring is case-insensitive and trims whitespace - - Both CRT scores are proportions (0.00 to 1.00) - - Empty/missing CRT responses are excluded from denominator - - -================================================================================ -SCRIPT 06: dataP 06 - time interval differences.r -================================================================================ - -PURPOSE: - Calculates absolute differences between time intervals to measure perceived - change across time periods for all 15 items. - -VARIABLES CREATED: 90 total (6 difference types × 15 items) - -SOURCE COLUMNS: - - present_pref_read through present_val_justice (15 columns) - - past_5_pref_read through past_5_val_justice (15 columns) - - past_10_pref_read through past_10_val_justice (15 columns) - - fut_5_pref_read through fut_5_val_justice (15 columns) - - fut_10_pref_read through fut_10_val_justice (15 columns) - -TARGET VARIABLES (by difference type): - - NPast_5 (Present vs Past 5 years) - 15 variables: - Formula: |present - past_5| - - NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, NPast_5_pref_nap, - NPast_5_pref_travel - - NPast_5_pers_extravert, NPast_5_pers_critical, NPast_5_pers_dependable, - NPast_5_pers_anxious, NPast_5_pers_complex - - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice - - NPast_10 (Present vs Past 10 years) - 15 variables: - Formula: |present - past_10| - - NPast_10_pref_read, NPast_10_pref_music, NPast_10_pref_TV, - NPast_10_pref_nap, NPast_10_pref_travel - - NPast_10_pers_extravert, NPast_10_pers_critical, NPast_10_pers_dependable, - NPast_10_pers_anxious, NPast_10_pers_complex - - NPast_10_val_obey, NPast_10_val_trad, NPast_10_val_opinion, - NPast_10_val_performance, NPast_10_val_justice - - NFut_5 (Present vs Future 5 years) - 15 variables: - Formula: |present - fut_5| - - NFut_5_pref_read, NFut_5_pref_music, NFut_5_pref_TV, NFut_5_pref_nap, - NFut_5_pref_travel - - NFut_5_pers_extravert, NFut_5_pers_critical, NFut_5_pers_dependable, - NFut_5_pers_anxious, NFut_5_pers_complex - - NFut_5_val_obey, NFut_5_val_trad, NFut_5_val_opinion, - NFut_5_val_performance, NFut_5_val_justice - - NFut_10 (Present vs Future 10 years) - 15 variables: - Formula: |present - fut_10| - - NFut_10_pref_read, NFut_10_pref_music, NFut_10_pref_TV, NFut_10_pref_nap, - NFut_10_pref_travel - - NFut_10_pers_extravert, NFut_10_pers_critical, NFut_10_pers_dependable, - NFut_10_pers_anxious, NFut_10_pers_complex - - NFut_10_val_obey, NFut_10_val_trad, NFut_10_val_opinion, - NFut_10_val_performance, NFut_10_val_justice - - 5.10past (Past 5 vs Past 10 years) - 15 variables: - Formula: |past_5 - past_10| - - 5.10past_pref_read, 5.10past_pref_music, 5.10past_pref_TV, - 5.10past_pref_nap, 5.10past_pref_travel - - 5.10past_pers_extravert, 5.10past_pers_critical, 5.10past_pers_dependable, - 5.10past_pers_anxious, 5.10past_pers_complex - - 5.10past_val_obey, 5.10past_val_trad, 5.10past_val_opinion, - 5.10past_val_performance, 5.10past_val_justice - - 5.10fut (Future 5 vs Future 
10 years) - 15 variables: - Formula: |fut_5 - fut_10| - - 5.10fut_pref_read, 5.10fut_pref_music, 5.10fut_pref_TV, 5.10fut_pref_nap, - 5.10fut_pref_travel - - 5.10fut_pers_extravert, 5.10fut_pers_critical, 5.10fut_pers_dependable, - 5.10fut_pers_anxious, 5.10fut_pers_complex - - 5.10fut_val_obey, 5.10fut_val_trad, 5.10fut_val_opinion, - 5.10fut_val_performance, 5.10fut_val_justice - -TRANSFORMATION LOGIC: - All calculations use absolute differences: - - NPast_5: |present_[item] - past_5_[item]| - - NPast_10: |present_[item] - past_10_[item]| - - NFut_5: |present_[item] - fut_5_[item]| - - NFut_10: |present_[item] - fut_10_[item]| - - 5.10past: |past_5_[item] - past_10_[item]| - - 5.10fut: |fut_5_[item] - fut_10_[item]| - - Result: Always positive values representing magnitude of change - Missing values in either source column result in NA - -SPECIAL NOTES: - - Present time uses "pref_tv" (lowercase) while past/future use "pref_TV" - (uppercase), so script handles this naming inconsistency - - All values are absolute differences (non-negative) - - -================================================================================ -SCRIPT 07: dataP 07 - domain means.r -================================================================================ - -PURPOSE: - Calculates domain-level means by averaging the 5 items within each domain - (Preferences, Personality, Values) for each of the 6 time interval difference - types. - -VARIABLES CREATED: 18 total (6 time intervals × 3 domains) - -SOURCE COLUMNS: - - NPast_5_pref_read through NPast_5_val_justice (15 columns) - - NPast_10_pref_read through NPast_10_val_justice (15 columns) - - NFut_5_pref_read through NFut_5_val_justice (15 columns) - - NFut_10_pref_read through NFut_10_val_justice (15 columns) - - 5.10past_pref_read through 5.10past_val_justice (15 columns) - - 5.10fut_pref_read through 5.10fut_val_justice (15 columns) - Total: 90 difference columns (created in Script 06) - -TARGET VARIABLES: - NPast_5 Domain Means (3 variables): - - NPast_5_pref_MEAN (mean of 5 preference items) - - NPast_5_pers_MEAN (mean of 5 personality items) - - NPast_5_val_MEAN (mean of 5 values items) - - NPast_10 Domain Means (3 variables): - - NPast_10_pref_MEAN - - NPast_10_pers_MEAN - - NPast_10_val_MEAN - - NFut_5 Domain Means (3 variables): - - NFut_5_pref_MEAN - - NFut_5_pers_MEAN - - NFut_5_val_MEAN - - NFut_10 Domain Means (3 variables): - - NFut_10_pref_MEAN - - NFut_10_pers_MEAN - - NFut_10_val_MEAN - - 5.10past Domain Means (3 variables): - - 5.10past_pref_MEAN - - 5.10past_pers_MEAN - - 5.10past_val_MEAN - - 5.10fut Domain Means (3 variables): - - 5.10fut_pref_MEAN - - 5.10fut_pers_MEAN - - 5.10fut_val_MEAN - -TRANSFORMATION LOGIC: - Each domain mean = average of 5 items within that domain - - Example for NPast_5_pref_MEAN: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel) - - Example for NFut_10_pers_MEAN: - = mean(NFut_10_pers_extravert, NFut_10_pers_critical, - NFut_10_pers_dependable, NFut_10_pers_anxious, - NFut_10_pers_complex) - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF DOMAIN MEANS: - - Provides higher-level summary of perceived change by domain - - Reduces item-level noise by aggregating across related items - - Enables domain-level comparisons across time intervals - - Parallel to Script 04 (DGEN means) but for difference scores instead of - raw DGEN ratings - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - Creates domain-level aggregates 
of absolute difference scores - - All means are averages of non-negative values (absolute differences) - - -================================================================================ -SCRIPT 08: dataP 08 - DGEN 510 vars.r -================================================================================ - -PURPOSE: - Calculates absolute differences between 5-year and 10-year DGEN ratings for - both Past and Future time directions. These variables measure the perceived - difference in domain-general change between the two time intervals. - -VARIABLES CREATED: 6 total (3 domains × 2 time directions) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - Total: 12 DGEN columns (created in Script 03) - -TARGET VARIABLES: - Past Direction (3 variables): - - X5_10DGEN_past_pref (|DGEN_past_5_Pref - DGEN_past_10_Pref|) - - X5_10DGEN_past_pers (|DGEN_past_5_Pers - DGEN_past_10_Pers|) - - X5_10DGEN_past_val (|DGEN_past_5_Val - DGEN_past_10_Val|) - - Future Direction (3 variables): - - X5_10DGEN_fut_pref (|DGEN_fut_5_Pref - DGEN_fut_10_Pref|) - - X5_10DGEN_fut_pers (|DGEN_fut_5_Pers - DGEN_fut_10_Pers|) - - X5_10DGEN_fut_val (|DGEN_fut_5_Val - DGEN_fut_10_Val|) - -TRANSFORMATION LOGIC: - Formula: |DGEN_5 - DGEN_10| - - All calculations use absolute differences: - - Past Preferences: |DGEN_past_5_Pref - DGEN_past_10_Pref| - - Past Personality: |DGEN_past_5_Pers - DGEN_past_10_Pers| - - Past Values: |DGEN_past_5_Val - DGEN_past_10_Val| - - Future Preferences: |DGEN_fut_5_Pref - DGEN_fut_10_Pref| - - Future Personality: |DGEN_fut_5_Pers - DGEN_fut_10_Pers| - - Future Values: |DGEN_fut_5_Val - DGEN_fut_10_Val| - - Result: Always positive values representing magnitude of difference - Missing values in either source column result in NA - -SPECIAL NOTES: - - Variable names use "X" prefix because R automatically adds it to column - names starting with numbers (5_10 becomes X5_10) - - This script depends on Script 03 being run first - - Measures interval effects within time direction (past vs future) - - Parallel to Script 06's 5.10past and 5.10fut variables but for DGEN scores - - -================================================================================ -SCRIPT 09: dataP 09 - interval x direction means.r -================================================================================ - -PURPOSE: - Calculates comprehensive mean scores by averaging item-level differences - across intervals and directions. Creates both narrow-scope means (single - time interval) and broad-scope global means (combining multiple intervals). 
- -VARIABLES CREATED: 11 total (6 narrow-scope + 5 global-scope) - -SOURCE COLUMNS: - All 90 difference variables created in Script 06: - - NPast_5_[domain]_[item] (15 variables) - - NPast_10_[domain]_[item] (15 variables) - - NFut_5_[domain]_[item] (15 variables) - - NFut_10_[domain]_[item] (15 variables) - - X5.10past_[domain]_[item] (15 variables) - - X5.10fut_[domain]_[item] (15 variables) - -TARGET VARIABLES: - - Narrow-Scope Means (15 source items each): - - NPast_5_mean (mean across all 15 NPast_5 items) - - NPast_10_mean (mean across all 15 NPast_10 items) - - NFut_5_mean (mean across all 15 NFut_5 items) - - NFut_10_mean (mean across all 15 NFut_10 items) - - X5.10past_mean (mean across all 15 X5.10past items) - - X5.10fut_mean (mean across all 15 X5.10fut items) - - Global-Scope Means (30 source items each): - - NPast_global_mean (NPast_5 + NPast_10: all past intervals) - - NFut_global_mean (NFut_5 + NFut_10: all future intervals) - - X5.10_global_mean (X5.10past + X5.10fut: all 5-vs-10 intervals) - - N5_global_mean (NPast_5 + NFut_5: all 5-year intervals) - - N10_global_mean (NPast_10 + NFut_10: all 10-year intervals) - -TRANSFORMATION LOGIC: - - Narrow-Scope Means (15 items each): - Each mean averages all 15 difference items within one time interval - - Example for NPast_5_mean: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel, - NPast_5_pers_extravert, NPast_5_pers_critical, - NPast_5_pers_dependable, NPast_5_pers_anxious, - NPast_5_pers_complex, - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice) - - Global-Scope Means (30 items each): - Each mean averages 30 difference items across two related intervals - - Example for NPast_global_mean: - = mean(all 15 NPast_5 items + all 15 NPast_10 items) - Represents overall perceived change from present to any past timepoint - - Example for N5_global_mean: - = mean(all 15 NPast_5 items + all 15 NFut_5 items) - Represents overall perceived change at 5-year interval regardless of - direction - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF INTERVAL × DIRECTION MEANS: - - Narrow-scope means: Single-interval summaries across all domains and items - - Global-scope means: Cross-interval summaries for testing: - * Direction effects (past vs future) - * Interval effects (5-year vs 10-year) - * Combined temporal distance effects - - Enables comprehensive analysis of temporal self-perception patterns - - Reduces item-level and domain-level noise through broad aggregation - -QUALITY ASSURANCE: - - Script includes automated QA checks for first 5 rows - - Manually recalculates each mean and verifies against stored values - - Prints TRUE/FALSE match status for each variable - - Ensures calculation accuracy before further analysis - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - All means are averages of absolute difference scores (non-negative) - - Global means provide the broadest temporal self-perception summaries - - Naming convention uses "global" for 30-item means, no suffix for 15-item - - -================================================================================ -SCRIPT 10: dataP 10 - DGEN mean vars.r -================================================================================ - -PURPOSE: - Calculates mean DGEN scores by averaging across different time combinations. - Creates means for Past, Future, and interval-based (5-year, 10-year) groupings. 
- -VARIABLES CREATED: 6 total - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - Direction-Based Means (2 variables): - - DGEN_past_mean (mean of past_5_mean and past_10_mean) - - DGEN_fut_mean (mean of fut_5_mean and fut_10_mean) - - Interval-Based Means (2 variables): - - DGEN_5_mean (mean of past_5_mean and fut_5_mean) - - DGEN_10_mean (mean of past_10_mean and fut_10_mean) - - Domain-Based Means (2 variables): - - DGEN_pref_mean (mean across all 4 time periods for Preferences) - - DGEN_pers_mean (mean across all 4 time periods for Personality) - -TRANSFORMATION LOGIC: - Direction-based: - - DGEN_past_mean = mean(DGEN_past_5_mean, DGEN_past_10_mean) - - DGEN_fut_mean = mean(DGEN_fut_5_mean, DGEN_fut_10_mean) - - Interval-based: - - DGEN_5_mean = mean(DGEN_past_5_mean, DGEN_fut_5_mean) - - DGEN_10_mean = mean(DGEN_past_10_mean, DGEN_fut_10_mean) - - Domain-based: - - DGEN_pref_mean = mean across all 4 Pref scores - - DGEN_pers_mean = mean across all 4 Pers scores - - NA values excluded from calculation (na.rm = TRUE) - - -================================================================================ -SCRIPT 11: dataP 11 - CORRECT ehi vars.r -================================================================================ - -PURPOSE: - Creates Enduring Hedonic Impact (EHI) variables by calculating differences - between Past and Future responses for each item across different time intervals. - Formula: NPast - NFut (positive values indicate greater past-present change) - -VARIABLES CREATED: 45 total (15 items × 3 time intervals) - -SOURCE COLUMNS: - 5-year intervals: - - NPast_5_pref_read through NPast_5_val_justice (15 columns) - - NFut_5_pref_read through NFut_5_val_justice (15 columns) - - 10-year intervals: - - NPast_10_pref_read through NPast_10_val_justice (15 columns) - - NFut_10_pref_read through NFut_10_val_justice (15 columns) - - 5-10 year change: - - X5.10past_pref_read through X5.10past_val_justice (15 columns) - - X5.10fut_pref_read through X5.10fut_val_justice (15 columns) - -TARGET VARIABLES: - 5-Year EHI Variables (15 variables): - - ehi5_pref_read, ehi5_pref_music, ehi5_pref_TV, ehi5_pref_nap, - ehi5_pref_travel - - ehi5_pers_extravert, ehi5_pers_critical, ehi5_pers_dependable, - ehi5_pers_anxious, ehi5_pers_complex - - ehi5_val_obey, ehi5_val_trad, ehi5_val_opinion, ehi5_val_performance, - ehi5_val_justice - - 10-Year EHI Variables (15 variables): - - ehi10_pref_read, ehi10_pref_music, ehi10_pref_TV, ehi10_pref_nap, - ehi10_pref_travel - - ehi10_pers_extravert, ehi10_pers_critical, ehi10_pers_dependable, - ehi10_pers_anxious, ehi10_pers_complex - - ehi10_val_obey, ehi10_val_trad, ehi10_val_opinion, ehi10_val_performance, - ehi10_val_justice - - 5-10 Year Change EHI Variables (15 variables): - - ehi5.10_pref_read, ehi5.10_pref_music, ehi5.10_pref_TV, ehi5.10_pref_nap, - ehi5.10_pref_travel - - ehi5.10_pers_extravert, ehi5.10_pers_critical, ehi5.10_pers_dependable, - ehi5.10_pers_anxious, ehi5.10_pers_complex - - ehi5.10_val_obey, ehi5.10_val_trad, ehi5.10_val_opinion, - ehi5.10_val_performance, ehi5.10_val_justice - -TRANSFORMATION LOGIC: - Formula: NPast - NFut - - All calculations use signed differences: - - ehi5_[item] = NPast_5_[item] - NFut_5_[item] - - ehi10_[item] = NPast_10_[item] - NFut_10_[item] - - ehi5.10_[item] = X5.10past_[item] - 

================================================================================
SCRIPT 11: dataP 11 - CORRECT ehi vars.r
================================================================================

PURPOSE:
  Creates Enduring Hedonic Impact (EHI) variables by calculating signed
  differences between the Past and Future difference scores (from Script 06)
  for each item across the three time intervals.
  Formula: NPast - NFut (positive values indicate greater past-present change)

VARIABLES CREATED: 45 total (15 items × 3 time intervals)

SOURCE COLUMNS:
  5-year intervals:
  - NPast_5_pref_read through NPast_5_val_justice (15 columns)
  - NFut_5_pref_read through NFut_5_val_justice (15 columns)

  10-year intervals:
  - NPast_10_pref_read through NPast_10_val_justice (15 columns)
  - NFut_10_pref_read through NFut_10_val_justice (15 columns)

  5-10 year change:
  - X5.10past_pref_read through X5.10past_val_justice (15 columns)
  - X5.10fut_pref_read through X5.10fut_val_justice (15 columns)

TARGET VARIABLES:
  5-Year EHI Variables (15 variables):
  - ehi5_pref_read, ehi5_pref_music, ehi5_pref_TV, ehi5_pref_nap,
    ehi5_pref_travel
  - ehi5_pers_extravert, ehi5_pers_critical, ehi5_pers_dependable,
    ehi5_pers_anxious, ehi5_pers_complex
  - ehi5_val_obey, ehi5_val_trad, ehi5_val_opinion, ehi5_val_performance,
    ehi5_val_justice

  10-Year EHI Variables (15 variables):
  - ehi10_pref_read, ehi10_pref_music, ehi10_pref_TV, ehi10_pref_nap,
    ehi10_pref_travel
  - ehi10_pers_extravert, ehi10_pers_critical, ehi10_pers_dependable,
    ehi10_pers_anxious, ehi10_pers_complex
  - ehi10_val_obey, ehi10_val_trad, ehi10_val_opinion, ehi10_val_performance,
    ehi10_val_justice

  5-10 Year Change EHI Variables (15 variables):
  - ehi5.10_pref_read, ehi5.10_pref_music, ehi5.10_pref_TV, ehi5.10_pref_nap,
    ehi5.10_pref_travel
  - ehi5.10_pers_extravert, ehi5.10_pers_critical, ehi5.10_pers_dependable,
    ehi5.10_pers_anxious, ehi5.10_pers_complex
  - ehi5.10_val_obey, ehi5.10_val_trad, ehi5.10_val_opinion,
    ehi5.10_val_performance, ehi5.10_val_justice

TRANSFORMATION LOGIC:
  Formula: NPast - NFut

  All calculations use signed differences:
  - ehi5_[item] = NPast_5_[item] - NFut_5_[item]
  - ehi10_[item] = NPast_10_[item] - NFut_10_[item]
  - ehi5.10_[item] = X5.10past_[item] - X5.10fut_[item]

  Result: Positive = greater past change, Negative = greater future change
  Missing values in either source column result in NA

QUALITY ASSURANCE:
  - Comprehensive QA checks for all 45 variables across all rows
  - First 5 rows displayed with detailed calculations showing source values,
    computed differences, and stored values
  - Pass/Fail status for each variable reported
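A sketch of the signed-difference construction, reusing the illustrative
"items" vector from the Script 09 sketch (again, this is not the actual
script):

  for (item in items) {
    # signed differences: sign is preserved, unlike Script 06's |a - b|
    eohi2[[paste0("ehi5_", item)]] <-
      eohi2[[paste0("NPast_5_", item)]] - eohi2[[paste0("NFut_5_", item)]]
    eohi2[[paste0("ehi10_", item)]] <-
      eohi2[[paste0("NPast_10_", item)]] - eohi2[[paste0("NFut_10_", item)]]
    eohi2[[paste0("ehi5.10_", item)]] <-
      eohi2[[paste0("X5.10past_", item)]] - eohi2[[paste0("X5.10fut_", item)]]
  }
  # NA in either operand propagates to the result, matching the rule above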

================================================================================
SCRIPT 12: dataP 12 - CORRECT DGEN ehi vars.r
================================================================================

PURPOSE:
  Creates domain-general EHI variables by calculating differences between
  Past and Future DGEN responses. These are the domain-general parallel to
  Script 11's domain-specific EHI variables.

VARIABLES CREATED: 6 total (3 domains × 2 time intervals)

SOURCE COLUMNS:
  - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val
  - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val
  - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val
  - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val

TARGET VARIABLES:
  5-Year DGEN EHI (3 variables):
  - ehiDGEN_5_Pref
  - ehiDGEN_5_Pers
  - ehiDGEN_5_Val

  10-Year DGEN EHI (3 variables):
  - ehiDGEN_10_Pref
  - ehiDGEN_10_Pers
  - ehiDGEN_10_Val

TRANSFORMATION LOGIC:
  Formula: DGEN_past - DGEN_fut

  All calculations use signed differences:
  - ehiDGEN_5_Pref = DGEN_past_5_Pref - DGEN_fut_5_Pref
  - ehiDGEN_5_Pers = DGEN_past_5_Pers - DGEN_fut_5_Pers
  - ehiDGEN_5_Val = DGEN_past_5_Val - DGEN_fut_5_Val
  - ehiDGEN_10_Pref = DGEN_past_10_Pref - DGEN_fut_10_Pref
  - ehiDGEN_10_Pers = DGEN_past_10_Pers - DGEN_fut_10_Pers
  - ehiDGEN_10_Val = DGEN_past_10_Val - DGEN_fut_10_Val

  Result: Positive = greater past change, Negative = greater future change

QUALITY ASSURANCE:
  - QA checks for all 6 variables across all rows
  - First 5 rows displayed with detailed calculations
  - Pass/Fail status for each variable reported


================================================================================
SCRIPT 13: datap 13 - ehi domain specific means.r
================================================================================

PURPOSE:
  Calculates domain-level mean EHI scores by averaging the 5 items within
  each domain (Preferences, Personality, Values) for each time interval.

VARIABLES CREATED: 9 total (3 domains × 3 time intervals)

SOURCE COLUMNS:
  - ehi5_pref_read through ehi5_val_justice (15 columns)
  - ehi10_pref_read through ehi10_val_justice (15 columns)
  - ehi5.10_pref_read through ehi5.10_val_justice (15 columns)

TARGET VARIABLES:
  5-Year Domain Means (3 variables):
  - ehi5_pref_MEAN (mean of 5 preference items)
  - ehi5_pers_MEAN (mean of 5 personality items)
  - ehi5_val_MEAN (mean of 5 values items)

  10-Year Domain Means (3 variables):
  - ehi10_pref_MEAN
  - ehi10_pers_MEAN
  - ehi10_val_MEAN

  5-10 Year Change Domain Means (3 variables):
  - ehi5.10_pref_MEAN
  - ehi5.10_pers_MEAN
  - ehi5.10_val_MEAN

TRANSFORMATION LOGIC:
  Each domain mean = average of 5 items within that domain

  Example for ehi5_pref_MEAN:
  = mean(ehi5_pref_read, ehi5_pref_music, ehi5_pref_TV,
         ehi5_pref_nap, ehi5_pref_travel)

  NA values excluded from calculation (na.rm = TRUE)

QUALITY ASSURANCE:
  - Comprehensive QA for all 9 variables across all rows
  - First 5 rows displayed for multiple domain means
  - Pass/Fail status for each variable
  - A sketch of this verification pattern follows below
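The verification pattern shared by Scripts 11-14 can be sketched as a small
helper. The qa_check() function and its output format are illustrative, not
taken from the scripts, which print more detail per row:

  qa_check <- function(df, target, source_cols, n = 5) {
    # recompute the stored mean from its source columns
    recomputed <- rowMeans(df[, source_cols], na.rm = TRUE)
    ok <- isTRUE(all.equal(recomputed, df[[target]],
                           check.attributes = FALSE))
    cat(target, if (ok) "PASS" else "FAIL", "\n")
    # show the first n rows side by side for manual inspection
    print(head(data.frame(stored = df[[target]], recomputed = recomputed), n))
    invisible(ok)
  }

  qa_check(eohi2, "ehi5_pref_MEAN",
           paste0("ehi5_pref_", c("read", "music", "TV", "nap", "travel")))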

================================================================================
SCRIPT 14: datap 14 - all ehi global means.r
================================================================================

PURPOSE:
  Calculates global EHI means by averaging domain-level means, creating the
  highest-level EHI summary scores for both the domain-general and
  domain-specific measures.

VARIABLES CREATED: 5 total

SOURCE COLUMNS:
  - ehiDGEN_5_Pref, ehiDGEN_5_Pers, ehiDGEN_5_Val
  - ehiDGEN_10_Pref, ehiDGEN_10_Pers, ehiDGEN_10_Val
  - ehi5_pref_MEAN, ehi5_pers_MEAN, ehi5_val_MEAN
  - ehi10_pref_MEAN, ehi10_pers_MEAN, ehi10_val_MEAN
  - ehi5.10_pref_MEAN, ehi5.10_pers_MEAN, ehi5.10_val_MEAN

TARGET VARIABLES:
  DGEN Global Means (2 variables):
  - ehiDGEN_5_mean (mean of 3 DGEN domains for 5-year)
  - ehiDGEN_10_mean (mean of 3 DGEN domains for 10-year)

  Domain-Specific Global Means (3 variables):
  - ehi5_global_mean (mean of 3 domain means for 5-year)
  - ehi10_global_mean (mean of 3 domain means for 10-year)
  - ehi5.10_global_mean (mean of 3 domain means for 5-10 change)

TRANSFORMATION LOGIC:
  Each global mean = average of 3 domain-level scores

  Example for ehiDGEN_5_mean:
  = mean(ehiDGEN_5_Pref, ehiDGEN_5_Pers, ehiDGEN_5_Val)

  Example for ehi5_global_mean:
  = mean(ehi5_pref_MEAN, ehi5_pers_MEAN, ehi5_val_MEAN)

  NA values excluded from calculation (na.rm = TRUE)

QUALITY ASSURANCE:
  - QA for all 5 global means across all rows
  - First 5 rows displayed with detailed calculations
  - Values shown with 5-decimal precision
  - Pass/Fail status for each variable


================================================================================
SUMMARY OF ALL CREATED VARIABLES
================================================================================

Total Variables Created: 290 (see the arithmetic check after this summary)

By Script:
  - Script 01: 60 variables (past/future recoded items)
  - Script 02: 15 variables (present recoded items)
  - Script 03: 12 variables (DGEN domain scores)
  - Script 04: 4 variables (DGEN time period means)
  - Script 05: 3 variables (AOT & CRT scales)
  - Script 06: 90 variables (time interval differences)
  - Script 07: 18 variables (domain means for differences)
  - Script 08: 6 variables (DGEN 5-vs-10 differences)
  - Script 09: 11 variables (interval × direction means)
  - Script 10: 6 variables (DGEN combined means)
  - Script 11: 45 variables (domain-specific EHI scores)
  - Script 12: 6 variables (DGEN EHI scores)
  - Script 13: 9 variables (EHI domain means)
  - Script 14: 5 variables (EHI global means)

By Category:
  Time Period Items (75 total):
  * Present: 15 items
  * Past 5: 15 items
  * Past 10: 15 items
  * Future 5: 15 items
  * Future 10: 15 items

  DGEN Variables (28 total):
  * Domain scores: 12 (3 domains × 4 time periods)
  * Time period means: 4 (1 per time period)
  * 5-vs-10 differences: 6 (3 domains × 2 directions)
  * Combined means: 6 (direction-, interval-, and domain-based)

  Cognitive Scales (3 total):
  * AOT total
  * CRT correct
  * CRT intuitive

  Time Differences (90 total):
  * NPast_5: 15 differences
  * NPast_10: 15 differences
  * NFut_5: 15 differences
  * NFut_10: 15 differences
  * 5.10past: 15 differences
  * 5.10fut: 15 differences

  Domain Means for Differences (18 total):
  * NPast_5: 3 domain means
  * NPast_10: 3 domain means
  * NFut_5: 3 domain means
  * NFut_10: 3 domain means
  * 5.10past: 3 domain means
  * 5.10fut: 3 domain means

  Interval × Direction Means (11 total):
  * Narrow-scope means: 6 (NPast_5, NPast_10, NFut_5, NFut_10,
    X5.10past, X5.10fut)
  * Global-scope means: 5 (NPast_global, NFut_global, X5.10_global,
    N5_global, N10_global)

  EHI Variables (65 total):
  * Domain-specific EHI: 45 (15 items × 3 time intervals)
  * DGEN EHI: 6 (3 domains × 2 time intervals)
  * Domain means: 9 (3 domains × 3 time intervals)
  * Global means: 5 (2 DGEN + 3 domain-specific)
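As a quick arithmetic check, the per-script counts above sum to the grand
total (75 + 28 + 3 + 90 + 18 + 11 + 65 across categories gives the same
figure):

  sum(c(60, 15, 12, 4, 3, 90, 18, 6, 11, 6, 45, 6, 9, 5))   # 290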

================================================================================
DATA PROCESSING NOTES
================================================================================

1. PROCESSING ORDER:
   Scripts MUST be run in numerical order (01 → 14), as later scripts depend
   on variables created by earlier scripts. A runner sketch follows this
   list.

   Key Dependencies:
   - Script 03 required before Scripts 04, 08, 10, and 12 (DGEN scores)
   - Script 04 required before Script 10 (DGEN time period means)
   - Script 06 required before Scripts 07, 09, and 11 (time interval
     differences)
   - Script 11 required before Script 13 (domain-specific EHI items)
   - Script 12 required before Script 14 (DGEN EHI scores)
   - Script 13 required before Script 14 (EHI domain means)

2. SURVEY VERSION HANDLING:
   - Two survey versions (01 and 02) were used
   - Scripts 01 and 03 combine these versions
   - Version 01 takes precedence when both contain a response

3. MISSING DATA:
   - Empty cells and NA values are preserved throughout processing
   - Mean calculations use na.rm = TRUE to exclude missing values
   - Difference calculations yield NA if either source value is missing

4. QUALITY ASSURANCE:
   - Each script includes QA checks with row verification
   - Manual calculation checks confirm proper transformations
   - Column existence checks prevent errors from missing source data
   - Scripts 09-14 include comprehensive QA with the first 5 rows displayed
   - All EHI scripts (11-14) verify calculations against stored values
   - Pass/Fail status is reported for all variables in QA-enabled scripts

5. FILE SAVING:
   - Most scripts, including Scripts 08 and 09, save directly to eohi2.csv
   - Scripts 04, 06, and 07 have their write commands commented out for
     review
   - Each script overwrites existing target columns if present

6. SPECIAL NAMING CONVENTIONS:
   - The "pref_tv" vs "pref_TV" inconsistency is maintained from the source
     data
   - DGEN variables use underscores (DGEN_past_5_Pref)
   - Difference variables use descriptive prefixes (NPast_5_, 5.10past_)
   - An "X" prefix is added to variables starting with numbers
     (X5.10past_mean); see the make.names() illustration below
   - Global means use "_global_" to distinguish them from narrow-scope means
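Two small illustrations of these notes. The runner assumes the dataP script
files sit in the working directory under the names documented above:

  # Note 1: run the pipeline in numerical order; the zero-padded script
  # numbers make a plain sort() sufficient
  scripts <- list.files(pattern = "^data[Pp] [0-9]{2} .*\\.r$")
  for (s in sort(scripts)) {
    cat("Running:", s, "\n")
    source(s)
  }

  # Note 6: the "X" prefix is R's standard repair for column names that
  # start with a digit (read.csv with check.names = TRUE calls make.names)
  make.names("5.10past_pref_read")   # "X5.10past_pref_read"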

================================================================================
ITEM REFERENCE GUIDE
================================================================================

15 Core Items (used across all time periods):

PREFERENCES (5 items):
  1. pref_read - Reading preferences
  2. pref_music - Music preferences
  3. pref_TV/tv - TV watching preferences (note case variation)
  4. pref_nap - Napping preferences
  5. pref_travel - Travel preferences

PERSONALITY (5 items):
  6. pers_extravert - Extraverted personality
  7. pers_critical - Critical thinking personality
  8. pers_dependable - Dependable personality
  9. pers_anxious - Anxious personality
  10. pers_complex - Complex personality

VALUES (5 items):
  11. val_obey - Value of obedience
  12. val_trad - Value of tradition
  13. val_opinion - Value of expressing opinions
  14. val_performance - Value of performance
  15. val_justice - Value of justice


================================================================================
EHI CONCEPT AND INTERPRETATION
================================================================================

ENDURING HEDONIC IMPACT (EHI):
  EHI measures the asymmetry between perceived past and future change in
  psychological attributes. The concept rests on the premise that people may
  perceive their past and future selves differently, even when considering
  equivalent time distances.

KEY EHI VARIABLES:
  Domain-Specific EHI (Scripts 11, 13, 14):
    Calculated from item-level differences between past and future responses.
    Formula: NPast - NFut
    * Positive values: greater perceived change from past to present
    * Negative values: greater perceived change from present to future
    * Zero: symmetric perception of past and future change

  Domain-General EHI (Scripts 12, 14):
    Calculated from DGEN single-item responses.
    Formula: DGEN_past - DGEN_fut
    * Measures broader temporal self-perception without item-level detail

HIERARCHICAL STRUCTURE:
  Level 1: Item-level EHI (45 domain-specific, 6 DGEN)
  Level 2: Domain means (9 domain-specific, combining 5 items each)
  Level 3: Global means (5 highest-level summaries)

INTERPRETATION:
  - EHI > 0: "past asymmetry" - person perceives greater change from the past
  - EHI < 0: "future asymmetry" - person perceives greater change toward the
    future
  - EHI ≈ 0: "temporal symmetry" - balanced perception of past/future change
  A classification sketch follows below.
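An illustrative classification of participants by a global EHI score. The
±0.25 symmetry band is an arbitrary choice for this sketch, not a documented
cutoff:

  eohi2$ehi5_asymmetry <- cut(eohi2$ehi5_global_mean,
                              breaks = c(-Inf, -0.25, 0.25, Inf),
                              labels = c("future asymmetry",
                                         "temporal symmetry",
                                         "past asymmetry"))
  table(eohi2$ehi5_asymmetry, useNA = "ifany")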

================================================================================
END OF DOCUMENTATION
================================================================================
Last Updated: October 8, 2025
Processing Pipeline: Scripts 01-14
of absolute difference scores - - All means are averages of non-negative values (absolute differences) - - -================================================================================ -SCRIPT 08: dataP 08 - DGEN 510 vars.r -================================================================================ - -PURPOSE: - Calculates absolute differences between 5-year and 10-year DGEN ratings for - both Past and Future time directions. These variables measure the perceived - difference in domain-general change between the two time intervals. - -VARIABLES CREATED: 6 total (3 domains × 2 time directions) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - Total: 12 DGEN columns (created in Script 03) - -TARGET VARIABLES: - Past Direction (3 variables): - - X5_10DGEN_past_pref (|DGEN_past_5_Pref - DGEN_past_10_Pref|) - - X5_10DGEN_past_pers (|DGEN_past_5_Pers - DGEN_past_10_Pers|) - - X5_10DGEN_past_val (|DGEN_past_5_Val - DGEN_past_10_Val|) - - Future Direction (3 variables): - - X5_10DGEN_fut_pref (|DGEN_fut_5_Pref - DGEN_fut_10_Pref|) - - X5_10DGEN_fut_pers (|DGEN_fut_5_Pers - DGEN_fut_10_Pers|) - - X5_10DGEN_fut_val (|DGEN_fut_5_Val - DGEN_fut_10_Val|) - -TRANSFORMATION LOGIC: - Formula: |DGEN_5 - DGEN_10| - - All calculations use absolute differences: - - Past Preferences: |DGEN_past_5_Pref - DGEN_past_10_Pref| - - Past Personality: |DGEN_past_5_Pers - DGEN_past_10_Pers| - - Past Values: |DGEN_past_5_Val - DGEN_past_10_Val| - - Future Preferences: |DGEN_fut_5_Pref - DGEN_fut_10_Pref| - - Future Personality: |DGEN_fut_5_Pers - DGEN_fut_10_Pers| - - Future Values: |DGEN_fut_5_Val - DGEN_fut_10_Val| - - Result: Always positive values representing magnitude of difference - Missing values in either source column result in NA - -SPECIAL NOTES: - - Variable names use "X" prefix because R automatically adds it to column - names starting with numbers (5_10 becomes X5_10) - - This script depends on Script 03 being run first - - Measures interval effects within time direction (past vs future) - - Parallel to Script 06's 5.10past and 5.10fut variables but for DGEN scores - - -================================================================================ -SCRIPT 09: dataP 09 - interval x direction means.r -================================================================================ - -PURPOSE: - Calculates comprehensive mean scores by averaging item-level differences - across intervals and directions. Creates both narrow-scope means (single - time interval) and broad-scope global means (combining multiple intervals). 
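ILLUSTRATIVE EXAMPLE (a minimal R sketch of the means defined in this section, not the script itself; the data frame name `df` is an assumption, holding the Script 06 columns listed under SOURCE COLUMNS below):

    items <- c("pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel",
               "pers_extravert", "pers_critical", "pers_dependable",
               "pers_anxious", "pers_complex",
               "val_obey", "val_trad", "val_opinion", "val_performance",
               "val_justice")
    npast5_cols  <- paste0("NPast_5_",  items)
    npast10_cols <- paste0("NPast_10_", items)

    # Narrow-scope mean: all 15 items of a single interval, NAs dropped
    df$NPast_5_mean <- rowMeans(df[npast5_cols], na.rm = TRUE)

    # Global-scope mean: all 30 items of two related intervals
    df$NPast_global_mean <- rowMeans(df[c(npast5_cols, npast10_cols)],
                                     na.rm = TRUE)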
- -VARIABLES CREATED: 11 total (6 narrow-scope + 5 global-scope) - -SOURCE COLUMNS: - All 90 difference variables created in Script 06: - - NPast_5_[domain]_[item] (15 variables) - - NPast_10_[domain]_[item] (15 variables) - - NFut_5_[domain]_[item] (15 variables) - - NFut_10_[domain]_[item] (15 variables) - - X5.10past_[domain]_[item] (15 variables) - - X5.10fut_[domain]_[item] (15 variables) - -TARGET VARIABLES: - - Narrow-Scope Means (15 source items each): - - NPast_5_mean (mean across all 15 NPast_5 items) - - NPast_10_mean (mean across all 15 NPast_10 items) - - NFut_5_mean (mean across all 15 NFut_5 items) - - NFut_10_mean (mean across all 15 NFut_10 items) - - X5.10past_mean (mean across all 15 X5.10past items) - - X5.10fut_mean (mean across all 15 X5.10fut items) - - Global-Scope Means (30 source items each): - - NPast_global_mean (NPast_5 + NPast_10: all past intervals) - - NFut_global_mean (NFut_5 + NFut_10: all future intervals) - - X5.10_global_mean (X5.10past + X5.10fut: all 5-vs-10 intervals) - - N5_global_mean (NPast_5 + NFut_5: all 5-year intervals) - - N10_global_mean (NPast_10 + NFut_10: all 10-year intervals) - -TRANSFORMATION LOGIC: - - Narrow-Scope Means (15 items each): - Each mean averages all 15 difference items within one time interval - - Example for NPast_5_mean: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel, - NPast_5_pers_extravert, NPast_5_pers_critical, - NPast_5_pers_dependable, NPast_5_pers_anxious, - NPast_5_pers_complex, - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice) - - Global-Scope Means (30 items each): - Each mean averages 30 difference items across two related intervals - - Example for NPast_global_mean: - = mean(all 15 NPast_5 items + all 15 NPast_10 items) - Represents overall perceived change from present to any past timepoint - - Example for N5_global_mean: - = mean(all 15 NPast_5 items + all 15 NFut_5 items) - Represents overall perceived change at 5-year interval regardless of - direction - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF INTERVAL × DIRECTION MEANS: - - Narrow-scope means: Single-interval summaries across all domains and items - - Global-scope means: Cross-interval summaries for testing: - * Direction effects (past vs future) - * Interval effects (5-year vs 10-year) - * Combined temporal distance effects - - Enables comprehensive analysis of temporal self-perception patterns - - Reduces item-level and domain-level noise through broad aggregation - -QUALITY ASSURANCE: - - Script includes automated QA checks for first 5 rows - - Manually recalculates each mean and verifies against stored values - - Prints TRUE/FALSE match status for each variable - - Ensures calculation accuracy before further analysis - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - All means are averages of absolute difference scores (non-negative) - - Global means provide the broadest temporal self-perception summaries - - Naming convention uses "global" for 30-item means, no suffix for 15-item - - -================================================================================ -SCRIPT 10: dataP 10 - DGEN mean vars.r -================================================================================ - -PURPOSE: - Calculates mean DGEN scores by averaging across different time combinations. - Creates means for Past, Future, and interval-based (5-year, 10-year) groupings. 
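ILLUSTRATIVE EXAMPLE (a minimal R sketch of the groupings defined in this section, not the script itself; `df` is an assumed data frame):

    # Direction- and interval-based means average the Script 04 period means
    df$DGEN_past_mean <- rowMeans(df[c("DGEN_past_5_mean", "DGEN_past_10_mean")],
                                  na.rm = TRUE)
    df$DGEN_5_mean    <- rowMeans(df[c("DGEN_past_5_mean", "DGEN_fut_5_mean")],
                                  na.rm = TRUE)

    # Domain-based means average the raw domain scores across all 4 periods
    df$DGEN_pref_mean <- rowMeans(df[c("DGEN_past_5_Pref", "DGEN_past_10_Pref",
                                       "DGEN_fut_5_Pref", "DGEN_fut_10_Pref")],
                                  na.rm = TRUE)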
- -VARIABLES CREATED: 6 total - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - - DGEN_past_5_mean, DGEN_past_10_mean, DGEN_fut_5_mean, DGEN_fut_10_mean - (created in Script 04; used for the direction- and interval-based means) - -TARGET VARIABLES: - Direction-Based Means (2 variables): - - DGEN_past_mean (mean of past_5_mean and past_10_mean) - - DGEN_fut_mean (mean of fut_5_mean and fut_10_mean) - - Interval-Based Means (2 variables): - - DGEN_5_mean (mean of past_5_mean and fut_5_mean) - - DGEN_10_mean (mean of past_10_mean and fut_10_mean) - - Domain-Based Means (2 variables): - - DGEN_pref_mean (mean across all 4 time periods for Preferences) - - DGEN_pers_mean (mean across all 4 time periods for Personality) - -TRANSFORMATION LOGIC: - Direction-based: - - DGEN_past_mean = mean(DGEN_past_5_mean, DGEN_past_10_mean) - - DGEN_fut_mean = mean(DGEN_fut_5_mean, DGEN_fut_10_mean) - - Interval-based: - - DGEN_5_mean = mean(DGEN_past_5_mean, DGEN_fut_5_mean) - - DGEN_10_mean = mean(DGEN_past_10_mean, DGEN_fut_10_mean) - - Domain-based: - - DGEN_pref_mean = mean across all 4 Pref scores - - DGEN_pers_mean = mean across all 4 Pers scores - - NA values excluded from calculation (na.rm = TRUE) - - -================================================================================ -SCRIPT 11: dataP 11 - CORRECT ehi vars.r -================================================================================ - -PURPOSE: - Creates Enduring Hedonic Impact (EHI) variables by calculating differences - between Past and Future responses for each item across different time intervals. - Formula: NPast - NFut (positive values indicate greater past-present change) - -VARIABLES CREATED: 45 total (15 items × 3 time intervals) - -SOURCE COLUMNS: - 5-year intervals: - - NPast_5_pref_read through NPast_5_val_justice (15 columns) - - NFut_5_pref_read through NFut_5_val_justice (15 columns) - - 10-year intervals: - - NPast_10_pref_read through NPast_10_val_justice (15 columns) - - NFut_10_pref_read through NFut_10_val_justice (15 columns) - - 5-10 year change: - - X5.10past_pref_read through X5.10past_val_justice (15 columns) - - X5.10fut_pref_read through X5.10fut_val_justice (15 columns) - -TARGET VARIABLES: - 5-Year EHI Variables (15 variables): - - ehi5_pref_read, ehi5_pref_music, ehi5_pref_TV, ehi5_pref_nap, - ehi5_pref_travel - - ehi5_pers_extravert, ehi5_pers_critical, ehi5_pers_dependable, - ehi5_pers_anxious, ehi5_pers_complex - - ehi5_val_obey, ehi5_val_trad, ehi5_val_opinion, ehi5_val_performance, - ehi5_val_justice - - 10-Year EHI Variables (15 variables): - - ehi10_pref_read, ehi10_pref_music, ehi10_pref_TV, ehi10_pref_nap, - ehi10_pref_travel - - ehi10_pers_extravert, ehi10_pers_critical, ehi10_pers_dependable, - ehi10_pers_anxious, ehi10_pers_complex - - ehi10_val_obey, ehi10_val_trad, ehi10_val_opinion, ehi10_val_performance, - ehi10_val_justice - - 5-10 Year Change EHI Variables (15 variables): - - ehi5.10_pref_read, ehi5.10_pref_music, ehi5.10_pref_TV, ehi5.10_pref_nap, - ehi5.10_pref_travel - - ehi5.10_pers_extravert, ehi5.10_pers_critical, ehi5.10_pers_dependable, - ehi5.10_pers_anxious, ehi5.10_pers_complex - - ehi5.10_val_obey, ehi5.10_val_trad, ehi5.10_val_opinion, - ehi5.10_val_performance, ehi5.10_val_justice - -TRANSFORMATION LOGIC: - Formula: NPast - NFut - - All calculations use signed differences: - - ehi5_[item] = NPast_5_[item] - NFut_5_[item] - - ehi10_[item] = NPast_10_[item] - NFut_10_[item] - - ehi5.10_[item] = X5.10past_[item] - 
X5.10fut_[item] - - Result: Positive = greater past change, Negative = greater future change - Missing values in either source column result in NA - -QUALITY ASSURANCE: - - Comprehensive QA checks for all 45 variables across all rows - - First 5 rows displayed with detailed calculations showing source values, - computed differences, and stored values - - Pass/Fail status for each variable reported - - -================================================================================ -SCRIPT 12: dataP 12 - CORRECT DGEN ehi vars.r -================================================================================ - -PURPOSE: - Creates domain-general EHI variables by calculating differences between Past - and Future DGEN responses. These are the domain-general parallel to Script 11's - domain-specific EHI variables. - -VARIABLES CREATED: 6 total (3 domains × 2 time intervals) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - 5-Year DGEN EHI (3 variables): - - ehiDGEN_5_Pref - - ehiDGEN_5_Pers - - ehiDGEN_5_Val - - 10-Year DGEN EHI (3 variables): - - ehiDGEN_10_Pref - - ehiDGEN_10_Pers - - ehiDGEN_10_Val - -TRANSFORMATION LOGIC: - Formula: DGEN_past - DGEN_fut - - All calculations use signed differences: - - ehiDGEN_5_Pref = DGEN_past_5_Pref - DGEN_fut_5_Pref - - ehiDGEN_5_Pers = DGEN_past_5_Pers - DGEN_fut_5_Pers - - ehiDGEN_5_Val = DGEN_past_5_Val - DGEN_fut_5_Val - - ehiDGEN_10_Pref = DGEN_past_10_Pref - DGEN_fut_10_Pref - - ehiDGEN_10_Pers = DGEN_past_10_Pers - DGEN_fut_10_Pers - - ehiDGEN_10_Val = DGEN_past_10_Val - DGEN_fut_10_Val - - Result: Positive = greater past change, Negative = greater future change - -QUALITY ASSURANCE: - - QA checks for all 6 variables across all rows - - First 5 rows displayed with detailed calculations - - Pass/Fail status for each variable reported - - -================================================================================ -SCRIPT 13: datap 13 - ehi domain specific means.r -================================================================================ - -PURPOSE: - Calculates domain-level mean EHI scores by averaging the 5 items within each - domain (Preferences, Personality, Values) for each time interval. 
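ILLUSTRATIVE EXAMPLE (a minimal R sketch of the domain averaging described above, not the script itself; `df` is an assumed data frame):

    # Each domain mean averages the 5 EHI items of that domain (NAs dropped)
    pref_items <- c("read", "music", "TV", "nap", "travel")
    df$ehi5_pref_MEAN <- rowMeans(df[paste0("ehi5_pref_", pref_items)],
                                  na.rm = TRUE)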
- -VARIABLES CREATED: 9 total (3 domains × 3 time intervals) - -SOURCE COLUMNS: - - ehi5_pref_read through ehi5_val_justice (15 columns) - - ehi10_pref_read through ehi10_val_justice (15 columns) - - ehi5.10_pref_read through ehi5.10_val_justice (15 columns) - -TARGET VARIABLES: - 5-Year Domain Means (3 variables): - - ehi5_pref_MEAN (mean of 5 preference items) - - ehi5_pers_MEAN (mean of 5 personality items) - - ehi5_val_MEAN (mean of 5 values items) - - 10-Year Domain Means (3 variables): - - ehi10_pref_MEAN - - ehi10_pers_MEAN - - ehi10_val_MEAN - - 5-10 Year Change Domain Means (3 variables): - - ehi5.10_pref_MEAN - - ehi5.10_pers_MEAN - - ehi5.10_val_MEAN - -TRANSFORMATION LOGIC: - Each domain mean = average of 5 items within that domain - - Example for ehi5_pref_MEAN: - = mean(ehi5_pref_read, ehi5_pref_music, ehi5_pref_TV, - ehi5_pref_nap, ehi5_pref_travel) - - NA values excluded from calculation (na.rm = TRUE) - -QUALITY ASSURANCE: - - Comprehensive QA for all 9 variables across all rows - - First 5 rows displayed for multiple domain means - - Pass/Fail status for each variable - - -================================================================================ -SCRIPT 14: datap 14 - all ehi global means.r -================================================================================ - -PURPOSE: - Calculates global EHI means by averaging domain-level means. Creates the - highest-level summary scores for EHI across both domain-general and - domain-specific measures. - -VARIABLES CREATED: 5 total - -SOURCE COLUMNS: - - ehiDGEN_5_Pref, ehiDGEN_5_Pers, ehiDGEN_5_Val - - ehiDGEN_10_Pref, ehiDGEN_10_Pers, ehiDGEN_10_Val - - ehi5_pref_MEAN, ehi5_pers_MEAN, ehi5_val_MEAN - - ehi10_pref_MEAN, ehi10_pers_MEAN, ehi10_val_MEAN - - ehi5.10_pref_MEAN, ehi5.10_pers_MEAN, ehi5.10_val_MEAN - -TARGET VARIABLES: - DGEN Global Means (2 variables): - - ehiDGEN_5_mean (mean of 3 DGEN domains for 5-year) - - ehiDGEN_10_mean (mean of 3 DGEN domains for 10-year) - - Domain-Specific Global Means (3 variables): - - ehi5_global_mean (mean of 3 domain means for 5-year) - - ehi10_global_mean (mean of 3 domain means for 10-year) - - ehi5.10_global_mean (mean of 3 domain means for 5-10 change) - -TRANSFORMATION LOGIC: - Each global mean = average of 3 domain-level scores - - Example for ehiDGEN_5_mean: - = mean(ehiDGEN_5_Pref, ehiDGEN_5_Pers, ehiDGEN_5_Val) - - Example for ehi5_global_mean: - = mean(ehi5_pref_MEAN, ehi5_pers_MEAN, ehi5_val_MEAN) - - NA values excluded from calculation (na.rm = TRUE) - -QUALITY ASSURANCE: - - QA for all 5 global means across all rows - - First 5 rows displayed with detailed calculations - - Values shown with 5 decimal precision - - Pass/Fail status for each variable - - -================================================================================ -SCRIPT 15: datap 15 - education recoded ordinal 3.r -================================================================================ - -PURPOSE: - Recodes raw education categories (`demo_edu`) into an ordered 3-level factor - for analyses requiring an ordinal education variable. - -VARIABLES CREATED: 1 total - -SOURCE COLUMNS: - - demo_edu - -TARGET VARIABLES: - - edu3 (ordered factor with 3 levels) - -TRANSFORMATION LOGIC: - Map `demo_edu` to 3 ordered levels and store as an ordered factor: - - "HS_TS": High School (or equivalent), Trade School (non-military) - - "C_Ug": College Diploma/Certificate, University - Undergraduate - - "grad_prof": University - Graduate (Masters), University - PhD, Professional Degree (ex. 
JD/MD) - - Levels and order: - edu3 = factor(edu3, levels = c("HS_TS", "C_Ug", "grad_prof"), ordered = TRUE) - -QUALITY ASSURANCE: - - Prints frequency table for `edu3` and a cross-tab of `demo_edu` × `edu3` to - verify correct mapping and absence of unintended NAs. - - Saves updated dataset to `eohi2.csv`. - - -================================================================================ -SCRIPT 16: datap 16 - ehi vars standardized .r -================================================================================ - -PURPOSE: - Standardizes key EHI summary variables (z-scores) and creates a composite - standardized EHI mean (`stdEHI_mean`) for use in correlational and regression - analyses. - -VARIABLES CREATED: 5 total - -SOURCE COLUMNS: - - ehiDGEN_5_mean, ehiDGEN_10_mean - - ehi5_global_mean, ehi10_global_mean - -TARGET VARIABLES: - - stdDGEN_5 = z(ehiDGEN_5_mean) - - stdDGEN_10 = z(ehiDGEN_10_mean) - - stdDS_5 = z(ehi5_global_mean) - - stdDS_10 = z(ehi10_global_mean) - - stdEHI_mean = mean(stdDGEN_5, stdDGEN_10, stdDS_5, stdDS_10), row-wise - -TRANSFORMATION LOGIC: - Standardize each source variable using sample mean and SD (na.rm = TRUE): - stdX = (X - mean(X)) / sd(X) - - Then compute row-wise average across the four standardized variables: - stdEHI_mean = rowMeans(cbind(stdDGEN_5, stdDGEN_10, stdDS_5, stdDS_10), - na.rm = TRUE) - -CHECKS/QA: - - Prints pre-standardization means/SDs and post-standardization means/SDs to - confirm ~0 mean and ~1 SD for each standardized variable (allowing for NAs). - - Spot-checks random rows by recomputing standardized values and comparing to - stored columns. - - Saves updated dataset to `eohi2.csv`.
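ILLUSTRATIVE EXAMPLE (a minimal R sketch of the standardization described above, not the script itself; the data frame name `df` is an assumption):

    # z-score helper matching the stated formula (sample mean/SD, NAs dropped)
    z <- function(x) (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)

    df$stdDGEN_5  <- z(df$ehiDGEN_5_mean)
    df$stdDGEN_10 <- z(df$ehiDGEN_10_mean)
    df$stdDS_5    <- z(df$ehi5_global_mean)
    df$stdDS_10   <- z(df$ehi10_global_mean)

    # Row-wise composite across the four standardized summaries
    df$stdEHI_mean <- rowMeans(cbind(df$stdDGEN_5, df$stdDGEN_10,
                                     df$stdDS_5, df$stdDS_10), na.rm = TRUE)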
================================================================================ -SUMMARY OF ALL CREATED VARIABLES -================================================================================ - -Total Variables Created: 296 - -By Script: - - Script 01: 60 variables (past/future recoded items) - - Script 02: 15 variables (present recoded items) - - Script 03: 12 variables (DGEN domain scores) - - Script 04: 4 variables (DGEN time period means) - - Script 05: 3 variables (AOT & CRT scales) - - Script 06: 90 variables (time interval differences) - - Script 07: 18 variables (domain means for differences) - - Script 08: 6 variables (DGEN 5-vs-10 differences) - - Script 09: 11 variables (interval × direction means) - - Script 10: 6 variables (DGEN combined means) - - Script 11: 45 variables (domain-specific EHI scores) - - Script 12: 6 variables (DGEN EHI scores) - - Script 13: 9 variables (EHI domain means) - - Script 14: 5 variables (EHI global means) - - Script 15: 1 variable (education ordinal factor) - - Script 16: 5 variables (standardized EHI summaries and composite) - -By Category: - - Time Period Items (75 total): - * Present: 15 items - * Past 5: 15 items - * Past 10: 15 items - * Future 5: 15 items - * Future 10: 15 items - - - DGEN Variables (28 total): - * Domain scores: 12 (3 domains × 4 time periods) - * Time period means: 4 (1 per time period) - * 5-vs-10 differences: 6 (3 domains × 2 directions) - * Combined means: 6 (past, future, interval-based, domain-based) - - - Cognitive Scales (3 total): - * AOT total - * CRT correct - * CRT intuitive - - - Time Differences (90 total): - * NPast_5: 15 differences - * NPast_10: 15 differences - * NFut_5: 15 differences - * NFut_10: 15 differences - * 5.10past: 15 differences - * 5.10fut: 15 differences - - - Domain Means for Differences (18 total): - * NPast_5: 3 domain means - * NPast_10: 3 domain means - * NFut_5: 3 domain means - * NFut_10: 3 domain means - * 5.10past: 3 domain means - * 5.10fut: 3 domain means - - - Interval × Direction Means (11 total): - * Narrow-scope means: 6 (NPast_5, NPast_10, NFut_5, NFut_10, - X5.10past, X5.10fut) - * Global-scope means: 5 (NPast_global, NFut_global, X5.10_global, - N5_global, N10_global) - - - EHI Variables (65 total): - * Domain-specific EHI: 45 (15 items × 3 time intervals) - * DGEN EHI: 6 (3 domains × 2 time intervals) - * Domain means: 9 (3 domains × 3 time intervals) - * Global means: 5 (2 DGEN + 3 domain-specific) - - Standardized EHI Variables (5 total): - * stdDGEN_5, stdDGEN_10, stdDS_5, stdDS_10, stdEHI_mean - - Demographics (1 total): - * edu3 (ordered 3-level education factor, Script 15) - - -================================================================================ -DATA PROCESSING NOTES -================================================================================ - -1. PROCESSING ORDER: - Scripts MUST be run in numerical order (01 → 16) as later scripts depend - on variables created by earlier scripts. - - Key Dependencies: - - Script 03 required before Script 04, 08, 10, 12 (DGEN scores) - - Script 04 required before Script 10 (DGEN time period means) - - Script 06 required before Script 07, 09, 11 (time interval differences) - - Script 11 required before Script 13 (domain-specific EHI items) - - Script 12 required before Script 14 (DGEN EHI scores) - - Script 13 required before Script 14 (EHI domain means) - - Script 14 required before Script 16 (EHI global means) - -2. SURVEY VERSION HANDLING: - - Two survey versions (01 and 02) were used - - Scripts 01 and 03 combine these versions - - Preference given to version 01 when both exist - -3. MISSING DATA: - - Empty cells and NA values are preserved throughout processing - - Calculations use na.rm=TRUE to exclude missing values from means - - Difference calculations result in NA if either source value is missing - -4. QUALITY ASSURANCE: - - Each script includes QA checks with row verification - - Manual calculation checks confirm proper transformations - - Column existence checks prevent errors from missing source data - - Scripts 09-14 include comprehensive QA with first 5 rows displayed - - All EHI scripts (11-14) verify calculations against stored values - - Pass/Fail status reported for all variables in QA-enabled scripts - -5. FILE SAVING: - - Most scripts save directly to eohi2.csv - - Scripts 04, 06, and 07 have commented-out write commands for review - - Scripts 08 and 09 save directly to eohi2.csv - - Each script overwrites existing target columns if present - -6. SPECIAL NAMING CONVENTIONS: - - "pref_tv" vs "pref_TV" inconsistency maintained from source data - - DGEN variables use underscores (DGEN_past_5_Pref) - - Difference variables use descriptive prefixes (NPast_5_, 5.10past_) - - "X" prefix added to variables starting with numbers (X5.10past_mean) - - Global means use "_global_" to distinguish from narrow-scope means - - -================================================================================ -ITEM REFERENCE GUIDE -================================================================================ - -15 Core Items (Used across all time periods): - -PREFERENCES (5 items): - 1. pref_read - Reading preferences - 2. pref_music - Music preferences - 3. pref_TV/tv - TV watching preferences (note case variation) - 4. pref_nap - Napping preferences - 5. pref_travel - Travel preferences - -PERSONALITY (5 items): - 6. pers_extravert - Extraverted personality - 7. pers_critical - Critical thinking personality - 8. pers_dependable - Dependable personality - 9. 
pers_anxious - Anxious personality - 10. pers_complex - Complex personality - -VALUES (5 items): - 11. val_obey - Value of obedience - 12. val_trad - Value of tradition - 13. val_opinion - Value of expressing opinions - 14. val_performance - Value of performance - 15. val_justice - Value of justice - - -================================================================================ -EHI CONCEPT AND INTERPRETATION -================================================================================ - -ENDURING HEDONIC IMPACT (EHI): - EHI measures the asymmetry between perceived past and future change in - psychological attributes. The concept is based on the premise that people - may perceive their past and future selves differently, even when considering - equivalent time distances. - -KEY EHI VARIABLES: - - Domain-Specific EHI (Scripts 11, 13, 14): - Calculated from item-level differences between past and future responses - Formula: NPast - NFut - * Positive values: Greater perceived change from past to present - * Negative values: Greater perceived change from present to future - * Zero: Symmetric perception of past and future change - - - Domain-General EHI (Scripts 12, 14): - Calculated from DGEN single-item responses - Formula: DGEN_past - DGEN_fut - * Measures broader temporal self-perception without item-level detail - -HIERARCHICAL STRUCTURE: - Level 1: Item-level EHI (45 domain-specific, 6 DGEN) - Level 2: Domain means (9 domain-specific, combining 5 items each) - Level 3: Global means (5 highest-level summaries) - -INTERPRETATION: - - EHI > 0: "Past asymmetry" - Person perceives greater change from past - - EHI < 0: "Future asymmetry" - Person perceives greater change to future - - EHI ≈ 0: "Temporal symmetry" - Balanced perception of past/future change - - -================================================================================ -END OF DOCUMENTATION -================================================================================ -Last Updated: October 8, 2025 -Processing Pipeline: Scripts 01-16 - diff --git a/.history/eohi2/README_Variable_Creation_20251029133342.txt b/.history/eohi2/README_Variable_Creation_20251029133342.txt deleted file mode 100644 index 2f3d00b..0000000 --- a/.history/eohi2/README_Variable_Creation_20251029133342.txt +++ /dev/null @@ -1,1047 +0,0 @@ -================================================================================ -EOHI2 DATA PROCESSING PIPELINE - VARIABLE CREATION DOCUMENTATION -================================================================================ - -This README documents the complete data processing pipeline for eohi2.csv. -All processing scripts should be run in the order listed below. - -Source File: eohi2.csv -Processing Scripts: dataP 01 through dataP 16 - -================================================================================ -SCRIPT 01: dataP 01 - recode and combine past & future vars.r -================================================================================ - -PURPOSE: - Combines responses from two survey versions (01 and 02) and recodes Likert - scale text responses to numeric values for past and future time periods.
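ILLUSTRATIVE EXAMPLE (a minimal R sketch of the combine-then-recode step, not the script itself; the data frame name `df` is an assumption, and pairing 01past5PrefItem_1 with past_5_pref_read follows the item order shown below but is an assumption here):

    likert_map <- c("Strongly Disagree" = -3, "Disagree" = -2,
                    "Somewhat Disagree" = -1, "Neither Agree nor Disagree" = 0,
                    "Somewhat Agree" = 1, "Agree" = 2, "Strongly Agree" = 3)

    a <- df[["01past5PrefItem_1"]]   # Set A (survey version 01)
    b <- df[["02past5PrefItem_1"]]   # Set B (survey version 02)

    combined <- ifelse(!is.na(a) & a != "", a, b)        # prefer Set A
    df$past_5_pref_read <- unname(likert_map[combined])  # unmatched/empty -> NA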
- -VARIABLES CREATED: 60 total (15 items × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefItem_1 through 01fut10ValItem_5 (60 columns) - - Set B: 02past5PrefItem_1 through 02fut10ValItem_5 (60 columns) - -TARGET VARIABLES: - Past 5 Years (15 variables): - - past_5_pref_read, past_5_pref_music, past_5_pref_TV, past_5_pref_nap, - past_5_pref_travel - - past_5_pers_extravert, past_5_pers_critical, past_5_pers_dependable, - past_5_pers_anxious, past_5_pers_complex - - past_5_val_obey, past_5_val_trad, past_5_val_opinion, - past_5_val_performance, past_5_val_justice - - Past 10 Years (15 variables): - - past_10_pref_read, past_10_pref_music, past_10_pref_TV, past_10_pref_nap, - past_10_pref_travel - - past_10_pers_extravert, past_10_pers_critical, past_10_pers_dependable, - past_10_pers_anxious, past_10_pers_complex - - past_10_val_obey, past_10_val_trad, past_10_val_opinion, - past_10_val_performance, past_10_val_justice - - Future 5 Years (15 variables): - - fut_5_pref_read, fut_5_pref_music, fut_5_pref_TV, fut_5_pref_nap, - fut_5_pref_travel - - fut_5_pers_extravert, fut_5_pers_critical, fut_5_pers_dependable, - fut_5_pers_anxious, fut_5_pers_complex - - fut_5_val_obey, fut_5_val_trad, fut_5_val_opinion, - fut_5_val_performance, fut_5_val_justice - - Future 10 Years (15 variables): - - fut_10_pref_read, fut_10_pref_music, fut_10_pref_TV, fut_10_pref_nap, - fut_10_pref_travel - - fut_10_pers_extravert, fut_10_pers_critical, fut_10_pers_dependable, - fut_10_pers_anxious, fut_10_pers_complex - - fut_10_val_obey, fut_10_val_trad, fut_10_val_opinion, - fut_10_val_performance, fut_10_val_justice - -TRANSFORMATION LOGIC: - Step 1: Combine responses from Set A (01) and Set B (02) - - If Set A has a value, use Set A - - If Set A is empty, use Set B - - Step 2: Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -ITEM DOMAINS: - - Preferences (pref): Reading, Music, TV, Nap, Travel - - Personality (pers): Extravert, Critical, Dependable, Anxious, Complex - - Values (val): Obey, Tradition, Opinion, Performance, Justice - - -================================================================================ -SCRIPT 02: dataP 02 - recode present VARS.r -================================================================================ - -PURPOSE: - Recodes present-time Likert scale text responses to numeric values. - -VARIABLES CREATED: 15 total - -SOURCE COLUMNS: - - prePrefItem_1 through prePrefItem_5 (5 columns) - - prePersItem_1 through prePersItem_5 (5 columns) - - preValItem_1 through preValItem_5 (5 columns) - -TARGET VARIABLES: - Present Time (15 variables): - - present_pref_read, present_pref_music, present_pref_tv, present_pref_nap, - present_pref_travel - - present_pers_extravert, present_pers_critical, present_pers_dependable, - present_pers_anxious, present_pers_complex - - present_val_obey, present_val_trad, present_val_opinion, - present_val_performance, present_val_justice - -TRANSFORMATION LOGIC: - Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -SPECIAL NOTE: - Present time uses "present_pref_tv" (lowercase) while past/future use - "past_5_pref_TV" (uppercase). 
This is intentional and preserved from the - original data structure. - - -================================================================================ -SCRIPT 03: dataP 03 - recode DGEN vars.r -================================================================================ - -PURPOSE: - Combines DGEN (domain general) responses from two survey versions (01 and 02). - These are single-item measures for each domain/time combination. - NO RECODING - just copies numeric values as-is. - -VARIABLES CREATED: 12 total (3 domains × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefDGEN_1, 01past5PersDGEN_1, 01past5ValDGEN_1, etc. - - Set B: 02past5PrefDGEN_1, 02past5PersDGEN_1, 02past5ValDGEN_1, etc. - -TARGET VARIABLES: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TRANSFORMATION LOGIC: - - If Set A (01) has a value, use Set A - - If Set A is empty, use Set B (02) - - NO RECODING: Values are copied directly as numeric - -SPECIAL NOTES: - - Future columns in raw data use "_8" suffix for Pref/Pers items - - Future Val columns use "ValuesDGEN" spelling in Set A, "ValDGEN" in Set B - - -================================================================================ -SCRIPT 04: dataP 04 - DGEN means.r -================================================================================ - -PURPOSE: - Calculates mean DGEN scores by averaging the three domain scores (Preferences, - Personality, Values) for each time period. - -VARIABLES CREATED: 4 total (1 per time period) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - - DGEN_past_5_mean - - DGEN_past_10_mean - - DGEN_fut_5_mean - - DGEN_fut_10_mean - -TRANSFORMATION LOGIC: - Each mean = (Pref + Pers + Val) / 3 - - NA values are excluded from calculation (na.rm = TRUE) - - -================================================================================ -SCRIPT 05: dataP 05 - recode scales VARS.r -================================================================================ - -PURPOSE: - Processes two cognitive scales: - 1. AOT (Actively Open-minded Thinking): 8-item scale with reverse coding - 2. CRT (Cognitive Reflection Test): 3-item test with correct/intuitive scoring - -VARIABLES CREATED: 3 total - -SOURCE COLUMNS: - AOT Scale: - - aot_1, aot_2, aot_3, aot_4, aot_5, aot_6, aot_7, aot_8 - - CRT Test: - - crt_1, crt_2, crt_3 - -TARGET VARIABLES: - - aot_total (mean of 8 items with reverse coding) - - crt_correct (proportion of correct answers) - - crt_int (proportion of intuitive/incorrect answers) - -TRANSFORMATION LOGIC: - - AOT Scale (aot_total): - 1. Items 4, 5, 6, 7 are reverse coded by multiplying by -1 - 2. Calculate mean of all 8 items (with reverse coding applied) - 3. Original source values are NOT modified in the dataframe - 4. 
NA values excluded from calculation (na.rm = TRUE) - - CRT Correct (crt_correct): - Correct answers: - - crt_1: "5 cents" - - crt_2: "5 minutes" - - crt_3: "47 days" - Calculation: (Number of correct answers) / (Number of non-missing answers) - - CRT Intuitive (crt_int): - Intuitive (common incorrect) answers: - - crt_1: "10 cents" - - crt_2: "100 minutes" - - crt_3: "24 days" - Calculation: (Number of intuitive answers) / (Number of non-missing answers) - -SPECIAL NOTES: - - CRT scoring is case-insensitive and trims whitespace - - Both CRT scores are proportions (0.00 to 1.00) - - Empty/missing CRT responses are excluded from denominator - - -================================================================================ -SCRIPT 06: dataP 06 - time interval differences.r -================================================================================ - -PURPOSE: - Calculates absolute differences between time intervals to measure perceived - change across time periods for all 15 items. - -VARIABLES CREATED: 90 total (6 difference types × 15 items) - -SOURCE COLUMNS: - - present_pref_read through present_val_justice (15 columns) - - past_5_pref_read through past_5_val_justice (15 columns) - - past_10_pref_read through past_10_val_justice (15 columns) - - fut_5_pref_read through fut_5_val_justice (15 columns) - - fut_10_pref_read through fut_10_val_justice (15 columns) - -TARGET VARIABLES (by difference type): - - NPast_5 (Present vs Past 5 years) - 15 variables: - Formula: |present - past_5| - - NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, NPast_5_pref_nap, - NPast_5_pref_travel - - NPast_5_pers_extravert, NPast_5_pers_critical, NPast_5_pers_dependable, - NPast_5_pers_anxious, NPast_5_pers_complex - - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice - - NPast_10 (Present vs Past 10 years) - 15 variables: - Formula: |present - past_10| - - NPast_10_pref_read, NPast_10_pref_music, NPast_10_pref_TV, - NPast_10_pref_nap, NPast_10_pref_travel - - NPast_10_pers_extravert, NPast_10_pers_critical, NPast_10_pers_dependable, - NPast_10_pers_anxious, NPast_10_pers_complex - - NPast_10_val_obey, NPast_10_val_trad, NPast_10_val_opinion, - NPast_10_val_performance, NPast_10_val_justice - - NFut_5 (Present vs Future 5 years) - 15 variables: - Formula: |present - fut_5| - - NFut_5_pref_read, NFut_5_pref_music, NFut_5_pref_TV, NFut_5_pref_nap, - NFut_5_pref_travel - - NFut_5_pers_extravert, NFut_5_pers_critical, NFut_5_pers_dependable, - NFut_5_pers_anxious, NFut_5_pers_complex - - NFut_5_val_obey, NFut_5_val_trad, NFut_5_val_opinion, - NFut_5_val_performance, NFut_5_val_justice - - NFut_10 (Present vs Future 10 years) - 15 variables: - Formula: |present - fut_10| - - NFut_10_pref_read, NFut_10_pref_music, NFut_10_pref_TV, NFut_10_pref_nap, - NFut_10_pref_travel - - NFut_10_pers_extravert, NFut_10_pers_critical, NFut_10_pers_dependable, - NFut_10_pers_anxious, NFut_10_pers_complex - - NFut_10_val_obey, NFut_10_val_trad, NFut_10_val_opinion, - NFut_10_val_performance, NFut_10_val_justice - - 5.10past (Past 5 vs Past 10 years) - 15 variables: - Formula: |past_5 - past_10| - - 5.10past_pref_read, 5.10past_pref_music, 5.10past_pref_TV, - 5.10past_pref_nap, 5.10past_pref_travel - - 5.10past_pers_extravert, 5.10past_pers_critical, 5.10past_pers_dependable, - 5.10past_pers_anxious, 5.10past_pers_complex - - 5.10past_val_obey, 5.10past_val_trad, 5.10past_val_opinion, - 5.10past_val_performance, 5.10past_val_justice - - 5.10fut (Future 5 vs Future 
10 years) - 15 variables: - Formula: |fut_5 - fut_10| - - 5.10fut_pref_read, 5.10fut_pref_music, 5.10fut_pref_TV, 5.10fut_pref_nap, - 5.10fut_pref_travel - - 5.10fut_pers_extravert, 5.10fut_pers_critical, 5.10fut_pers_dependable, - 5.10fut_pers_anxious, 5.10fut_pers_complex - - 5.10fut_val_obey, 5.10fut_val_trad, 5.10fut_val_opinion, - 5.10fut_val_performance, 5.10fut_val_justice - -TRANSFORMATION LOGIC: - All calculations use absolute differences: - - NPast_5: |present_[item] - past_5_[item]| - - NPast_10: |present_[item] - past_10_[item]| - - NFut_5: |present_[item] - fut_5_[item]| - - NFut_10: |present_[item] - fut_10_[item]| - - 5.10past: |past_5_[item] - past_10_[item]| - - 5.10fut: |fut_5_[item] - fut_10_[item]| - - Result: Always positive values representing magnitude of change - Missing values in either source column result in NA - -SPECIAL NOTES: - - Present time uses "pref_tv" (lowercase) while past/future use "pref_TV" - (uppercase), so script handles this naming inconsistency - - All values are absolute differences (non-negative) - - -================================================================================ -SCRIPT 07: dataP 07 - domain means.r -================================================================================ - -PURPOSE: - Calculates domain-level means by averaging the 5 items within each domain - (Preferences, Personality, Values) for each of the 6 time interval difference - types. - -VARIABLES CREATED: 18 total (6 time intervals × 3 domains) - -SOURCE COLUMNS: - - NPast_5_pref_read through NPast_5_val_justice (15 columns) - - NPast_10_pref_read through NPast_10_val_justice (15 columns) - - NFut_5_pref_read through NFut_5_val_justice (15 columns) - - NFut_10_pref_read through NFut_10_val_justice (15 columns) - - 5.10past_pref_read through 5.10past_val_justice (15 columns) - - 5.10fut_pref_read through 5.10fut_val_justice (15 columns) - Total: 90 difference columns (created in Script 06) - -TARGET VARIABLES: - NPast_5 Domain Means (3 variables): - - NPast_5_pref_MEAN (mean of 5 preference items) - - NPast_5_pers_MEAN (mean of 5 personality items) - - NPast_5_val_MEAN (mean of 5 values items) - - NPast_10 Domain Means (3 variables): - - NPast_10_pref_MEAN - - NPast_10_pers_MEAN - - NPast_10_val_MEAN - - NFut_5 Domain Means (3 variables): - - NFut_5_pref_MEAN - - NFut_5_pers_MEAN - - NFut_5_val_MEAN - - NFut_10 Domain Means (3 variables): - - NFut_10_pref_MEAN - - NFut_10_pers_MEAN - - NFut_10_val_MEAN - - 5.10past Domain Means (3 variables): - - 5.10past_pref_MEAN - - 5.10past_pers_MEAN - - 5.10past_val_MEAN - - 5.10fut Domain Means (3 variables): - - 5.10fut_pref_MEAN - - 5.10fut_pers_MEAN - - 5.10fut_val_MEAN - -TRANSFORMATION LOGIC: - Each domain mean = average of 5 items within that domain - - Example for NPast_5_pref_MEAN: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel) - - Example for NFut_10_pers_MEAN: - = mean(NFut_10_pers_extravert, NFut_10_pers_critical, - NFut_10_pers_dependable, NFut_10_pers_anxious, - NFut_10_pers_complex) - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF DOMAIN MEANS: - - Provides higher-level summary of perceived change by domain - - Reduces item-level noise by aggregating across related items - - Enables domain-level comparisons across time intervals - - Parallel to Script 04 (DGEN means) but for difference scores instead of - raw DGEN ratings - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - Creates domain-level aggregates 
of absolute difference scores - - All means are averages of non-negative values (absolute differences) - - -================================================================================ -SCRIPT 08: dataP 08 - DGEN 510 vars.r -================================================================================ - -PURPOSE: - Calculates absolute differences between 5-year and 10-year DGEN ratings for - both Past and Future time directions. These variables measure the perceived - difference in domain-general change between the two time intervals. - -VARIABLES CREATED: 6 total (3 domains × 2 time directions) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - Total: 12 DGEN columns (created in Script 03) - -TARGET VARIABLES: - Past Direction (3 variables): - - X5_10DGEN_past_pref (|DGEN_past_5_Pref - DGEN_past_10_Pref|) - - X5_10DGEN_past_pers (|DGEN_past_5_Pers - DGEN_past_10_Pers|) - - X5_10DGEN_past_val (|DGEN_past_5_Val - DGEN_past_10_Val|) - - Future Direction (3 variables): - - X5_10DGEN_fut_pref (|DGEN_fut_5_Pref - DGEN_fut_10_Pref|) - - X5_10DGEN_fut_pers (|DGEN_fut_5_Pers - DGEN_fut_10_Pers|) - - X5_10DGEN_fut_val (|DGEN_fut_5_Val - DGEN_fut_10_Val|) - -TRANSFORMATION LOGIC: - Formula: |DGEN_5 - DGEN_10| - - All calculations use absolute differences: - - Past Preferences: |DGEN_past_5_Pref - DGEN_past_10_Pref| - - Past Personality: |DGEN_past_5_Pers - DGEN_past_10_Pers| - - Past Values: |DGEN_past_5_Val - DGEN_past_10_Val| - - Future Preferences: |DGEN_fut_5_Pref - DGEN_fut_10_Pref| - - Future Personality: |DGEN_fut_5_Pers - DGEN_fut_10_Pers| - - Future Values: |DGEN_fut_5_Val - DGEN_fut_10_Val| - - Result: Always positive values representing magnitude of difference - Missing values in either source column result in NA - -SPECIAL NOTES: - - Variable names use "X" prefix because R automatically adds it to column - names starting with numbers (5_10 becomes X5_10) - - This script depends on Script 03 being run first - - Measures interval effects within time direction (past vs future) - - Parallel to Script 06's 5.10past and 5.10fut variables but for DGEN scores - - -================================================================================ -SCRIPT 09: dataP 09 - interval x direction means.r -================================================================================ - -PURPOSE: - Calculates comprehensive mean scores by averaging item-level differences - across intervals and directions. Creates both narrow-scope means (single - time interval) and broad-scope global means (combining multiple intervals). 
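NOTE ON THE "X" PREFIX (illustrative; a fact of base R rather than of these scripts): read.csv() applies make.names() to column names by default (check.names = TRUE), which prepends "X" to any name beginning with a digit. That is why the Script 06 and Script 08 names reappear below with an X:

    make.names(c("5_10DGEN_past_pref", "5.10past_pref_read"))
    #> [1] "X5_10DGEN_past_pref" "X5.10past_pref_read"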
- -VARIABLES CREATED: 11 total (6 narrow-scope + 5 global-scope) - -SOURCE COLUMNS: - All 90 difference variables created in Script 06: - - NPast_5_[domain]_[item] (15 variables) - - NPast_10_[domain]_[item] (15 variables) - - NFut_5_[domain]_[item] (15 variables) - - NFut_10_[domain]_[item] (15 variables) - - X5.10past_[domain]_[item] (15 variables) - - X5.10fut_[domain]_[item] (15 variables) - -TARGET VARIABLES: - - Narrow-Scope Means (15 source items each): - - NPast_5_mean (mean across all 15 NPast_5 items) - - NPast_10_mean (mean across all 15 NPast_10 items) - - NFut_5_mean (mean across all 15 NFut_5 items) - - NFut_10_mean (mean across all 15 NFut_10 items) - - X5.10past_mean (mean across all 15 X5.10past items) - - X5.10fut_mean (mean across all 15 X5.10fut items) - - Global-Scope Means (30 source items each): - - NPast_global_mean (NPast_5 + NPast_10: all past intervals) - - NFut_global_mean (NFut_5 + NFut_10: all future intervals) - - X5.10_global_mean (X5.10past + X5.10fut: all 5-vs-10 intervals) - - N5_global_mean (NPast_5 + NFut_5: all 5-year intervals) - - N10_global_mean (NPast_10 + NFut_10: all 10-year intervals) - -TRANSFORMATION LOGIC: - - Narrow-Scope Means (15 items each): - Each mean averages all 15 difference items within one time interval - - Example for NPast_5_mean: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel, - NPast_5_pers_extravert, NPast_5_pers_critical, - NPast_5_pers_dependable, NPast_5_pers_anxious, - NPast_5_pers_complex, - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice) - - Global-Scope Means (30 items each): - Each mean averages 30 difference items across two related intervals - - Example for NPast_global_mean: - = mean(all 15 NPast_5 items + all 15 NPast_10 items) - Represents overall perceived change from present to any past timepoint - - Example for N5_global_mean: - = mean(all 15 NPast_5 items + all 15 NFut_5 items) - Represents overall perceived change at 5-year interval regardless of - direction - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF INTERVAL × DIRECTION MEANS: - - Narrow-scope means: Single-interval summaries across all domains and items - - Global-scope means: Cross-interval summaries for testing: - * Direction effects (past vs future) - * Interval effects (5-year vs 10-year) - * Combined temporal distance effects - - Enables comprehensive analysis of temporal self-perception patterns - - Reduces item-level and domain-level noise through broad aggregation - -QUALITY ASSURANCE: - - Script includes automated QA checks for first 5 rows - - Manually recalculates each mean and verifies against stored values - - Prints TRUE/FALSE match status for each variable - - Ensures calculation accuracy before further analysis - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - All means are averages of absolute difference scores (non-negative) - - Global means provide the broadest temporal self-perception summaries - - Naming convention uses "global" for 30-item means, no suffix for 15-item - - -================================================================================ -SCRIPT 10: dataP 10 - DGEN mean vars.r -================================================================================ - -PURPOSE: - Calculates mean DGEN scores by averaging across different time combinations. - Creates means for Past, Future, and interval-based (5-year, 10-year) groupings. 
- -VARIABLES CREATED: 6 total - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - - DGEN_past_5_mean, DGEN_past_10_mean, DGEN_fut_5_mean, DGEN_fut_10_mean - (created in Script 04; used for the direction- and interval-based means) - -TARGET VARIABLES: - Direction-Based Means (2 variables): - - DGEN_past_mean (mean of past_5_mean and past_10_mean) - - DGEN_fut_mean (mean of fut_5_mean and fut_10_mean) - - Interval-Based Means (2 variables): - - DGEN_5_mean (mean of past_5_mean and fut_5_mean) - - DGEN_10_mean (mean of past_10_mean and fut_10_mean) - - Domain-Based Means (2 variables): - - DGEN_pref_mean (mean across all 4 time periods for Preferences) - - DGEN_pers_mean (mean across all 4 time periods for Personality) - -TRANSFORMATION LOGIC: - Direction-based: - - DGEN_past_mean = mean(DGEN_past_5_mean, DGEN_past_10_mean) - - DGEN_fut_mean = mean(DGEN_fut_5_mean, DGEN_fut_10_mean) - - Interval-based: - - DGEN_5_mean = mean(DGEN_past_5_mean, DGEN_fut_5_mean) - - DGEN_10_mean = mean(DGEN_past_10_mean, DGEN_fut_10_mean) - - Domain-based: - - DGEN_pref_mean = mean across all 4 Pref scores - - DGEN_pers_mean = mean across all 4 Pers scores - - NA values excluded from calculation (na.rm = TRUE) - - -================================================================================ -SCRIPT 11: dataP 11 - CORRECT ehi vars.r -================================================================================ - -PURPOSE: - Creates Enduring Hedonic Impact (EHI) variables by calculating differences - between Past and Future responses for each item across different time intervals. - Formula: NPast - NFut (positive values indicate greater past-present change) - -VARIABLES CREATED: 45 total (15 items × 3 time intervals) - -SOURCE COLUMNS: - 5-year intervals: - - NPast_5_pref_read through NPast_5_val_justice (15 columns) - - NFut_5_pref_read through NFut_5_val_justice (15 columns) - - 10-year intervals: - - NPast_10_pref_read through NPast_10_val_justice (15 columns) - - NFut_10_pref_read through NFut_10_val_justice (15 columns) - - 5-10 year change: - - X5.10past_pref_read through X5.10past_val_justice (15 columns) - - X5.10fut_pref_read through X5.10fut_val_justice (15 columns) - -TARGET VARIABLES: - 5-Year EHI Variables (15 variables): - - ehi5_pref_read, ehi5_pref_music, ehi5_pref_TV, ehi5_pref_nap, - ehi5_pref_travel - - ehi5_pers_extravert, ehi5_pers_critical, ehi5_pers_dependable, - ehi5_pers_anxious, ehi5_pers_complex - - ehi5_val_obey, ehi5_val_trad, ehi5_val_opinion, ehi5_val_performance, - ehi5_val_justice - - 10-Year EHI Variables (15 variables): - - ehi10_pref_read, ehi10_pref_music, ehi10_pref_TV, ehi10_pref_nap, - ehi10_pref_travel - - ehi10_pers_extravert, ehi10_pers_critical, ehi10_pers_dependable, - ehi10_pers_anxious, ehi10_pers_complex - - ehi10_val_obey, ehi10_val_trad, ehi10_val_opinion, ehi10_val_performance, - ehi10_val_justice - - 5-10 Year Change EHI Variables (15 variables): - - ehi5.10_pref_read, ehi5.10_pref_music, ehi5.10_pref_TV, ehi5.10_pref_nap, - ehi5.10_pref_travel - - ehi5.10_pers_extravert, ehi5.10_pers_critical, ehi5.10_pers_dependable, - ehi5.10_pers_anxious, ehi5.10_pers_complex - - ehi5.10_val_obey, ehi5.10_val_trad, ehi5.10_val_opinion, - ehi5.10_val_performance, ehi5.10_val_justice - -TRANSFORMATION LOGIC: - Formula: NPast - NFut - - All calculations use signed differences: - - ehi5_[item] = NPast_5_[item] - NFut_5_[item] - - ehi10_[item] = NPast_10_[item] - NFut_10_[item] - - ehi5.10_[item] = X5.10past_[item] - 
X5.10fut_[item] - - Result: Positive = greater past change, Negative = greater future change - Missing values in either source column result in NA - -QUALITY ASSURANCE: - - Comprehensive QA checks for all 45 variables across all rows - - First 5 rows displayed with detailed calculations showing source values, - computed differences, and stored values - - Pass/Fail status for each variable reported - - -================================================================================ -SCRIPT 12: dataP 12 - CORRECT DGEN ehi vars.r -================================================================================ - -PURPOSE: - Creates domain-general EHI variables by calculating differences between Past - and Future DGEN responses. These are the domain-general parallel to Script 11's - domain-specific EHI variables. - -VARIABLES CREATED: 6 total (3 domains × 2 time intervals) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - 5-Year DGEN EHI (3 variables): - - ehiDGEN_5_Pref - - ehiDGEN_5_Pers - - ehiDGEN_5_Val - - 10-Year DGEN EHI (3 variables): - - ehiDGEN_10_Pref - - ehiDGEN_10_Pers - - ehiDGEN_10_Val - -TRANSFORMATION LOGIC: - Formula: DGEN_past - DGEN_fut - - All calculations use signed differences: - - ehiDGEN_5_Pref = DGEN_past_5_Pref - DGEN_fut_5_Pref - - ehiDGEN_5_Pers = DGEN_past_5_Pers - DGEN_fut_5_Pers - - ehiDGEN_5_Val = DGEN_past_5_Val - DGEN_fut_5_Val - - ehiDGEN_10_Pref = DGEN_past_10_Pref - DGEN_fut_10_Pref - - ehiDGEN_10_Pers = DGEN_past_10_Pers - DGEN_fut_10_Pers - - ehiDGEN_10_Val = DGEN_past_10_Val - DGEN_fut_10_Val - - Result: Positive = greater past change, Negative = greater future change - -QUALITY ASSURANCE: - - QA checks for all 6 variables across all rows - - First 5 rows displayed with detailed calculations - - Pass/Fail status for each variable reported - - -================================================================================ -SCRIPT 13: datap 13 - ehi domain specific means.r -================================================================================ - -PURPOSE: - Calculates domain-level mean EHI scores by averaging the 5 items within each - domain (Preferences, Personality, Values) for each time interval. 
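ILLUSTRATIVE EXAMPLE (a minimal R sketch; the inputs averaged by this script are the signed differences from Scripts 11-12 above, which reduce to simple column subtraction; `df` is an assumed data frame):

    # Script 11 style: item-level EHI (signed; NA if either side is missing)
    df$ehi5_pref_read <- df$NPast_5_pref_read - df$NFut_5_pref_read

    # Script 12 style: domain-general EHI
    df$ehiDGEN_5_Pref <- df$DGEN_past_5_Pref - df$DGEN_fut_5_Pref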
-
-
-================================================================================
-SCRIPT 12: dataP 12 - CORRECT DGEN ehi vars.r
-================================================================================
-
-PURPOSE:
-  Creates domain-general EHI variables by calculating differences between
-  Past and Future DGEN responses. These are the domain-general parallel to
-  Script 11's domain-specific EHI variables.
-
-VARIABLES CREATED: 6 total (3 domains × 2 time intervals)
-
-SOURCE COLUMNS:
-  - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val
-  - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val
-  - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val
-  - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val
-
-TARGET VARIABLES:
-  5-Year DGEN EHI (3 variables):
-  - ehiDGEN_5_Pref
-  - ehiDGEN_5_Pers
-  - ehiDGEN_5_Val
-
-  10-Year DGEN EHI (3 variables):
-  - ehiDGEN_10_Pref
-  - ehiDGEN_10_Pers
-  - ehiDGEN_10_Val
-
-TRANSFORMATION LOGIC:
-  Formula: DGEN_past - DGEN_fut
-
-  All calculations use signed differences:
-  - ehiDGEN_5_Pref = DGEN_past_5_Pref - DGEN_fut_5_Pref
-  - ehiDGEN_5_Pers = DGEN_past_5_Pers - DGEN_fut_5_Pers
-  - ehiDGEN_5_Val = DGEN_past_5_Val - DGEN_fut_5_Val
-  - ehiDGEN_10_Pref = DGEN_past_10_Pref - DGEN_fut_10_Pref
-  - ehiDGEN_10_Pers = DGEN_past_10_Pers - DGEN_fut_10_Pers
-  - ehiDGEN_10_Val = DGEN_past_10_Val - DGEN_fut_10_Val
-
-  Result: Positive = greater past change, Negative = greater future change
-
-QUALITY ASSURANCE:
-  - QA checks for all 6 variables across all rows
-  - First 5 rows displayed with detailed calculations
-  - Pass/Fail status for each variable reported
-
-
-================================================================================
-SCRIPT 13: datap 13 - ehi domain specific means.r
-================================================================================
-
-PURPOSE:
-  Calculates domain-level mean EHI scores by averaging the 5 items within
-  each domain (Preferences, Personality, Values) for each time interval.
-
-VARIABLES CREATED: 9 total (3 domains × 3 time intervals)
-
-SOURCE COLUMNS:
-  - ehi5_pref_read through ehi5_val_justice (15 columns)
-  - ehi10_pref_read through ehi10_val_justice (15 columns)
-  - ehi5.10_pref_read through ehi5.10_val_justice (15 columns)
-
-TARGET VARIABLES:
-  5-Year Domain Means (3 variables):
-  - ehi5_pref_MEAN (mean of 5 preference items)
-  - ehi5_pers_MEAN (mean of 5 personality items)
-  - ehi5_val_MEAN (mean of 5 values items)
-
-  10-Year Domain Means (3 variables):
-  - ehi10_pref_MEAN
-  - ehi10_pers_MEAN
-  - ehi10_val_MEAN
-
-  5-10 Year Change Domain Means (3 variables):
-  - ehi5.10_pref_MEAN
-  - ehi5.10_pers_MEAN
-  - ehi5.10_val_MEAN
-
-TRANSFORMATION LOGIC:
-  Each domain mean = average of the 5 items within that domain
-
-  Example for ehi5_pref_MEAN:
-    = mean(ehi5_pref_read, ehi5_pref_music, ehi5_pref_TV,
-           ehi5_pref_nap, ehi5_pref_travel)
-
-  NA values excluded from calculation (na.rm = TRUE)
-
-QUALITY ASSURANCE:
-  - Comprehensive QA for all 9 variables across all rows
-  - First 5 rows displayed for multiple domain means
-  - Pass/Fail status for each variable
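-
-  ILLUSTRATIVE R SKETCH (not the script itself; assumes a data frame `df`):
-
-    # One of the nine domain means; the others follow the same pattern
-    pref5 <- c("ehi5_pref_read", "ehi5_pref_music", "ehi5_pref_TV",
-               "ehi5_pref_nap", "ehi5_pref_travel")
-    df$ehi5_pref_MEAN <- rowMeans(df[, pref5], na.rm = TRUE)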
-
-
-================================================================================
-SCRIPT 14: datap 14 - all ehi global means.r
-================================================================================
-
-PURPOSE:
-  Calculates global EHI means by averaging domain-level means. Creates the
-  highest-level summary scores for EHI across both domain-general and
-  domain-specific measures.
-
-VARIABLES CREATED: 5 total
-
-SOURCE COLUMNS:
-  - ehiDGEN_5_Pref, ehiDGEN_5_Pers, ehiDGEN_5_Val
-  - ehiDGEN_10_Pref, ehiDGEN_10_Pers, ehiDGEN_10_Val
-  - ehi5_pref_MEAN, ehi5_pers_MEAN, ehi5_val_MEAN
-  - ehi10_pref_MEAN, ehi10_pers_MEAN, ehi10_val_MEAN
-  - ehi5.10_pref_MEAN, ehi5.10_pers_MEAN, ehi5.10_val_MEAN
-
-TARGET VARIABLES:
-  DGEN Global Means (2 variables):
-  - ehiDGEN_5_mean (mean of 3 DGEN domains for 5-year)
-  - ehiDGEN_10_mean (mean of 3 DGEN domains for 10-year)
-
-  Domain-Specific Global Means (3 variables):
-  - ehi5_global_mean (mean of 3 domain means for 5-year)
-  - ehi10_global_mean (mean of 3 domain means for 10-year)
-  - ehi5.10_global_mean (mean of 3 domain means for 5-10 change)
-
-TRANSFORMATION LOGIC:
-  Each global mean = average of 3 domain-level scores
-
-  Example for ehiDGEN_5_mean:
-    = mean(ehiDGEN_5_Pref, ehiDGEN_5_Pers, ehiDGEN_5_Val)
-
-  Example for ehi5_global_mean:
-    = mean(ehi5_pref_MEAN, ehi5_pers_MEAN, ehi5_val_MEAN)
-
-  NA values excluded from calculation (na.rm = TRUE)
-
-QUALITY ASSURANCE:
-  - QA for all 5 global means across all rows
-  - First 5 rows displayed with detailed calculations
-  - Values shown with 5-decimal precision
-  - Pass/Fail status for each variable
-
-
-================================================================================
-SCRIPT 15: datap 15 - education recoded ordinal 3.r
-================================================================================
-
-PURPOSE:
-  Recodes raw education categories (`demo_edu`) into an ordered 3-level
-  factor for analyses requiring an ordinal education variable.
-
-VARIABLES CREATED: 1 total
-
-SOURCE COLUMNS:
-  - demo_edu
-
-TARGET VARIABLES:
-  - edu3 (ordered factor with 3 levels)
-
-TRANSFORMATION LOGIC:
-  Map `demo_edu` to 3 ordered levels and store as an ordered factor:
-  - "HS_TS": High School (or equivalent), Trade School (non-military)
-  - "C_Ug": College Diploma/Certificate, University - Undergraduate
-  - "grad_prof": University - Graduate (Masters), University - PhD,
-    Professional Degree (ex. JD/MD)
-
-  Levels and order:
-    edu3 = factor(edu3, levels = c("HS_TS", "C_Ug", "grad_prof"),
-                  ordered = TRUE)
-
-QUALITY ASSURANCE:
-  - Prints frequency table for `edu3` and a cross-tab of `demo_edu` × `edu3`
-    to verify correct mapping and absence of unintended NAs.
-  - Saves updated dataset to `eohi2.csv`.
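-
-  ILLUSTRATIVE R SKETCH (not the script itself; the raw category labels
-  below are assumed to match the wording in this README):
-
-    hs <- c("High School (or equivalent)", "Trade School (non-military)")
-    ug <- c("College Diploma/Certificate", "University - Undergraduate")
-    gp <- c("University - Graduate (Masters)", "University - PhD",
-            "Professional Degree (ex. JD/MD)")
-    df$edu3 <- ifelse(df$demo_edu %in% hs, "HS_TS",
-               ifelse(df$demo_edu %in% ug, "C_Ug",
-               ifelse(df$demo_edu %in% gp, "grad_prof", NA)))
-    df$edu3 <- factor(df$edu3, levels = c("HS_TS", "C_Ug", "grad_prof"),
-                      ordered = TRUE)
-    table(df$demo_edu, df$edu3, useNA = "ifany")  # QA cross-tab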
-
-
-================================================================================
-SCRIPT 16: datap 16 - ehi vars standardized .r
-================================================================================
-
-PURPOSE:
-  Standardizes key EHI summary variables (z-scores) and creates a composite
-  standardized EHI mean (`stdEHI_mean`) for use in correlational and
-  regression analyses.
-
-VARIABLES CREATED: 5 total
-
-SOURCE COLUMNS:
-  - ehiDGEN_5_mean, ehiDGEN_10_mean
-  - ehi5_global_mean, ehi10_global_mean
-
-TARGET VARIABLES:
-  - stdDGEN_5 = z(ehiDGEN_5_mean)
-  - stdDGEN_10 = z(ehiDGEN_10_mean)
-  - stdDS_5 = z(ehi5_global_mean)
-  - stdDS_10 = z(ehi10_global_mean)
-  - stdEHI_mean = mean(stdDGEN_5, stdDGEN_10, stdDS_5, stdDS_10), row-wise
-
-TRANSFORMATION LOGIC:
-  Standardize each source variable using the sample mean and SD
-  (na.rm = TRUE):
-    stdX = (X - mean(X)) / sd(X)
-
-  Then compute the row-wise average across the four standardized variables:
-    stdEHI_mean = rowMeans(cbind(stdDGEN_5, stdDGEN_10, stdDS_5, stdDS_10),
-                           na.rm = TRUE)
-
-CHECKS/QA:
-  - Prints pre- and post-standardization means/SDs to confirm ~0 mean and
-    ~1 SD for each standardized variable (allowing for NAs).
-  - Spot-checks random rows by recomputing standardized values and comparing
-    them to the stored columns.
-  - Saves updated dataset to `eohi2.csv`.
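-
-  ILLUSTRATIVE R SKETCH (not the script itself; assumes a data frame `df`):
-
-    z <- function(x) (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
-    df$stdDGEN_5  <- z(df$ehiDGEN_5_mean)
-    df$stdDGEN_10 <- z(df$ehiDGEN_10_mean)
-    df$stdDS_5    <- z(df$ehi5_global_mean)
-    df$stdDS_10   <- z(df$ehi10_global_mean)
-    # Row-wise composite across the four standardized summaries
-    df$stdEHI_mean <- rowMeans(
-      df[, c("stdDGEN_5", "stdDGEN_10", "stdDS_5", "stdDS_10")],
-      na.rm = TRUE)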
-
-
-================================================================================
-SUMMARY OF ALL CREATED VARIABLES
-================================================================================
-
-Total Variables Created: 296
-
-By Script:
-  - Script 01: 60 variables (past/future recoded items)
-  - Script 02: 15 variables (present recoded items)
-  - Script 03: 12 variables (DGEN domain scores)
-  - Script 04: 4 variables (DGEN time period means)
-  - Script 05: 3 variables (AOT & CRT scales)
-  - Script 06: 90 variables (time interval differences)
-  - Script 07: 18 variables (domain means for differences)
-  - Script 08: 6 variables (DGEN 5-vs-10 differences)
-  - Script 09: 11 variables (interval × direction means)
-  - Script 10: 6 variables (DGEN combined means)
-  - Script 11: 45 variables (domain-specific EHI scores)
-  - Script 12: 6 variables (DGEN EHI scores)
-  - Script 13: 9 variables (EHI domain means)
-  - Script 14: 5 variables (EHI global means)
-  - Script 15: 1 variable (education ordinal factor)
-  - Script 16: 5 variables (standardized EHI summaries and composite)
-
-By Category:
-  - Time Period Items (75 total):
-    * Present: 15 items
-    * Past 5: 15 items
-    * Past 10: 15 items
-    * Future 5: 15 items
-    * Future 10: 15 items
-
-  - DGEN Variables (28 total):
-    * Domain scores: 12 (3 domains × 4 time periods)
-    * Time period means: 4 (1 per time period)
-    * 5-vs-10 differences: 6 (3 domains × 2 directions)
-    * Combined means: 6 (past, future, interval-based, domain-based)
-
-  - Cognitive Scales (3 total):
-    * AOT total
-    * CRT correct
-    * CRT intuitive
-
-  - Time Differences (90 total):
-    * NPast_5: 15 differences
-    * NPast_10: 15 differences
-    * NFut_5: 15 differences
-    * NFut_10: 15 differences
-    * 5.10past: 15 differences
-    * 5.10fut: 15 differences
-
-  - Domain Means for Differences (18 total):
-    * NPast_5: 3 domain means
-    * NPast_10: 3 domain means
-    * NFut_5: 3 domain means
-    * NFut_10: 3 domain means
-    * 5.10past: 3 domain means
-    * 5.10fut: 3 domain means
-
-  - Interval × Direction Means (11 total):
-    * Narrow-scope means: 6 (NPast_5, NPast_10, NFut_5, NFut_10,
-      X5.10past, X5.10fut)
-    * Global-scope means: 5 (NPast_global, NFut_global, X5.10_global,
-      N5_global, N10_global)
-
-  - EHI Variables (65 total):
-    * Domain-specific EHI: 45 (15 items × 3 time intervals)
-    * DGEN EHI: 6 (3 domains × 2 time intervals)
-    * Domain means: 9 (3 domains × 3 time intervals)
-    * Global means: 5 (2 DGEN + 3 domain-specific)
-
-  - Standardized EHI Variables (5 total):
-    * stdDGEN_5, stdDGEN_10, stdDS_5, stdDS_10, stdEHI_mean
-
-  - Demographics (1 total):
-    * edu3 (ordinal education factor, Script 15)
-
-
-================================================================================
-DATA PROCESSING NOTES
-================================================================================
-
-1. PROCESSING ORDER:
-   Scripts MUST be run in numerical order (01 → 16), as later scripts depend
-   on variables created by earlier scripts.
-
-   Key Dependencies:
-   - Script 03 required before Scripts 04, 08, 10, 12 (DGEN scores)
-   - Script 04 required before Script 10 (DGEN time period means)
-   - Script 06 required before Scripts 07, 09, 11 (time interval differences)
-   - Script 11 required before Script 13 (domain-specific EHI items)
-   - Script 12 required before Script 14 (DGEN EHI scores)
-   - Script 13 required before Script 14 (EHI domain means)
-   - Script 14 required before Script 16 (uses ehiDGEN_5/10_mean and
-     ehi5/10_global_mean)
-   - Script 15 can run any time after raw `demo_edu` is present; run it
-     before analyses needing `edu3`
-
-2. SURVEY VERSION HANDLING:
-   - Two survey versions (01 and 02) were used
-   - Scripts 01 and 03 combine these versions
-   - Preference given to version 01 when both exist
-
-3. MISSING DATA:
-   - Empty cells and NA values are preserved throughout processing
-   - Mean calculations use na.rm = TRUE to exclude missing values
-   - Difference calculations result in NA if either source value is missing
-
-4. QUALITY ASSURANCE:
-   - Each script includes QA checks with row verification
-   - Manual calculation checks confirm proper transformations
-   - Column existence checks prevent errors from missing source data
-   - Scripts 09-16 include comprehensive QA with the first 5 rows displayed
-   - All EHI scripts (11-14, 16) verify calculations against stored values
-   - Pass/Fail status reported for all variables in QA-enabled scripts
-
-5. FILE SAVING:
-   - Most scripts save directly to eohi2.csv
-   - Scripts 04, 06, and 07 have commented-out write commands for review
-   - Scripts 08 and 09 save directly to eohi2.csv
-   - Each script overwrites existing target columns if present
-
-6. SPECIAL NAMING CONVENTIONS:
-   - "pref_tv" vs "pref_TV" inconsistency maintained from source data
-   - DGEN variables use underscores (DGEN_past_5_Pref)
-   - Difference variables use descriptive prefixes (NPast_5_, 5.10past_)
-   - "X" prefix added to variables starting with numbers (X5.10past_mean)
-   - Global means use "_global_" to distinguish them from narrow-scope means
-
-
-================================================================================
-ITEM REFERENCE GUIDE
-================================================================================
-
-15 Core Items (Used across all time periods):
-
-PREFERENCES (5 items):
-  1. pref_read - Reading preferences
-  2. pref_music - Music preferences
-  3. pref_TV/tv - TV watching preferences (note case variation)
-  4. pref_nap - Napping preferences
-  5. pref_travel - Travel preferences
-
-PERSONALITY (5 items):
-  6. pers_extravert - Extraverted personality
-  7. pers_critical - Critical thinking personality
-  8. pers_dependable - Dependable personality
-  9. pers_anxious - Anxious personality
-  10. pers_complex - Complex personality
-
-VALUES (5 items):
-  11. val_obey - Value of obedience
-  12. val_trad - Value of tradition
-  13. val_opinion - Value of expressing opinions
-  14. val_performance - Value of performance
-  15. val_justice - Value of justice
-
-
-================================================================================
-EHI CONCEPT AND INTERPRETATION
-================================================================================
-
-ENDURING HEDONIC IMPACT (EHI):
-  EHI measures the asymmetry between perceived past and future change in
-  psychological attributes. The concept is based on the premise that people
-  may perceive their past and future selves differently, even when
-  considering equivalent time distances.
-
-KEY EHI VARIABLES:
-
-  Domain-Specific EHI (Scripts 11, 13, 14):
-    Calculated from item-level differences between past and future responses
-    Formula: NPast - NFut
-    * Positive values: Greater perceived change from past to present
-    * Negative values: Greater perceived change from present to future
-    * Zero: Symmetric perception of past and future change
-
-  Domain-General EHI (Scripts 12, 14):
-    Calculated from DGEN single-item responses
-    Formula: DGEN_past - DGEN_fut
-    * Measures broader temporal self-perception without item-level detail
-
-HIERARCHICAL STRUCTURE:
-  Level 1: Item-level EHI (45 domain-specific, 6 DGEN)
-  Level 2: Domain means (9 domain-specific, combining 5 items each)
-  Level 3: Global means (5 highest-level summaries)
-
-INTERPRETATION:
-  - EHI > 0: "Past asymmetry" - person perceives greater change from the past
-  - EHI < 0: "Future asymmetry" - person perceives greater change to the
-    future
-  - EHI ≈ 0: "Temporal symmetry" - balanced perception of past/future change
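-
-  For example, a sign-based classification in R (the ±0.5 "symmetry" band is
-  an arbitrary illustrative threshold, not part of the pipeline):
-
-    ehi_class <- cut(df$ehi5_global_mean,
-                     breaks = c(-Inf, -0.5, 0.5, Inf),
-                     labels = c("future asymmetry", "temporal symmetry",
-                                "past asymmetry"))
-    table(ehi_class, useNA = "ifany")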
-
-
-================================================================================
-END OF DOCUMENTATION
-================================================================================
-Last Updated: October 8, 2025
-Processing Pipeline: Scripts 01-16
diff --git a/.history/eohi2/README_Variable_Creation_20251029133355.txt b/.history/eohi2/README_Variable_Creation_20251029133355.txt
deleted file mode 100644
index e5821a2..0000000
--- a/.history/eohi2/README_Variable_Creation_20251029133355.txt
+++ /dev/null
@@ -1,1047 +0,0 @@
-================================================================================
-EOHI2 DATA PROCESSING PIPELINE - VARIABLE CREATION DOCUMENTATION
-================================================================================
-
-This README documents the complete data processing pipeline for eohi2.csv.
-All processing scripts should be run in the order listed below.
-
-Source File: eohi2.csv
-Processing Scripts: dataP 01 through datap 16
-
-================================================================================
-SCRIPT 01: dataP 01 - recode and combine past & future vars.r
-================================================================================
-
-PURPOSE:
-  Combines responses from two survey versions (01 and 02) and recodes Likert
-  scale text responses to numeric values for past and future time periods.
-
-VARIABLES CREATED: 60 total (15 items × 4 time periods)
-
-SOURCE COLUMNS:
-  - Set A: 01past5PrefItem_1 through 01fut10ValItem_5 (60 columns)
-  - Set B: 02past5PrefItem_1 through 02fut10ValItem_5 (60 columns)
-
-TARGET VARIABLES:
-  Past 5 Years (15 variables):
-  - past_5_pref_read, past_5_pref_music, past_5_pref_TV, past_5_pref_nap,
-    past_5_pref_travel
-  - past_5_pers_extravert, past_5_pers_critical, past_5_pers_dependable,
-    past_5_pers_anxious, past_5_pers_complex
-  - past_5_val_obey, past_5_val_trad, past_5_val_opinion,
-    past_5_val_performance, past_5_val_justice
-
-  Past 10 Years (15 variables):
-  - past_10_pref_read, past_10_pref_music, past_10_pref_TV, past_10_pref_nap,
-    past_10_pref_travel
-  - past_10_pers_extravert, past_10_pers_critical, past_10_pers_dependable,
-    past_10_pers_anxious, past_10_pers_complex
-  - past_10_val_obey, past_10_val_trad, past_10_val_opinion,
-    past_10_val_performance, past_10_val_justice
-
-  Future 5 Years (15 variables):
-  - fut_5_pref_read, fut_5_pref_music, fut_5_pref_TV, fut_5_pref_nap,
-    fut_5_pref_travel
-  - fut_5_pers_extravert, fut_5_pers_critical, fut_5_pers_dependable,
-    fut_5_pers_anxious, fut_5_pers_complex
-  - fut_5_val_obey, fut_5_val_trad, fut_5_val_opinion,
-    fut_5_val_performance, fut_5_val_justice
-
-  Future 10 Years (15 variables):
-  - fut_10_pref_read, fut_10_pref_music, fut_10_pref_TV, fut_10_pref_nap,
-    fut_10_pref_travel
-  - fut_10_pers_extravert, fut_10_pers_critical, fut_10_pers_dependable,
-    fut_10_pers_anxious, fut_10_pers_complex
-  - fut_10_val_obey, fut_10_val_trad, fut_10_val_opinion,
-    fut_10_val_performance, fut_10_val_justice
-
-TRANSFORMATION LOGIC:
-  Step 1: Combine responses from Set A (01) and Set B (02)
-  - If Set A has a value, use Set A
-  - If Set A is empty, use Set B
-
-  Step 2: Recode text responses to numeric values:
-    "Strongly Disagree"          → -3
-    "Disagree"                   → -2
-    "Somewhat Disagree"          → -1
-    "Neither Agree nor Disagree" →  0
-    "Somewhat Agree"             →  1
-    "Agree"                      →  2
-    "Strongly Agree"             →  3
-    Empty/Missing                →  NA
-
-ITEM DOMAINS:
-  - Preferences (pref): Reading, Music, TV, Nap, Travel
-  - Personality (pers): Extravert, Critical, Dependable, Anxious, Complex
-  - Values (val): Obey, Tradition, Opinion, Performance, Justice
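-
-  ILLUSTRATIVE R SKETCH of the combine-then-recode step for one item (not
-  the script itself; assumes a data frame `df` read from the raw survey
-  export):
-
-    likert <- c("Strongly Disagree" = -3, "Disagree" = -2,
-                "Somewhat Disagree" = -1, "Neither Agree nor Disagree" = 0,
-                "Somewhat Agree" = 1, "Agree" = 2, "Strongly Agree" = 3)
-    a <- df[["01past5PrefItem_1"]]
-    b <- df[["02past5PrefItem_1"]]
-    # Prefer version 01; fall back to version 02 when 01 is empty/missing
-    combined <- ifelse(!is.na(a) & a != "", a, b)
-    # Named-vector lookup; unmatched or empty responses become NA
-    df$past_5_pref_read <- unname(likert[combined])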
-
-
-================================================================================
-SCRIPT 02: dataP 02 - recode present VARS.r
-================================================================================
-
-PURPOSE:
-  Recodes present-time Likert scale text responses to numeric values.
-
-VARIABLES CREATED: 15 total
-
-SOURCE COLUMNS:
-  - prePrefItem_1 through prePrefItem_5 (5 columns)
-  - prePersItem_1 through prePersItem_5 (5 columns)
-  - preValItem_1 through preValItem_5 (5 columns)
-
-TARGET VARIABLES:
-  Present Time (15 variables):
-  - present_pref_read, present_pref_music, present_pref_tv, present_pref_nap,
-    present_pref_travel
-  - present_pers_extravert, present_pers_critical, present_pers_dependable,
-    present_pers_anxious, present_pers_complex
-  - present_val_obey, present_val_trad, present_val_opinion,
-    present_val_performance, present_val_justice
-
-TRANSFORMATION LOGIC:
-  Recode text responses to numeric values:
-    "Strongly Disagree"          → -3
-    "Disagree"                   → -2
-    "Somewhat Disagree"          → -1
-    "Neither Agree nor Disagree" →  0
-    "Somewhat Agree"             →  1
-    "Agree"                      →  2
-    "Strongly Agree"             →  3
-    Empty/Missing                →  NA
-
-SPECIAL NOTE:
-  Present time uses "present_pref_tv" (lowercase) while past/future use
-  "past_5_pref_TV" (uppercase). This is intentional and preserved from the
-  original data structure.
-
-
-================================================================================
-SCRIPT 03: dataP 03 - recode DGEN vars.r
-================================================================================
-
-PURPOSE:
-  Combines DGEN (domain-general) responses from two survey versions (01 and
-  02). These are single-item measures for each domain/time combination.
-  NO RECODING - just copies numeric values as-is.
-
-VARIABLES CREATED: 12 total (3 domains × 4 time periods)
-
-SOURCE COLUMNS:
-  - Set A: 01past5PrefDGEN_1, 01past5PersDGEN_1, 01past5ValDGEN_1, etc.
-  - Set B: 02past5PrefDGEN_1, 02past5PersDGEN_1, 02past5ValDGEN_1, etc.
-
-TARGET VARIABLES:
-  - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val
-  - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val
-  - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val
-  - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val
-
-TRANSFORMATION LOGIC:
-  - If Set A (01) has a value, use Set A
-  - If Set A is empty, use Set B (02)
-  - NO RECODING: values are copied directly as numeric
-
-SPECIAL NOTES:
-  - Future columns in raw data use an "_8" suffix for Pref/Pers items
-  - Future Val columns use "ValuesDGEN" spelling in Set A, "ValDGEN" in Set B
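-
-  ILLUSTRATIVE R SKETCH for one DGEN column (not the script itself; assumes
-  numeric source columns where empty cells were read as NA):
-
-    # Coalesce: take version 01 when present, otherwise version 02
-    coalesce2 <- function(a, b) ifelse(is.na(a), b, a)
-    df$DGEN_past_5_Pref <- coalesce2(df[["01past5PrefDGEN_1"]],
-                                     df[["02past5PrefDGEN_1"]])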
-
-
-================================================================================
-SCRIPT 04: dataP 04 - DGEN means.r
-================================================================================
-
-PURPOSE:
-  Calculates mean DGEN scores by averaging the three domain scores
-  (Preferences, Personality, Values) for each time period.
-
-VARIABLES CREATED: 4 total (1 per time period)
-
-SOURCE COLUMNS:
-  - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val
-  - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val
-  - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val
-  - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val
-
-TARGET VARIABLES:
-  - DGEN_past_5_mean
-  - DGEN_past_10_mean
-  - DGEN_fut_5_mean
-  - DGEN_fut_10_mean
-
-TRANSFORMATION LOGIC:
-  Each mean = (Pref + Pers + Val) / 3
-  - NA values are excluded from calculation (na.rm = TRUE)
-
-
-================================================================================
-SCRIPT 05: dataP 05 - recode scales VARS.r
-================================================================================
-
-PURPOSE:
-  Processes two cognitive scales:
-  1. AOT (Actively Open-minded Thinking): 8-item scale with reverse coding
-  2. CRT (Cognitive Reflection Test): 3-item test with correct/intuitive
-     scoring
-
-VARIABLES CREATED: 3 total
-
-SOURCE COLUMNS:
-  AOT Scale:
-  - aot_1, aot_2, aot_3, aot_4, aot_5, aot_6, aot_7, aot_8
-
-  CRT Test:
-  - crt_1, crt_2, crt_3
-
-TARGET VARIABLES:
-  - aot_total (mean of 8 items with reverse coding)
-  - crt_correct (proportion of correct answers)
-  - crt_int (proportion of intuitive/incorrect answers)
-
-TRANSFORMATION LOGIC:
-
-  AOT Scale (aot_total):
-  1. Items 4, 5, 6, and 7 are reverse coded by multiplying by -1
-  2. Calculate the mean of all 8 items (with reverse coding applied)
-  3. Original source values are NOT modified in the dataframe
-  4. NA values excluded from calculation (na.rm = TRUE)
-
-  CRT Correct (crt_correct):
-    Correct answers:
-    - crt_1: "5 cents"
-    - crt_2: "5 minutes"
-    - crt_3: "47 days"
-    Calculation: (Number of correct answers) / (Number of non-missing answers)
-
-  CRT Intuitive (crt_int):
-    Intuitive (common incorrect) answers:
-    - crt_1: "10 cents"
-    - crt_2: "100 minutes"
-    - crt_3: "24 days"
-    Calculation: (Number of intuitive answers) / (Number of non-missing
-    answers)
-
-SPECIAL NOTES:
-  - CRT scoring is case-insensitive and trims whitespace
-  - Both CRT scores are proportions (0.00 to 1.00)
-  - Empty/missing CRT responses are excluded from the denominator
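-
-  An illustrative sketch of both scorings (not the script's verbatim code;
-  `df` is an assumed name, and the aot_* columns are assumed to be numeric):
-
-    aot <- df[, paste0("aot_", 1:8)]
-    aot[, c(4, 5, 6, 7)] <- -aot[, c(4, 5, 6, 7)]   # reverse-coded items
-    df$aot_total <- rowMeans(aot, na.rm = TRUE)     # source columns untouched
-
-    crt_raw  <- lapply(df[, paste0("crt_", 1:3)],
-                       function(x) tolower(trimws(x)))
-    correct  <- c("5 cents", "5 minutes", "47 days")
-    intuit   <- c("10 cents", "100 minutes", "24 days")
-    answered <- sapply(crt_raw, function(x) !is.na(x) & x != "")
-    hits     <- mapply(function(x, key) !is.na(x) & x == key, crt_raw, correct)
-    lures    <- mapply(function(x, key) !is.na(x) & x == key, crt_raw, intuit)
-    df$crt_correct <- rowSums(hits)  / rowSums(answered)  # NaN if none answered
-    df$crt_int     <- rowSums(lures) / rowSums(answered)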
-
-
-================================================================================
-SCRIPT 06: dataP 06 - time interval differences.r
-================================================================================
-
-PURPOSE:
-  Calculates absolute differences between time intervals to measure perceived
-  change across time periods for all 15 items.
-
-VARIABLES CREATED: 90 total (6 difference types × 15 items)
-
-SOURCE COLUMNS:
-  - present_pref_read through present_val_justice (15 columns)
-  - past_5_pref_read through past_5_val_justice (15 columns)
-  - past_10_pref_read through past_10_val_justice (15 columns)
-  - fut_5_pref_read through fut_5_val_justice (15 columns)
-  - fut_10_pref_read through fut_10_val_justice (15 columns)
-
-TARGET VARIABLES (by difference type):
-
-  NPast_5 (Present vs Past 5 years) - 15 variables:
-    Formula: |present - past_5|
-    - NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, NPast_5_pref_nap,
-      NPast_5_pref_travel
-    - NPast_5_pers_extravert, NPast_5_pers_critical, NPast_5_pers_dependable,
-      NPast_5_pers_anxious, NPast_5_pers_complex
-    - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion,
-      NPast_5_val_performance, NPast_5_val_justice
-
-  NPast_10 (Present vs Past 10 years) - 15 variables:
-    Formula: |present - past_10|
-    - NPast_10_pref_read, NPast_10_pref_music, NPast_10_pref_TV,
-      NPast_10_pref_nap, NPast_10_pref_travel
-    - NPast_10_pers_extravert, NPast_10_pers_critical, NPast_10_pers_dependable,
-      NPast_10_pers_anxious, NPast_10_pers_complex
-    - NPast_10_val_obey, NPast_10_val_trad, NPast_10_val_opinion,
-      NPast_10_val_performance, NPast_10_val_justice
-
-  NFut_5 (Present vs Future 5 years) - 15 variables:
-    Formula: |present - fut_5|
-    - NFut_5_pref_read, NFut_5_pref_music, NFut_5_pref_TV, NFut_5_pref_nap,
-      NFut_5_pref_travel
-    - NFut_5_pers_extravert, NFut_5_pers_critical, NFut_5_pers_dependable,
-      NFut_5_pers_anxious, NFut_5_pers_complex
-    - NFut_5_val_obey, NFut_5_val_trad, NFut_5_val_opinion,
-      NFut_5_val_performance, NFut_5_val_justice
-
-  NFut_10 (Present vs Future 10 years) - 15 variables:
-    Formula: |present - fut_10|
-    - NFut_10_pref_read, NFut_10_pref_music, NFut_10_pref_TV, NFut_10_pref_nap,
-      NFut_10_pref_travel
-    - NFut_10_pers_extravert, NFut_10_pers_critical, NFut_10_pers_dependable,
-      NFut_10_pers_anxious, NFut_10_pers_complex
-    - NFut_10_val_obey, NFut_10_val_trad, NFut_10_val_opinion,
-      NFut_10_val_performance, NFut_10_val_justice
-
-  5.10past (Past 5 vs Past 10 years) - 15 variables:
-    Formula: |past_5 - past_10|
-    - 5.10past_pref_read, 5.10past_pref_music, 5.10past_pref_TV,
-      5.10past_pref_nap, 5.10past_pref_travel
-    - 5.10past_pers_extravert, 5.10past_pers_critical, 5.10past_pers_dependable,
-      5.10past_pers_anxious, 5.10past_pers_complex
-    - 5.10past_val_obey, 5.10past_val_trad, 5.10past_val_opinion,
-      5.10past_val_performance, 5.10past_val_justice
-
-  5.10fut (Future 5 vs Future 10 years) - 15 variables:
-    Formula: |fut_5 - fut_10|
-    - 5.10fut_pref_read, 5.10fut_pref_music, 5.10fut_pref_TV, 5.10fut_pref_nap,
-      5.10fut_pref_travel
-    - 5.10fut_pers_extravert, 5.10fut_pers_critical, 5.10fut_pers_dependable,
-      5.10fut_pers_anxious, 5.10fut_pers_complex
-    - 5.10fut_val_obey, 5.10fut_val_trad, 5.10fut_val_opinion,
-      5.10fut_val_performance, 5.10fut_val_justice
-
-TRANSFORMATION LOGIC:
-  All calculations use absolute differences:
-  - NPast_5:  |present_[item] - past_5_[item]|
-  - NPast_10: |present_[item] - past_10_[item]|
-  - NFut_5:   |present_[item] - fut_5_[item]|
-  - NFut_10:  |present_[item] - fut_10_[item]|
-  - 5.10past: |past_5_[item] - past_10_[item]|
-  - 5.10fut:  |fut_5_[item] - fut_10_[item]|
-
-  Result: Always non-negative values representing the magnitude of change
-  Missing values in either source column result in NA
-
-SPECIAL NOTES:
-  - Present time uses "pref_tv" (lowercase) while past/future use "pref_TV"
-    (uppercase), so the script handles this naming inconsistency
-  - All values are absolute differences (non-negative)
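-
-  An illustrative sketch of this pattern (not the script's verbatim code;
-  `df` and `items` are assumed names). The `items` vector is reused by later
-  sketches in this README:
-
-    items <- c("pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel",
-               "pers_extravert", "pers_critical", "pers_dependable",
-               "pers_anxious", "pers_complex",
-               "val_obey", "val_trad", "val_opinion", "val_performance",
-               "val_justice")
-    for (item in items) {
-      # present columns use lowercase "tv"; past/future use uppercase "TV"
-      present_item <- if (item == "pref_TV") "pref_tv" else item
-      df[[paste0("NPast_5_", item)]] <-
-        abs(df[[paste0("present_", present_item)]] -
-            df[[paste0("past_5_", item)]])
-      df[[paste0("NFut_5_", item)]] <-
-        abs(df[[paste0("present_", present_item)]] -
-            df[[paste0("fut_5_", item)]])
-      # ... NPast_10, NFut_10, 5.10past, and 5.10fut follow the same pattern
-    }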
-
-
-================================================================================
-SCRIPT 07: dataP 07 - domain means.r
-================================================================================
-
-PURPOSE:
-  Calculates domain-level means by averaging the 5 items within each domain
-  (Preferences, Personality, Values) for each of the 6 time interval
-  difference types.
-
-VARIABLES CREATED: 18 total (6 time intervals × 3 domains)
-
-SOURCE COLUMNS:
-  - NPast_5_pref_read through NPast_5_val_justice (15 columns)
-  - NPast_10_pref_read through NPast_10_val_justice (15 columns)
-  - NFut_5_pref_read through NFut_5_val_justice (15 columns)
-  - NFut_10_pref_read through NFut_10_val_justice (15 columns)
-  - 5.10past_pref_read through 5.10past_val_justice (15 columns)
-  - 5.10fut_pref_read through 5.10fut_val_justice (15 columns)
-  Total: 90 difference columns (created in Script 06)
-
-TARGET VARIABLES:
-  NPast_5 Domain Means (3 variables):
-  - NPast_5_pref_MEAN (mean of 5 preference items)
-  - NPast_5_pers_MEAN (mean of 5 personality items)
-  - NPast_5_val_MEAN (mean of 5 values items)
-
-  NPast_10 Domain Means (3 variables):
-  - NPast_10_pref_MEAN
-  - NPast_10_pers_MEAN
-  - NPast_10_val_MEAN
-
-  NFut_5 Domain Means (3 variables):
-  - NFut_5_pref_MEAN
-  - NFut_5_pers_MEAN
-  - NFut_5_val_MEAN
-
-  NFut_10 Domain Means (3 variables):
-  - NFut_10_pref_MEAN
-  - NFut_10_pers_MEAN
-  - NFut_10_val_MEAN
-
-  5.10past Domain Means (3 variables):
-  - 5.10past_pref_MEAN
-  - 5.10past_pers_MEAN
-  - 5.10past_val_MEAN
-
-  5.10fut Domain Means (3 variables):
-  - 5.10fut_pref_MEAN
-  - 5.10fut_pers_MEAN
-  - 5.10fut_val_MEAN
-
-TRANSFORMATION LOGIC:
-  Each domain mean = average of the 5 items within that domain
-
-  Example for NPast_5_pref_MEAN:
-    = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV,
-           NPast_5_pref_nap, NPast_5_pref_travel)
-
-  Example for NFut_10_pers_MEAN:
-    = mean(NFut_10_pers_extravert, NFut_10_pers_critical,
-           NFut_10_pers_dependable, NFut_10_pers_anxious,
-           NFut_10_pers_complex)
-
-  NA values excluded from calculation (na.rm = TRUE)
-
-PURPOSE OF DOMAIN MEANS:
-  - Provides a higher-level summary of perceived change by domain
-  - Reduces item-level noise by aggregating across related items
-  - Enables domain-level comparisons across time intervals
-  - Parallel to Script 04 (DGEN means) but for difference scores instead of
-    raw DGEN ratings
-
-SPECIAL NOTES:
-  - This script depends on Script 06 being run first
-  - Creates domain-level aggregates of absolute difference scores
-  - All means are averages of non-negative values (absolute differences)
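-
-  A one-domain sketch of this aggregation (illustrative; `df` is an assumed
-  name):
-
-    pref_items <- c("read", "music", "TV", "nap", "travel")
-    df$NPast_5_pref_MEAN <- rowMeans(df[, paste0("NPast_5_pref_", pref_items)],
-                                     na.rm = TRUE)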
-
-
-================================================================================
-SCRIPT 08: dataP 08 - DGEN 510 vars.r
-================================================================================
-
-PURPOSE:
-  Calculates absolute differences between 5-year and 10-year DGEN ratings for
-  both Past and Future time directions. These variables measure the perceived
-  difference in domain-general change between the two time intervals.
-
-VARIABLES CREATED: 6 total (3 domains × 2 time directions)
-
-SOURCE COLUMNS:
-  - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val
-  - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val
-  - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val
-  - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val
-  Total: 12 DGEN columns (created in Script 03)
-
-TARGET VARIABLES:
-  Past Direction (3 variables):
-  - X5_10DGEN_past_pref (|DGEN_past_5_Pref - DGEN_past_10_Pref|)
-  - X5_10DGEN_past_pers (|DGEN_past_5_Pers - DGEN_past_10_Pers|)
-  - X5_10DGEN_past_val (|DGEN_past_5_Val - DGEN_past_10_Val|)
-
-  Future Direction (3 variables):
-  - X5_10DGEN_fut_pref (|DGEN_fut_5_Pref - DGEN_fut_10_Pref|)
-  - X5_10DGEN_fut_pers (|DGEN_fut_5_Pers - DGEN_fut_10_Pers|)
-  - X5_10DGEN_fut_val (|DGEN_fut_5_Val - DGEN_fut_10_Val|)
-
-TRANSFORMATION LOGIC:
-  Formula: |DGEN_5 - DGEN_10|
-
-  All calculations use absolute differences:
-  - Past Preferences: |DGEN_past_5_Pref - DGEN_past_10_Pref|
-  - Past Personality: |DGEN_past_5_Pers - DGEN_past_10_Pers|
-  - Past Values: |DGEN_past_5_Val - DGEN_past_10_Val|
-  - Future Preferences: |DGEN_fut_5_Pref - DGEN_fut_10_Pref|
-  - Future Personality: |DGEN_fut_5_Pers - DGEN_fut_10_Pers|
-  - Future Values: |DGEN_fut_5_Val - DGEN_fut_10_Val|
-
-  Result: Always non-negative values representing the magnitude of difference
-  Missing values in either source column result in NA
-
-SPECIAL NOTES:
-  - Variable names use the "X" prefix because R automatically adds it to
-    column names starting with numbers (5_10 becomes X5_10)
-  - This script depends on Script 03 being run first
-  - Measures interval effects within each time direction (past vs future)
-  - Parallel to Script 06's 5.10past and 5.10fut variables but for DGEN scores
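-
-  The same absolute-difference pattern as Script 06, shown here for one
-  domain (illustrative; `df` is an assumed name):
-
-    df$X5_10DGEN_past_pref <- abs(df$DGEN_past_5_Pref - df$DGEN_past_10_Pref)
-    df$X5_10DGEN_fut_pref  <- abs(df$DGEN_fut_5_Pref  - df$DGEN_fut_10_Pref)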
-
-
-================================================================================
-SCRIPT 09: dataP 09 - interval x direction means.r
-================================================================================
-
-PURPOSE:
-  Calculates comprehensive mean scores by averaging item-level differences
-  across intervals and directions. Creates both narrow-scope means (single
-  time interval) and broad-scope global means (combining multiple intervals).
-
-VARIABLES CREATED: 11 total (6 narrow-scope + 5 global-scope)
-
-SOURCE COLUMNS:
-  All 90 difference variables created in Script 06:
-  - NPast_5_[domain]_[item] (15 variables)
-  - NPast_10_[domain]_[item] (15 variables)
-  - NFut_5_[domain]_[item] (15 variables)
-  - NFut_10_[domain]_[item] (15 variables)
-  - X5.10past_[domain]_[item] (15 variables)
-  - X5.10fut_[domain]_[item] (15 variables)
-
-TARGET VARIABLES:
-
-  Narrow-Scope Means (15 source items each):
-  - NPast_5_mean (mean across all 15 NPast_5 items)
-  - NPast_10_mean (mean across all 15 NPast_10 items)
-  - NFut_5_mean (mean across all 15 NFut_5 items)
-  - NFut_10_mean (mean across all 15 NFut_10 items)
-  - X5.10past_mean (mean across all 15 X5.10past items)
-  - X5.10fut_mean (mean across all 15 X5.10fut items)
-
-  Global-Scope Means (30 source items each):
-  - NPast_global_mean (NPast_5 + NPast_10: all past intervals)
-  - NFut_global_mean (NFut_5 + NFut_10: all future intervals)
-  - X5.10_global_mean (X5.10past + X5.10fut: all 5-vs-10 intervals)
-  - N5_global_mean (NPast_5 + NFut_5: all 5-year intervals)
-  - N10_global_mean (NPast_10 + NFut_10: all 10-year intervals)
-
-TRANSFORMATION LOGIC:
-
-  Narrow-Scope Means (15 items each):
-    Each mean averages all 15 difference items within one time interval
-
-    Example for NPast_5_mean:
-      = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV,
-             NPast_5_pref_nap, NPast_5_pref_travel,
-             NPast_5_pers_extravert, NPast_5_pers_critical,
-             NPast_5_pers_dependable, NPast_5_pers_anxious,
-             NPast_5_pers_complex,
-             NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion,
-             NPast_5_val_performance, NPast_5_val_justice)
-
-  Global-Scope Means (30 items each):
-    Each mean averages 30 difference items across two related intervals
-
-    Example for NPast_global_mean:
-      = mean(all 15 NPast_5 items + all 15 NPast_10 items)
-      Represents overall perceived change from present to any past timepoint
-
-    Example for N5_global_mean:
-      = mean(all 15 NPast_5 items + all 15 NFut_5 items)
-      Represents overall perceived change at the 5-year interval regardless
-      of direction
-
-  NA values excluded from calculation (na.rm = TRUE)
-
-PURPOSE OF INTERVAL × DIRECTION MEANS:
-  - Narrow-scope means: Single-interval summaries across all domains and items
-  - Global-scope means: Cross-interval summaries for testing:
-    * Direction effects (past vs future)
-    * Interval effects (5-year vs 10-year)
-    * Combined temporal distance effects
-  - Enables comprehensive analysis of temporal self-perception patterns
-  - Reduces item-level and domain-level noise through broad aggregation
-
-QUALITY ASSURANCE:
-  - Script includes automated QA checks for the first 5 rows
-  - Manually recalculates each mean and verifies it against stored values
-  - Prints TRUE/FALSE match status for each variable
-  - Ensures calculation accuracy before further analysis
-
-SPECIAL NOTES:
-  - This script depends on Script 06 being run first
-  - All means are averages of absolute difference scores (non-negative)
-  - Global means provide the broadest temporal self-perception summaries
-  - Naming convention uses "global" for 30-item means, no suffix for 15-item
-    means
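-
-  A sketch of one narrow-scope mean, one global-scope mean, and the QA idea
-  (illustrative; reuses the `items` vector from the Script 06 sketch above):
-
-    npast5_cols  <- paste0("NPast_5_",  items)    # 15 item-level differences
-    npast10_cols <- paste0("NPast_10_", items)
-    df$NPast_5_mean      <- rowMeans(df[, npast5_cols], na.rm = TRUE)
-    df$NPast_global_mean <- rowMeans(df[, c(npast5_cols, npast10_cols)],
-                                     na.rm = TRUE)
-    # QA idea: recompute for the first 5 rows and compare to stored values
-    all.equal(df$NPast_5_mean[1:5],
-              unname(rowMeans(df[1:5, npast5_cols], na.rm = TRUE)))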
-
-
-================================================================================
-SCRIPT 10: dataP 10 - DGEN mean vars.r
-================================================================================
-
-PURPOSE:
-  Calculates mean DGEN scores by averaging across different time combinations.
-  Creates means for past, future, interval-based (5-year, 10-year), and
-  domain-based (Preferences, Personality) groupings.
-
-VARIABLES CREATED: 6 total
-
-SOURCE COLUMNS:
-  - DGEN_past_5_mean, DGEN_past_10_mean, DGEN_fut_5_mean, DGEN_fut_10_mean
-    (time-period means from Script 04; used for the direction- and
-    interval-based means)
-  - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val
-  - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val
-  - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val
-  - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val
-    (domain scores from Script 03; used for the domain-based means)
-
-TARGET VARIABLES:
-  Direction-Based Means (2 variables):
-  - DGEN_past_mean (mean of past_5_mean and past_10_mean)
-  - DGEN_fut_mean (mean of fut_5_mean and fut_10_mean)
-
-  Interval-Based Means (2 variables):
-  - DGEN_5_mean (mean of past_5_mean and fut_5_mean)
-  - DGEN_10_mean (mean of past_10_mean and fut_10_mean)
-
-  Domain-Based Means (2 variables; no Values-domain mean is created):
-  - DGEN_pref_mean (mean across all 4 time periods for Preferences)
-  - DGEN_pers_mean (mean across all 4 time periods for Personality)
-
-TRANSFORMATION LOGIC:
-  Direction-based:
-  - DGEN_past_mean = mean(DGEN_past_5_mean, DGEN_past_10_mean)
-  - DGEN_fut_mean = mean(DGEN_fut_5_mean, DGEN_fut_10_mean)
-
-  Interval-based:
-  - DGEN_5_mean = mean(DGEN_past_5_mean, DGEN_fut_5_mean)
-  - DGEN_10_mean = mean(DGEN_past_10_mean, DGEN_fut_10_mean)
-
-  Domain-based:
-  - DGEN_pref_mean = mean across all 4 Pref scores
-  - DGEN_pers_mean = mean across all 4 Pers scores
-
-  NA values excluded from calculation (na.rm = TRUE)
-
-
-================================================================================
-SCRIPT 11: dataP 11 - CORRECT ehi vars.r
-================================================================================
-
-PURPOSE:
-  Creates Enduring Hedonic Impact (EHI) variables by calculating differences
-  between Past and Future responses for each item across different time
-  intervals.
-  Formula: NPast - NFut (positive values indicate greater past-present change)
-
-VARIABLES CREATED: 45 total (15 items × 3 time intervals)
-
-SOURCE COLUMNS:
-  5-year intervals:
-  - NPast_5_pref_read through NPast_5_val_justice (15 columns)
-  - NFut_5_pref_read through NFut_5_val_justice (15 columns)
-
-  10-year intervals:
-  - NPast_10_pref_read through NPast_10_val_justice (15 columns)
-  - NFut_10_pref_read through NFut_10_val_justice (15 columns)
-
-  5-10 year change:
-  - X5.10past_pref_read through X5.10past_val_justice (15 columns)
-  - X5.10fut_pref_read through X5.10fut_val_justice (15 columns)
-
-TARGET VARIABLES:
-  5-Year EHI Variables (15 variables):
-  - ehi5_pref_read, ehi5_pref_music, ehi5_pref_TV, ehi5_pref_nap,
-    ehi5_pref_travel
-  - ehi5_pers_extravert, ehi5_pers_critical, ehi5_pers_dependable,
-    ehi5_pers_anxious, ehi5_pers_complex
-  - ehi5_val_obey, ehi5_val_trad, ehi5_val_opinion, ehi5_val_performance,
-    ehi5_val_justice
-
-  10-Year EHI Variables (15 variables):
-  - ehi10_pref_read, ehi10_pref_music, ehi10_pref_TV, ehi10_pref_nap,
-    ehi10_pref_travel
-  - ehi10_pers_extravert, ehi10_pers_critical, ehi10_pers_dependable,
-    ehi10_pers_anxious, ehi10_pers_complex
-  - ehi10_val_obey, ehi10_val_trad, ehi10_val_opinion, ehi10_val_performance,
-    ehi10_val_justice
-
-  5-10 Year Change EHI Variables (15 variables):
-  - ehi5.10_pref_read, ehi5.10_pref_music, ehi5.10_pref_TV, ehi5.10_pref_nap,
-    ehi5.10_pref_travel
-  - ehi5.10_pers_extravert, ehi5.10_pers_critical, ehi5.10_pers_dependable,
-    ehi5.10_pers_anxious, ehi5.10_pers_complex
-  - ehi5.10_val_obey, ehi5.10_val_trad, ehi5.10_val_opinion,
-    ehi5.10_val_performance, ehi5.10_val_justice
-
-TRANSFORMATION LOGIC:
-  Formula: NPast - NFut
-
-  All calculations use signed differences:
-  - ehi5_[item] = NPast_5_[item] - NFut_5_[item]
-  - ehi10_[item] = NPast_10_[item] - NFut_10_[item]
-  - ehi5.10_[item] = X5.10past_[item] - X5.10fut_[item]
-
-  Result: Positive = greater past change, Negative = greater future change
-  Missing values in either source column result in NA
-
-QUALITY ASSURANCE:
-  - Comprehensive QA checks for all 45 variables across all rows
-  - First 5 rows displayed with detailed calculations showing source values,
-    computed differences, and stored values
-  - Pass/Fail status for each variable reported
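-
-  A sketch of the signed-difference construction (illustrative; reuses the
-  `items` vector from the Script 06 sketch, and assumes the 5-vs-10 columns
-  carry the X prefix R assigns on re-import):
-
-    for (item in items) {
-      df[[paste0("ehi5_", item)]] <-
-        df[[paste0("NPast_5_", item)]] - df[[paste0("NFut_5_", item)]]
-      df[[paste0("ehi10_", item)]] <-
-        df[[paste0("NPast_10_", item)]] - df[[paste0("NFut_10_", item)]]
-      df[[paste0("ehi5.10_", item)]] <-
-        df[[paste0("X5.10past_", item)]] - df[[paste0("X5.10fut_", item)]]
-    }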
-
-
-================================================================================
-SCRIPT 12: dataP 12 - CORRECT DGEN ehi vars.r
-================================================================================
-
-PURPOSE:
-  Creates domain-general EHI variables by calculating differences between Past
-  and Future DGEN responses. These are the domain-general parallel to Script
-  11's domain-specific EHI variables.
-
-VARIABLES CREATED: 6 total (3 domains × 2 time intervals)
-
-SOURCE COLUMNS:
-  - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val
-  - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val
-  - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val
-  - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val
-
-TARGET VARIABLES:
-  5-Year DGEN EHI (3 variables):
-  - ehiDGEN_5_Pref
-  - ehiDGEN_5_Pers
-  - ehiDGEN_5_Val
-
-  10-Year DGEN EHI (3 variables):
-  - ehiDGEN_10_Pref
-  - ehiDGEN_10_Pers
-  - ehiDGEN_10_Val
-
-TRANSFORMATION LOGIC:
-  Formula: DGEN_past - DGEN_fut
-
-  All calculations use signed differences:
-  - ehiDGEN_5_Pref = DGEN_past_5_Pref - DGEN_fut_5_Pref
-  - ehiDGEN_5_Pers = DGEN_past_5_Pers - DGEN_fut_5_Pers
-  - ehiDGEN_5_Val = DGEN_past_5_Val - DGEN_fut_5_Val
-  - ehiDGEN_10_Pref = DGEN_past_10_Pref - DGEN_fut_10_Pref
-  - ehiDGEN_10_Pers = DGEN_past_10_Pers - DGEN_fut_10_Pers
-  - ehiDGEN_10_Val = DGEN_past_10_Val - DGEN_fut_10_Val
-
-  Result: Positive = greater past change, Negative = greater future change
-
-QUALITY ASSURANCE:
-  - QA checks for all 6 variables across all rows
-  - First 5 rows displayed with detailed calculations
-  - Pass/Fail status for each variable reported
-
-
-================================================================================
-SCRIPT 13: datap 13 - ehi domain specific means.r
-================================================================================
-
-PURPOSE:
-  Calculates domain-level mean EHI scores by averaging the 5 items within each
-  domain (Preferences, Personality, Values) for each time interval.
-
-VARIABLES CREATED: 9 total (3 domains × 3 time intervals)
-
-SOURCE COLUMNS:
-  - ehi5_pref_read through ehi5_val_justice (15 columns)
-  - ehi10_pref_read through ehi10_val_justice (15 columns)
-  - ehi5.10_pref_read through ehi5.10_val_justice (15 columns)
-
-TARGET VARIABLES:
-  5-Year Domain Means (3 variables):
-  - ehi5_pref_MEAN (mean of 5 preference items)
-  - ehi5_pers_MEAN (mean of 5 personality items)
-  - ehi5_val_MEAN (mean of 5 values items)
-
-  10-Year Domain Means (3 variables):
-  - ehi10_pref_MEAN
-  - ehi10_pers_MEAN
-  - ehi10_val_MEAN
-
-  5-10 Year Change Domain Means (3 variables):
-  - ehi5.10_pref_MEAN
-  - ehi5.10_pers_MEAN
-  - ehi5.10_val_MEAN
-
-TRANSFORMATION LOGIC:
-  Each domain mean = average of the 5 items within that domain
-
-  Example for ehi5_pref_MEAN:
-    = mean(ehi5_pref_read, ehi5_pref_music, ehi5_pref_TV,
-           ehi5_pref_nap, ehi5_pref_travel)
-
-  NA values excluded from calculation (na.rm = TRUE)
-
-QUALITY ASSURANCE:
-  - Comprehensive QA for all 9 variables across all rows
-  - First 5 rows displayed for multiple domain means
-  - Pass/Fail status for each variable
-
-
-================================================================================
-SCRIPT 14: datap 14 - all ehi global means.r
-================================================================================
-
-PURPOSE:
-  Calculates global EHI means by averaging domain-level means. Creates the
-  highest-level summary scores for EHI across both domain-general and
-  domain-specific measures.
-
-VARIABLES CREATED: 5 total
-
-SOURCE COLUMNS:
-  - ehiDGEN_5_Pref, ehiDGEN_5_Pers, ehiDGEN_5_Val
-  - ehiDGEN_10_Pref, ehiDGEN_10_Pers, ehiDGEN_10_Val
-  - ehi5_pref_MEAN, ehi5_pers_MEAN, ehi5_val_MEAN
-  - ehi10_pref_MEAN, ehi10_pers_MEAN, ehi10_val_MEAN
-  - ehi5.10_pref_MEAN, ehi5.10_pers_MEAN, ehi5.10_val_MEAN
-
-TARGET VARIABLES:
-  DGEN Global Means (2 variables):
-  - ehiDGEN_5_mean (mean of 3 DGEN domains for 5-year)
-  - ehiDGEN_10_mean (mean of 3 DGEN domains for 10-year)
-
-  Domain-Specific Global Means (3 variables):
-  - ehi5_global_mean (mean of 3 domain means for 5-year)
-  - ehi10_global_mean (mean of 3 domain means for 10-year)
-  - ehi5.10_global_mean (mean of 3 domain means for 5-10 change)
-
-TRANSFORMATION LOGIC:
-  Each global mean = average of 3 domain-level scores
-
-  Example for ehiDGEN_5_mean:
-    = mean(ehiDGEN_5_Pref, ehiDGEN_5_Pers, ehiDGEN_5_Val)
-
-  Example for ehi5_global_mean:
-    = mean(ehi5_pref_MEAN, ehi5_pers_MEAN, ehi5_val_MEAN)
-
-  NA values excluded from calculation (na.rm = TRUE)
-
-QUALITY ASSURANCE:
-  - QA for all 5 global means across all rows
-  - First 5 rows displayed with detailed calculations
-  - Values shown with 5-decimal precision
-  - Pass/Fail status for each variable
-
-
-================================================================================
-SCRIPT 15: datap 15 - education recoded ordinal 3.r
-================================================================================
-
-PURPOSE:
-  Recodes raw education categories (`demo_edu`) into an ordered 3-level factor
-  for analyses requiring an ordinal education variable.
-
-VARIABLES CREATED: 1 total
-
-SOURCE COLUMNS:
-  - demo_edu
-
-TARGET VARIABLES:
-  - edu3 (ordered factor with 3 levels)
-
-TRANSFORMATION LOGIC:
-  Map `demo_edu` to 3 ordered levels and store as an ordered factor:
-  - "HS_TS": High School (or equivalent), Trade School (non-military)
-  - "C_Ug": College Diploma/Certificate, University - Undergraduate
-  - "grad_prof": University - Graduate (Masters), University - PhD,
-    Professional Degree (ex. JD/MD)
-
-  Levels and order:
-    edu3 = factor(edu3, levels = c("HS_TS", "C_Ug", "grad_prof"),
-                  ordered = TRUE)
-
-QUALITY ASSURANCE:
-  - Prints frequency table for `edu3` and a cross-tab of `demo_edu` × `edu3`
-    to verify correct mapping and absence of unintended NAs.
-  - Saves updated dataset to `eohi2.csv`.
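-
-  A sketch of this mapping and its QA checks (illustrative; the exact
-  `demo_edu` response strings are assumptions taken from the level
-  descriptions above, not verified against the raw data):
-
-    hs_ts     <- c("High School (or equivalent)", "Trade School (non-military)")
-    c_ug      <- c("College Diploma/Certificate", "University - Undergraduate")
-    grad_prof <- c("University - Graduate (Masters)", "University - PhD",
-                   "Professional Degree (ex. JD/MD)")
-    df$edu3 <- ifelse(df$demo_edu %in% hs_ts, "HS_TS",
-               ifelse(df$demo_edu %in% c_ug, "C_Ug",
-               ifelse(df$demo_edu %in% grad_prof, "grad_prof", NA)))
-    df$edu3 <- factor(df$edu3, levels = c("HS_TS", "C_Ug", "grad_prof"),
-                      ordered = TRUE)
-    table(df$edu3, useNA = "ifany")               # frequency check
-    table(df$demo_edu, df$edu3, useNA = "ifany")  # cross-tab QA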
-
-
-================================================================================
-SCRIPT 16: datap 16 - ehi vars standardized .r
-================================================================================
-
-PURPOSE:
-  Standardizes key EHI summary variables (z-scores) and creates a composite
-  standardized EHI mean (`stdEHI_mean`) for use in correlational and
-  regression analyses.
-
-VARIABLES CREATED: 5 total
-
-SOURCE COLUMNS:
-  - ehiDGEN_5_mean, ehiDGEN_10_mean
-  - ehi5_global_mean, ehi10_global_mean
-
-TARGET VARIABLES:
-  - stdDGEN_5 = z(ehiDGEN_5_mean)
-  - stdDGEN_10 = z(ehiDGEN_10_mean)
-  - stdDS_5 = z(ehi5_global_mean)
-  - stdDS_10 = z(ehi10_global_mean)
-  - stdEHI_mean = mean(stdDGEN_5, stdDGEN_10, stdDS_5, stdDS_10), row-wise
-
-TRANSFORMATION LOGIC:
-  Standardize each source variable using the sample mean and SD
-  (na.rm = TRUE):
-    stdX = (X - mean(X)) / sd(X)
-
-  Then compute the row-wise average across the four standardized variables:
-    stdEHI_mean = rowMeans(cbind(stdDGEN_5, stdDGEN_10, stdDS_5, stdDS_10),
-                           na.rm = TRUE)
-
-CHECKS/QA:
-  - Prints pre-standardization means/SDs and post-standardization means/SDs
-    to confirm ~0 mean and ~1 SD for each standardized variable (allowing
-    for NAs).
-  - Spot-checks random rows by recomputing standardized values and comparing
-    them to the stored columns.
-  - Saves updated dataset to `eohi2.csv`.
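-
-  A compact sketch of this standardization (illustrative; `df` and `z` are
-  assumed names):
-
-    z <- function(x) (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
-    df$stdDGEN_5  <- z(df$ehiDGEN_5_mean)
-    df$stdDGEN_10 <- z(df$ehiDGEN_10_mean)
-    df$stdDS_5    <- z(df$ehi5_global_mean)
-    df$stdDS_10   <- z(df$ehi10_global_mean)
-    df$stdEHI_mean <- rowMeans(cbind(df$stdDGEN_5, df$stdDGEN_10,
-                                     df$stdDS_5, df$stdDS_10), na.rm = TRUE)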
-
-
-================================================================================
-SUMMARY OF ALL CREATED VARIABLES
-================================================================================
-
-Total Variables Created: 296
-
-By Script:
-  - Script 01: 60 variables (past/future recoded items)
-  - Script 02: 15 variables (present recoded items)
-  - Script 03: 12 variables (DGEN domain scores)
-  - Script 04: 4 variables (DGEN time period means)
-  - Script 05: 3 variables (AOT & CRT scales)
-  - Script 06: 90 variables (time interval differences)
-  - Script 07: 18 variables (domain means for differences)
-  - Script 08: 6 variables (DGEN 5-vs-10 differences)
-  - Script 09: 11 variables (interval × direction means)
-  - Script 10: 6 variables (DGEN combined means)
-  - Script 11: 45 variables (domain-specific EHI scores)
-  - Script 12: 6 variables (DGEN EHI scores)
-  - Script 13: 9 variables (EHI domain means)
-  - Script 14: 5 variables (EHI global means)
-  - Script 15: 1 variable (education ordinal factor)
-  - Script 16: 5 variables (standardized EHI summaries and composite)
-
-By Category:
-  - Time Period Items (75 total):
-    * Present: 15 items
-    * Past 5: 15 items
-    * Past 10: 15 items
-    * Future 5: 15 items
-    * Future 10: 15 items
-
-  - DGEN Variables (28 total):
-    * Domain scores: 12 (3 domains × 4 time periods)
-    * Time period means: 4 (1 per time period)
-    * 5-vs-10 differences: 6 (3 domains × 2 directions)
-    * Combined means: 6 (past, future, interval-based, domain-based)
-
-  - Cognitive Scales (3 total):
-    * AOT total
-    * CRT correct
-    * CRT intuitive
-
-  - Time Differences (90 total):
-    * NPast_5: 15 differences
-    * NPast_10: 15 differences
-    * NFut_5: 15 differences
-    * NFut_10: 15 differences
-    * 5.10past: 15 differences
-    * 5.10fut: 15 differences
-
-  - Domain Means for Differences (18 total):
-    * NPast_5: 3 domain means
-    * NPast_10: 3 domain means
-    * NFut_5: 3 domain means
-    * NFut_10: 3 domain means
-    * 5.10past: 3 domain means
-    * 5.10fut: 3 domain means
-
-  - Interval × Direction Means (11 total):
-    * Narrow-scope means: 6 (NPast_5, NPast_10, NFut_5, NFut_10,
-      X5.10past, X5.10fut)
-    * Global-scope means: 5 (NPast_global, NFut_global, X5.10_global,
-      N5_global, N10_global)
-
-  - EHI Variables (65 total):
-    * Domain-specific EHI: 45 (15 items × 3 time intervals)
-    * DGEN EHI: 6 (3 domains × 2 time intervals)
-    * Domain means: 9 (3 domains × 3 time intervals)
-    * Global means: 5 (2 DGEN + 3 domain-specific)
-
-  - Standardized EHI Variables (5 total):
-    * stdDGEN_5, stdDGEN_10, stdDS_5, stdDS_10, stdEHI_mean
-
-  - Demographics (1 total):
-    * edu3 (ordered 3-level education factor)
-
-
-================================================================================
-DATA PROCESSING NOTES
-================================================================================
-
-1. PROCESSING ORDER:
-   Scripts MUST be run in numerical order (01 → 16), as later scripts depend
-   on variables created by earlier scripts.
-
-   Key Dependencies:
-   - Script 03 required before Scripts 04, 08, 10, and 12 (DGEN scores)
-   - Script 04 required before Script 10 (DGEN time period means)
-   - Script 06 required before Scripts 07, 09, and 11 (time interval
-     differences)
-   - Script 11 required before Script 13 (domain-specific EHI items)
-   - Script 12 required before Script 14 (DGEN EHI scores)
-   - Script 13 required before Script 14 (EHI domain means)
-   - Script 14 required before Script 16 (uses ehiDGEN_5/10_mean and
-     ehi5/10_global_mean)
-   - Script 15 can run any time after the raw `demo_edu` column is present;
-     run it before analyses that need `edu3`
-
-2. SURVEY VERSION HANDLING:
-   - Two survey versions (01 and 02) were used
-   - Scripts 01 and 03 combine these versions
-   - Preference is given to version 01 when both exist
-
-3. MISSING DATA:
-   - Empty cells and NA values are preserved throughout processing
-   - Calculations use na.rm = TRUE to exclude missing values from means
-   - Difference calculations result in NA if either source value is missing
-
-4. QUALITY ASSURANCE:
-   - Each script includes QA checks with row verification
-   - Manual calculation checks confirm proper transformations
-   - Column existence checks prevent errors from missing source data
-   - Scripts 09-16 include comprehensive QA with the first 5 rows displayed
-   - All EHI scripts (11-14, 16) verify calculations against stored values
-   - Pass/Fail status is reported for all variables in QA-enabled scripts
-
-5. FILE SAVING:
-   - Most scripts save directly to eohi2.csv
-   - Scripts 04, 06, and 07 have commented-out write commands for review
-   - Scripts 08 and 09 save directly to eohi2.csv
-   - Each script overwrites existing target columns if present
-
-6. SPECIAL NAMING CONVENTIONS:
-   - The "pref_tv" vs "pref_TV" inconsistency is maintained from the source
-     data
-   - DGEN variables use underscores (DGEN_past_5_Pref)
-   - Difference variables use descriptive prefixes (NPast_5_, 5.10past_)
-   - An "X" prefix is added to variables starting with numbers
-     (X5.10past_mean)
-   - Global means use "_global_" to distinguish them from narrow-scope means
-
-
-================================================================================
-ITEM REFERENCE GUIDE
-================================================================================
-
-15 Core Items (Used across all time periods):
-
-PREFERENCES (5 items):
-  1. pref_read - Reading preferences
-  2. pref_music - Music preferences
-  3. pref_TV/tv - TV watching preferences (note case variation)
-  4. pref_nap - Napping preferences
-  5. pref_travel - Travel preferences
-
-PERSONALITY (5 items):
-  6. pers_extravert - Extraverted personality
-  7. pers_critical - Critical thinking personality
-  8. pers_dependable - Dependable personality
-  9. pers_anxious - Anxious personality
-  10. pers_complex - Complex personality
-
-VALUES (5 items):
-  11. val_obey - Value of obedience
-  12. val_trad - Value of tradition
-  13. val_opinion - Value of expressing opinions
-  14. val_performance - Value of performance
-  15. val_justice - Value of justice
-
-
-================================================================================
-EHI CONCEPT AND INTERPRETATION
-================================================================================
-
-ENDURING HEDONIC IMPACT (EHI):
-  EHI measures the asymmetry between perceived past and future change in
-  psychological attributes. The concept is based on the premise that people
-  may perceive their past and future selves differently, even when
-  considering equivalent time distances.
-
-KEY EHI VARIABLES:
-
-  Domain-Specific EHI (Scripts 11, 13, 14):
-    Calculated from item-level differences between past and future responses
-    Formula: NPast - NFut
-    * Positive values: Greater perceived change from past to present
-    * Negative values: Greater perceived change from present to future
-    * Zero: Symmetric perception of past and future change
-
-  Domain-General EHI (Scripts 12, 14):
-    Calculated from DGEN single-item responses
-    Formula: DGEN_past - DGEN_fut
-    * Measures broader temporal self-perception without item-level detail
-
-HIERARCHICAL STRUCTURE:
-  Level 1: Item-level EHI (45 domain-specific, 6 DGEN)
-  Level 2: Domain means (9 domain-specific, combining 5 items each)
-  Level 3: Global means (5 highest-level summaries)
-
-INTERPRETATION:
-  - EHI > 0: "Past asymmetry" - the person perceives greater change from
-    the past
-  - EHI < 0: "Future asymmetry" - the person perceives greater change toward
-    the future
-  - EHI ≈ 0: "Temporal symmetry" - balanced perception of past/future change
-
-
-================================================================================
-END OF DOCUMENTATION
-================================================================================
-Last Updated: October 29, 2025
-Processing Pipeline: Scripts 01-16
10 years) - 15 variables: - Formula: |fut_5 - fut_10| - - 5.10fut_pref_read, 5.10fut_pref_music, 5.10fut_pref_TV, 5.10fut_pref_nap, - 5.10fut_pref_travel - - 5.10fut_pers_extravert, 5.10fut_pers_critical, 5.10fut_pers_dependable, - 5.10fut_pers_anxious, 5.10fut_pers_complex - - 5.10fut_val_obey, 5.10fut_val_trad, 5.10fut_val_opinion, - 5.10fut_val_performance, 5.10fut_val_justice - -TRANSFORMATION LOGIC: - All calculations use absolute differences: - - NPast_5: |present_[item] - past_5_[item]| - - NPast_10: |present_[item] - past_10_[item]| - - NFut_5: |present_[item] - fut_5_[item]| - - NFut_10: |present_[item] - fut_10_[item]| - - 5.10past: |past_5_[item] - past_10_[item]| - - 5.10fut: |fut_5_[item] - fut_10_[item]| - - Result: Always positive values representing magnitude of change - Missing values in either source column result in NA - -SPECIAL NOTES: - - Present time uses "pref_tv" (lowercase) while past/future use "pref_TV" - (uppercase), so script handles this naming inconsistency - - All values are absolute differences (non-negative) - - -================================================================================ -SCRIPT 07: dataP 07 - domain means.r -================================================================================ - -PURPOSE: - Calculates domain-level means by averaging the 5 items within each domain - (Preferences, Personality, Values) for each of the 6 time interval difference - types. - -VARIABLES CREATED: 18 total (6 time intervals × 3 domains) - -SOURCE COLUMNS: - - NPast_5_pref_read through NPast_5_val_justice (15 columns) - - NPast_10_pref_read through NPast_10_val_justice (15 columns) - - NFut_5_pref_read through NFut_5_val_justice (15 columns) - - NFut_10_pref_read through NFut_10_val_justice (15 columns) - - 5.10past_pref_read through 5.10past_val_justice (15 columns) - - 5.10fut_pref_read through 5.10fut_val_justice (15 columns) - Total: 90 difference columns (created in Script 06) - -TARGET VARIABLES: - NPast_5 Domain Means (3 variables): - - NPast_5_pref_MEAN (mean of 5 preference items) - - NPast_5_pers_MEAN (mean of 5 personality items) - - NPast_5_val_MEAN (mean of 5 values items) - - NPast_10 Domain Means (3 variables): - - NPast_10_pref_MEAN - - NPast_10_pers_MEAN - - NPast_10_val_MEAN - - NFut_5 Domain Means (3 variables): - - NFut_5_pref_MEAN - - NFut_5_pers_MEAN - - NFut_5_val_MEAN - - NFut_10 Domain Means (3 variables): - - NFut_10_pref_MEAN - - NFut_10_pers_MEAN - - NFut_10_val_MEAN - - 5.10past Domain Means (3 variables): - - 5.10past_pref_MEAN - - 5.10past_pers_MEAN - - 5.10past_val_MEAN - - 5.10fut Domain Means (3 variables): - - 5.10fut_pref_MEAN - - 5.10fut_pers_MEAN - - 5.10fut_val_MEAN - -TRANSFORMATION LOGIC: - Each domain mean = average of 5 items within that domain - - Example for NPast_5_pref_MEAN: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel) - - Example for NFut_10_pers_MEAN: - = mean(NFut_10_pers_extravert, NFut_10_pers_critical, - NFut_10_pers_dependable, NFut_10_pers_anxious, - NFut_10_pers_complex) - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF DOMAIN MEANS: - - Provides higher-level summary of perceived change by domain - - Reduces item-level noise by aggregating across related items - - Enables domain-level comparisons across time intervals - - Parallel to Script 04 (DGEN means) but for difference scores instead of - raw DGEN ratings - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - Creates domain-level aggregates 
of absolute difference scores - - All means are averages of non-negative values (absolute differences) - - -================================================================================ -SCRIPT 08: dataP 08 - DGEN 510 vars.r -================================================================================ - -PURPOSE: - Calculates absolute differences between 5-year and 10-year DGEN ratings for - both Past and Future time directions. These variables measure the perceived - difference in domain-general change between the two time intervals. - -VARIABLES CREATED: 6 total (3 domains × 2 time directions) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - Total: 12 DGEN columns (created in Script 03) - -TARGET VARIABLES: - Past Direction (3 variables): - - X5_10DGEN_past_pref (|DGEN_past_5_Pref - DGEN_past_10_Pref|) - - X5_10DGEN_past_pers (|DGEN_past_5_Pers - DGEN_past_10_Pers|) - - X5_10DGEN_past_val (|DGEN_past_5_Val - DGEN_past_10_Val|) - - Future Direction (3 variables): - - X5_10DGEN_fut_pref (|DGEN_fut_5_Pref - DGEN_fut_10_Pref|) - - X5_10DGEN_fut_pers (|DGEN_fut_5_Pers - DGEN_fut_10_Pers|) - - X5_10DGEN_fut_val (|DGEN_fut_5_Val - DGEN_fut_10_Val|) - -TRANSFORMATION LOGIC: - Formula: |DGEN_5 - DGEN_10| - - All calculations use absolute differences: - - Past Preferences: |DGEN_past_5_Pref - DGEN_past_10_Pref| - - Past Personality: |DGEN_past_5_Pers - DGEN_past_10_Pers| - - Past Values: |DGEN_past_5_Val - DGEN_past_10_Val| - - Future Preferences: |DGEN_fut_5_Pref - DGEN_fut_10_Pref| - - Future Personality: |DGEN_fut_5_Pers - DGEN_fut_10_Pers| - - Future Values: |DGEN_fut_5_Val - DGEN_fut_10_Val| - - Result: Always positive values representing magnitude of difference - Missing values in either source column result in NA - -SPECIAL NOTES: - - Variable names use "X" prefix because R automatically adds it to column - names starting with numbers (5_10 becomes X5_10) - - This script depends on Script 03 being run first - - Measures interval effects within time direction (past vs future) - - Parallel to Script 06's 5.10past and 5.10fut variables but for DGEN scores - - -================================================================================ -SCRIPT 09: dataP 09 - interval x direction means.r -================================================================================ - -PURPOSE: - Calculates comprehensive mean scores by averaging item-level differences - across intervals and directions. Creates both narrow-scope means (single - time interval) and broad-scope global means (combining multiple intervals). 
- -VARIABLES CREATED: 11 total (6 narrow-scope + 5 global-scope) - -SOURCE COLUMNS: - All 90 difference variables created in Script 06: - - NPast_5_[domain]_[item] (15 variables) - - NPast_10_[domain]_[item] (15 variables) - - NFut_5_[domain]_[item] (15 variables) - - NFut_10_[domain]_[item] (15 variables) - - X5.10past_[domain]_[item] (15 variables) - - X5.10fut_[domain]_[item] (15 variables) - -TARGET VARIABLES: - - Narrow-Scope Means (15 source items each): - - NPast_5_mean (mean across all 15 NPast_5 items) - - NPast_10_mean (mean across all 15 NPast_10 items) - - NFut_5_mean (mean across all 15 NFut_5 items) - - NFut_10_mean (mean across all 15 NFut_10 items) - - X5.10past_mean (mean across all 15 X5.10past items) - - X5.10fut_mean (mean across all 15 X5.10fut items) - - Global-Scope Means (30 source items each): - - NPast_global_mean (NPast_5 + NPast_10: all past intervals) - - NFut_global_mean (NFut_5 + NFut_10: all future intervals) - - X5.10_global_mean (X5.10past + X5.10fut: all 5-vs-10 intervals) - - N5_global_mean (NPast_5 + NFut_5: all 5-year intervals) - - N10_global_mean (NPast_10 + NFut_10: all 10-year intervals) - -TRANSFORMATION LOGIC: - - Narrow-Scope Means (15 items each): - Each mean averages all 15 difference items within one time interval - - Example for NPast_5_mean: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel, - NPast_5_pers_extravert, NPast_5_pers_critical, - NPast_5_pers_dependable, NPast_5_pers_anxious, - NPast_5_pers_complex, - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice) - - Global-Scope Means (30 items each): - Each mean averages 30 difference items across two related intervals - - Example for NPast_global_mean: - = mean(all 15 NPast_5 items + all 15 NPast_10 items) - Represents overall perceived change from present to any past timepoint - - Example for N5_global_mean: - = mean(all 15 NPast_5 items + all 15 NFut_5 items) - Represents overall perceived change at 5-year interval regardless of - direction - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF INTERVAL × DIRECTION MEANS: - - Narrow-scope means: Single-interval summaries across all domains and items - - Global-scope means: Cross-interval summaries for testing: - * Direction effects (past vs future) - * Interval effects (5-year vs 10-year) - * Combined temporal distance effects - - Enables comprehensive analysis of temporal self-perception patterns - - Reduces item-level and domain-level noise through broad aggregation - -QUALITY ASSURANCE: - - Script includes automated QA checks for first 5 rows - - Manually recalculates each mean and verifies against stored values - - Prints TRUE/FALSE match status for each variable - - Ensures calculation accuracy before further analysis - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - All means are averages of absolute difference scores (non-negative) - - Global means provide the broadest temporal self-perception summaries - - Naming convention uses "global" for 30-item means, no suffix for 15-item - - -================================================================================ -SCRIPT 10: dataP 10 - DGEN mean vars.r -================================================================================ - -PURPOSE: - Calculates mean DGEN scores by averaging across different time combinations. - Creates means for Past, Future, and interval-based (5-year, 10-year) groupings. 
- -VARIABLES CREATED: 6 total - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - Direction-Based Means (2 variables): - - DGEN_past_mean (mean of past_5_mean and past_10_mean) - - DGEN_fut_mean (mean of fut_5_mean and fut_10_mean) - - Interval-Based Means (2 variables): - - DGEN_5_mean (mean of past_5_mean and fut_5_mean) - - DGEN_10_mean (mean of past_10_mean and fut_10_mean) - - Domain-Based Means (2 variables): - - DGEN_pref_mean (mean across all 4 time periods for Preferences) - - DGEN_pers_mean (mean across all 4 time periods for Personality) - -TRANSFORMATION LOGIC: - Direction-based: - - DGEN_past_mean = mean(DGEN_past_5_mean, DGEN_past_10_mean) - - DGEN_fut_mean = mean(DGEN_fut_5_mean, DGEN_fut_10_mean) - - Interval-based: - - DGEN_5_mean = mean(DGEN_past_5_mean, DGEN_fut_5_mean) - - DGEN_10_mean = mean(DGEN_past_10_mean, DGEN_fut_10_mean) - - Domain-based: - - DGEN_pref_mean = mean across all 4 Pref scores - - DGEN_pers_mean = mean across all 4 Pers scores - - NA values excluded from calculation (na.rm = TRUE) - - -================================================================================ -SCRIPT 11: dataP 11 - CORRECT ehi vars.r -================================================================================ - -PURPOSE: - Creates Enduring Hedonic Impact (EHI) variables by calculating differences - between Past and Future responses for each item across different time intervals. - Formula: NPast - NFut (positive values indicate greater past-present change) - -VARIABLES CREATED: 45 total (15 items × 3 time intervals) - -SOURCE COLUMNS: - 5-year intervals: - - NPast_5_pref_read through NPast_5_val_justice (15 columns) - - NFut_5_pref_read through NFut_5_val_justice (15 columns) - - 10-year intervals: - - NPast_10_pref_read through NPast_10_val_justice (15 columns) - - NFut_10_pref_read through NFut_10_val_justice (15 columns) - - 5-10 year change: - - X5.10past_pref_read through X5.10past_val_justice (15 columns) - - X5.10fut_pref_read through X5.10fut_val_justice (15 columns) - -TARGET VARIABLES: - 5-Year EHI Variables (15 variables): - - ehi5_pref_read, ehi5_pref_music, ehi5_pref_TV, ehi5_pref_nap, - ehi5_pref_travel - - ehi5_pers_extravert, ehi5_pers_critical, ehi5_pers_dependable, - ehi5_pers_anxious, ehi5_pers_complex - - ehi5_val_obey, ehi5_val_trad, ehi5_val_opinion, ehi5_val_performance, - ehi5_val_justice - - 10-Year EHI Variables (15 variables): - - ehi10_pref_read, ehi10_pref_music, ehi10_pref_TV, ehi10_pref_nap, - ehi10_pref_travel - - ehi10_pers_extravert, ehi10_pers_critical, ehi10_pers_dependable, - ehi10_pers_anxious, ehi10_pers_complex - - ehi10_val_obey, ehi10_val_trad, ehi10_val_opinion, ehi10_val_performance, - ehi10_val_justice - - 5-10 Year Change EHI Variables (15 variables): - - ehi5.10_pref_read, ehi5.10_pref_music, ehi5.10_pref_TV, ehi5.10_pref_nap, - ehi5.10_pref_travel - - ehi5.10_pers_extravert, ehi5.10_pers_critical, ehi5.10_pers_dependable, - ehi5.10_pers_anxious, ehi5.10_pers_complex - - ehi5.10_val_obey, ehi5.10_val_trad, ehi5.10_val_opinion, - ehi5.10_val_performance, ehi5.10_val_justice - -TRANSFORMATION LOGIC: - Formula: NPast - NFut - - All calculations use signed differences: - - ehi5_[item] = NPast_5_[item] - NFut_5_[item] - - ehi10_[item] = NPast_10_[item] - NFut_10_[item] - - ehi5.10_[item] = X5.10past_[item] - 
X5.10fut_[item] - - Result: Positive = greater past change, Negative = greater future change - Missing values in either source column result in NA - -QUALITY ASSURANCE: - - Comprehensive QA checks for all 45 variables across all rows - - First 5 rows displayed with detailed calculations showing source values, - computed differences, and stored values - - Pass/Fail status for each variable reported - - -================================================================================ -SCRIPT 12: dataP 12 - CORRECT DGEN ehi vars.r -================================================================================ - -PURPOSE: - Creates domain-general EHI variables by calculating differences between Past - and Future DGEN responses. These are the domain-general parallel to Script 11's - domain-specific EHI variables. - -VARIABLES CREATED: 6 total (3 domains × 2 time intervals) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - 5-Year DGEN EHI (3 variables): - - ehiDGEN_5_Pref - - ehiDGEN_5_Pers - - ehiDGEN_5_Val - - 10-Year DGEN EHI (3 variables): - - ehiDGEN_10_Pref - - ehiDGEN_10_Pers - - ehiDGEN_10_Val - -TRANSFORMATION LOGIC: - Formula: DGEN_past - DGEN_fut - - All calculations use signed differences: - - ehiDGEN_5_Pref = DGEN_past_5_Pref - DGEN_fut_5_Pref - - ehiDGEN_5_Pers = DGEN_past_5_Pers - DGEN_fut_5_Pers - - ehiDGEN_5_Val = DGEN_past_5_Val - DGEN_fut_5_Val - - ehiDGEN_10_Pref = DGEN_past_10_Pref - DGEN_fut_10_Pref - - ehiDGEN_10_Pers = DGEN_past_10_Pers - DGEN_fut_10_Pers - - ehiDGEN_10_Val = DGEN_past_10_Val - DGEN_fut_10_Val - - Result: Positive = greater past change, Negative = greater future change - -QUALITY ASSURANCE: - - QA checks for all 6 variables across all rows - - First 5 rows displayed with detailed calculations - - Pass/Fail status for each variable reported - - -================================================================================ -SCRIPT 13: datap 13 - ehi domain specific means.r -================================================================================ - -PURPOSE: - Calculates domain-level mean EHI scores by averaging the 5 items within each - domain (Preferences, Personality, Values) for each time interval. 
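-
-ILLUSTRATIVE SKETCH (R):
-  A minimal sketch of the domain-mean computation detailed below (not the
-  script's verbatim code; `df` is a hypothetical data frame holding
-  eohi2.csv, with the Script 11 EHI item columns already present):
-
-    pref5 <- c("ehi5_pref_read", "ehi5_pref_music", "ehi5_pref_TV",
-               "ehi5_pref_nap", "ehi5_pref_travel")
-    # Mean of the 5 preference items at the 5-year interval
-    df$ehi5_pref_MEAN <- rowMeans(df[, pref5], na.rm = TRUE)
-    # The same pattern repeats for the pers/val item sets and for the
-    # ehi10_ and ehi5.10_ intervals (9 domain means in all)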
- -VARIABLES CREATED: 9 total (3 domains × 3 time intervals) - -SOURCE COLUMNS: - - ehi5_pref_read through ehi5_val_justice (15 columns) - - ehi10_pref_read through ehi10_val_justice (15 columns) - - ehi5.10_pref_read through ehi5.10_val_justice (15 columns) - -TARGET VARIABLES: - 5-Year Domain Means (3 variables): - - ehi5_pref_MEAN (mean of 5 preference items) - - ehi5_pers_MEAN (mean of 5 personality items) - - ehi5_val_MEAN (mean of 5 values items) - - 10-Year Domain Means (3 variables): - - ehi10_pref_MEAN - - ehi10_pers_MEAN - - ehi10_val_MEAN - - 5-10 Year Change Domain Means (3 variables): - - ehi5.10_pref_MEAN - - ehi5.10_pers_MEAN - - ehi5.10_val_MEAN - -TRANSFORMATION LOGIC: - Each domain mean = average of 5 items within that domain - - Example for ehi5_pref_MEAN: - = mean(ehi5_pref_read, ehi5_pref_music, ehi5_pref_TV, - ehi5_pref_nap, ehi5_pref_travel) - - NA values excluded from calculation (na.rm = TRUE) - -QUALITY ASSURANCE: - - Comprehensive QA for all 9 variables across all rows - - First 5 rows displayed for multiple domain means - - Pass/Fail status for each variable - - -================================================================================ -SCRIPT 14: datap 14 - all ehi global means.r -================================================================================ - -PURPOSE: - Calculates global EHI means by averaging domain-level means. Creates the - highest-level summary scores for EHI across both domain-general and - domain-specific measures. - -VARIABLES CREATED: 5 total - -SOURCE COLUMNS: - - ehiDGEN_5_Pref, ehiDGEN_5_Pers, ehiDGEN_5_Val - - ehiDGEN_10_Pref, ehiDGEN_10_Pers, ehiDGEN_10_Val - - ehi5_pref_MEAN, ehi5_pers_MEAN, ehi5_val_MEAN - - ehi10_pref_MEAN, ehi10_pers_MEAN, ehi10_val_MEAN - - ehi5.10_pref_MEAN, ehi5.10_pers_MEAN, ehi5.10_val_MEAN - -TARGET VARIABLES: - DGEN Global Means (2 variables): - - ehiDGEN_5_mean (mean of 3 DGEN domains for 5-year) - - ehiDGEN_10_mean (mean of 3 DGEN domains for 10-year) - - Domain-Specific Global Means (3 variables): - - ehi5_global_mean (mean of 3 domain means for 5-year) - - ehi10_global_mean (mean of 3 domain means for 10-year) - - ehi5.10_global_mean (mean of 3 domain means for 5-10 change) - -TRANSFORMATION LOGIC: - Each global mean = average of 3 domain-level scores - - Example for ehiDGEN_5_mean: - = mean(ehiDGEN_5_Pref, ehiDGEN_5_Pers, ehiDGEN_5_Val) - - Example for ehi5_global_mean: - = mean(ehi5_pref_MEAN, ehi5_pers_MEAN, ehi5_val_MEAN) - - NA values excluded from calculation (na.rm = TRUE) - -QUALITY ASSURANCE: - - QA for all 5 global means across all rows - - First 5 rows displayed with detailed calculations - - Values shown with 5 decimal precision - - Pass/Fail status for each variable - - -================================================================================ -SCRIPT 15: datap 15 - education recoded ordinal 3.r -================================================================================ - -PURPOSE: - Recodes raw education categories (`demo_edu`) into an ordered 3-level factor - for analyses requiring an ordinal education variable. - -VARIABLES CREATED: 1 total - -SOURCE COLUMNS: - - demo_edu - -TARGET VARIABLES: - - edu3 (ordered factor with 3 levels) - -TRANSFORMATION LOGIC: - Map `demo_edu` to 3 ordered levels and store as an ordered factor: - - "HS_TS": High School (or equivalent), Trade School (non-military) - - "C_Ug": College Diploma/Certificate, University - Undergraduate - - "grad_prof": University - Graduate (Masters), University - PhD, Professional Degree (ex. 
JD/MD) - - Levels and order: - edu3 = factor(edu3, levels = c("HS_TS", "C_Ug", "grad_prof"), ordered = TRUE) - -QUALITY ASSURANCE: - - Prints frequency table for `edu3` and a cross-tab of `demo_edu` × `edu3` to - verify correct mapping and absence of unintended NAs. - - Saves updated dataset to `eohi2.csv`. - - -================================================================================ -SCRIPT 16: datap 16 - ehi vars standardized .r -================================================================================ - -PURPOSE: - Standardizes key EHI summary variables (z-scores) and creates a composite - standardized EHI mean (`stdEHI_mean`) for use in correlational and regression - analyses. - -VARIABLES CREATED: 5 total - -SOURCE COLUMNS: - - ehiDGEN_5_mean, ehiDGEN_10_mean - - ehi5_global_mean, ehi10_global_mean - -TARGET VARIABLES: - - stdDGEN_5 = z(ehiDGEN_5_mean) - - stdDGEN_10 = z(ehiDGEN_10_mean) - - stdDS_5 = z(ehi5_global_mean) - - stdDS_10 = z(ehi10_global_mean) - - stdEHI_mean = mean(stdDGEN_5, stdDGEN_10, stdDS_5, stdDS_10), row-wise - -TRANSFORMATION LOGIC: - Standardize each source variable using sample mean and SD (na.rm = TRUE): - stdX = (X - mean(X)) / sd(X) - - Then compute row-wise average across the four standardized variables: - stdEHI_mean = rowMeans(cbind(stdDGEN_5, stdDGEN_10, stdDS_5, stdDS_10), - na.rm = TRUE) - -QUALITY ASSURANCE: - - Prints pre-standardization means/SDs and post-standardization means/SDs to - confirm ~0 mean and ~1 SD for each standardized variable (allowing for NAs). - - Spot-checks random rows by recomputing standardized values and comparing to - stored columns. - - Saves updated dataset to `eohi2.csv`. - - -================================================================================ -SUMMARY OF ALL CREATED VARIABLES -================================================================================ - -Total Variables Created: 296 - -By Script: - - Script 01: 60 variables (past/future recoded items) - - Script 02: 15 variables (present recoded items) - - Script 03: 12 variables (DGEN domain scores) - - Script 04: 4 variables (DGEN time period means) - - Script 05: 3 variables (AOT & CRT scales) - - Script 06: 90 variables (time interval differences) - - Script 07: 18 variables (domain means for differences) - - Script 08: 6 variables (DGEN 5-vs-10 differences) - - Script 09: 11 variables (interval × direction means) - - Script 10: 6 variables (DGEN combined means) - - Script 11: 45 variables (domain-specific EHI scores) - - Script 12: 6 variables (DGEN EHI scores) - - Script 13: 9 variables (EHI domain means) - - Script 14: 5 variables (EHI global means) - - Script 15: 1 variable (education ordinal factor) - - Script 16: 5 variables (standardized EHI summaries and composite) - -By Category: - - Time Period Items (75 total): - * Present: 15 items - * Past 5: 15 items - * Past 10: 15 items - * Future 5: 15 items - * Future 10: 15 items - - - DGEN Variables (28 total): - * Domain scores: 12 (3 domains × 4 time periods) - * Time period means: 4 (1 per time period) - * 5-vs-10 differences: 6 (3 domains × 2 directions) - * Combined means: 6 (past, future, interval-based, domain-based) - - - Cognitive Scales (3 total): - * AOT total - * CRT correct - * CRT intuitive - - - Time Differences (90 total): - * NPast_5: 15 differences - * NPast_10: 15 differences - * NFut_5: 15 differences - * NFut_10: 15 differences - * 5.10past: 15 differences - * 5.10fut: 15 differences - - - Domain Means for Differences (18 total): - * NPast_5: 3 domain means - * 
NPast_10: 3 domain means - * NFut_5: 3 domain means - * NFut_10: 3 domain means - * 5.10past: 3 domain means - * 5.10fut: 3 domain means - - - Interval × Direction Means (11 total): - * Narrow-scope means: 6 (NPast_5, NPast_10, NFut_5, NFut_10, - X5.10past, X5.10fut) - * Global-scope means: 5 (NPast_global, NFut_global, X5.10_global, - N5_global, N10_global) - - - EHI Variables (65 total): - * Domain-specific EHI: 45 (15 items × 3 time intervals) - * DGEN EHI: 6 (3 domains × 2 time intervals) - * Domain means: 9 (3 domains × 3 time intervals) - * Global means: 5 (2 DGEN + 3 domain-specific) - - Standardized EHI Variables (5 total): - * stdDGEN_5, stdDGEN_10, stdDS_5, stdDS_10, stdEHI_mean - - Demographic Recodes (1 total): - * edu3 (ordered 3-level education factor, Script 15) - - -================================================================================ -DATA PROCESSING NOTES -================================================================================ - -1. PROCESSING ORDER: - Scripts MUST be run in numerical order (01 → 16) as later scripts depend - on variables created by earlier scripts. - - Key Dependencies: - - Script 03 required before Scripts 04, 08, 10, and 12 (DGEN scores) - - Script 04 required before Script 10 (DGEN time period means) - - Script 06 required before Scripts 07, 09, and 11 (time interval differences) - - Script 11 required before Script 13 (domain-specific EHI items) - - Script 12 required before Script 14 (DGEN EHI scores) - - Script 13 required before Script 14 (EHI domain means) - - Script 14 required before Script 16 (uses ehiDGEN_5/10_mean, ehi5/10_global_mean) - - Script 15 can run anytime after raw `demo_edu` is present; run before - analyses needing `edu3` - -2. SURVEY VERSION HANDLING: - - Two survey versions (01 and 02) were used - - Scripts 01 and 03 combine these versions - - Preference given to version 01 when both exist - -3. MISSING DATA: - - Empty cells and NA values are preserved throughout processing - - Calculations use na.rm=TRUE to exclude missing values from means - - Difference calculations result in NA if either source value is missing - -4. QUALITY ASSURANCE: - - Each script includes QA checks with row verification - - Manual calculation checks confirm proper transformations - - Column existence checks prevent errors from missing source data - - Scripts 09-16 include comprehensive QA with first 5 rows displayed - - All EHI scripts (11-14, 16) verify calculations against stored values - - Pass/Fail status reported for all variables in QA-enabled scripts - -5. FILE SAVING: - - Most scripts save directly to eohi2.csv - - Scripts 04, 06, and 07 have commented-out write commands for review - - Scripts 08 and 09, by contrast, save directly to eohi2.csv - - Each script overwrites existing target columns if present - -6. SPECIAL NAMING CONVENTIONS: - - "pref_tv" vs "pref_TV" inconsistency maintained from source data - - DGEN variables use underscores (DGEN_past_5_Pref) - - Difference variables use descriptive prefixes (NPast_5_, 5.10past_) - - "X" prefix added to variables starting with numbers (X5.10past_mean) - - Global means use "_global_" to distinguish from narrow-scope means - - -================================================================================ -ITEM REFERENCE GUIDE -================================================================================ - -15 Core Items (Used across all time periods): - -PREFERENCES (5 items): - 1. pref_read - Reading preferences - 2. pref_music - Music preferences - 3. pref_TV/tv - TV watching preferences (note case variation) - 4. pref_nap - Napping preferences - 5. 
pref_travel - Travel preferences - -PERSONALITY (5 items): - 6. pers_extravert - Extraverted personality - 7. pers_critical - Critical thinking personality - 8. pers_dependable - Dependable personality - 9. pers_anxious - Anxious personality - 10. pers_complex - Complex personality - -VALUES (5 items): - 11. val_obey - Value of obedience - 12. val_trad - Value of tradition - 13. val_opinion - Value of expressing opinions - 14. val_performance - Value of performance - 15. val_justice - Value of justice - - -================================================================================ -EHI CONCEPT AND INTERPRETATION -================================================================================ - -ENDURING HEDONIC IMPACT (EHI): - EHI measures the asymmetry between perceived past and future change in - psychological attributes. The concept is based on the premise that people - may perceive their past and future selves differently, even when considering - equivalent time distances. - -KEY EHI VARIABLES: - - Domain-Specific EHI (Scripts 11, 13, 14): - Calculated from item-level differences between past and future responses - Formula: NPast - NFut - * Positive values: Greater perceived change from past to present - * Negative values: Greater perceived change from present to future - * Zero: Symmetric perception of past and future change - - - Domain-General EHI (Scripts 12, 14): - Calculated from DGEN single-item responses - Formula: DGEN_past - DGEN_fut - * Measures broader temporal self-perception without item-level detail - -HIERARCHICAL STRUCTURE: - Level 1: Item-level EHI (45 domain-specific, 6 DGEN) - Level 2: Domain means (9 domain-specific, combining 5 items each) - Level 3: Global means (5 highest-level summaries) - -INTERPRETATION: - - EHI > 0: "Past asymmetry" - Person perceives greater change from past - - EHI < 0: "Future asymmetry" - Person perceives greater change to future - - EHI ≈ 0: "Temporal symmetry" - Balanced perception of past/future change - - -================================================================================ -END OF DOCUMENTATION -================================================================================ -Last Updated: October 29, 2025 -Processing Pipeline: Scripts 01-16 - diff --git a/.history/eohi2/README_Variable_Creation_20251029133433.txt b/.history/eohi2/README_Variable_Creation_20251029133433.txt deleted file mode 100644 index e5821a2..0000000 --- a/.history/eohi2/README_Variable_Creation_20251029133433.txt +++ /dev/null @@ -1,1047 +0,0 @@ -================================================================================ -EOHI2 DATA PROCESSING PIPELINE - VARIABLE CREATION DOCUMENTATION -================================================================================ - -This README documents the complete data processing pipeline for eohi2.csv. -All processing scripts should be run in the order listed below. - -Source File: eohi2.csv -Processing Scripts: dataP 01 through datap 16 - -================================================================================ -SCRIPT 01: dataP 01 - recode and combine past & future vars.r -================================================================================ - -PURPOSE: - Combines responses from two survey versions (01 and 02) and recodes Likert - scale text responses to numeric values for past and future time periods. 
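-
-ILLUSTRATIVE SKETCH (R):
-  A minimal sketch of the combine-then-recode logic detailed below, shown
-  for one item (not the script's verbatim code; `df` is a hypothetical data
-  frame; with read.csv defaults, raw names beginning with a digit carry an
-  "X" prefix, e.g. X01past5PrefItem_1):
-
-    likert <- c("Strongly Disagree" = -3, "Disagree" = -2,
-                "Somewhat Disagree" = -1, "Neither Agree nor Disagree" = 0,
-                "Somewhat Agree" = 1, "Agree" = 2, "Strongly Agree" = 3)
-    a <- df[["X01past5PrefItem_1"]]              # Set A (version 01)
-    b <- df[["X02past5PrefItem_1"]]              # Set B (version 02)
-    raw <- ifelse(!is.na(a) & a != "", a, b)     # prefer Set A when present
-    df$past_5_pref_read <- unname(likert[raw])   # unmatched/empty -> NA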
- -VARIABLES CREATED: 60 total (15 items × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefItem_1 through 01fut10ValItem_5 (60 columns) - - Set B: 02past5PrefItem_1 through 02fut10ValItem_5 (60 columns) - -TARGET VARIABLES: - Past 5 Years (15 variables): - - past_5_pref_read, past_5_pref_music, past_5_pref_TV, past_5_pref_nap, - past_5_pref_travel - - past_5_pers_extravert, past_5_pers_critical, past_5_pers_dependable, - past_5_pers_anxious, past_5_pers_complex - - past_5_val_obey, past_5_val_trad, past_5_val_opinion, - past_5_val_performance, past_5_val_justice - - Past 10 Years (15 variables): - - past_10_pref_read, past_10_pref_music, past_10_pref_TV, past_10_pref_nap, - past_10_pref_travel - - past_10_pers_extravert, past_10_pers_critical, past_10_pers_dependable, - past_10_pers_anxious, past_10_pers_complex - - past_10_val_obey, past_10_val_trad, past_10_val_opinion, - past_10_val_performance, past_10_val_justice - - Future 5 Years (15 variables): - - fut_5_pref_read, fut_5_pref_music, fut_5_pref_TV, fut_5_pref_nap, - fut_5_pref_travel - - fut_5_pers_extravert, fut_5_pers_critical, fut_5_pers_dependable, - fut_5_pers_anxious, fut_5_pers_complex - - fut_5_val_obey, fut_5_val_trad, fut_5_val_opinion, - fut_5_val_performance, fut_5_val_justice - - Future 10 Years (15 variables): - - fut_10_pref_read, fut_10_pref_music, fut_10_pref_TV, fut_10_pref_nap, - fut_10_pref_travel - - fut_10_pers_extravert, fut_10_pers_critical, fut_10_pers_dependable, - fut_10_pers_anxious, fut_10_pers_complex - - fut_10_val_obey, fut_10_val_trad, fut_10_val_opinion, - fut_10_val_performance, fut_10_val_justice - -TRANSFORMATION LOGIC: - Step 1: Combine responses from Set A (01) and Set B (02) - - If Set A has a value, use Set A - - If Set A is empty, use Set B - - Step 2: Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -ITEM DOMAINS: - - Preferences (pref): Reading, Music, TV, Nap, Travel - - Personality (pers): Extravert, Critical, Dependable, Anxious, Complex - - Values (val): Obey, Tradition, Opinion, Performance, Justice - - -================================================================================ -SCRIPT 02: dataP 02 - recode present VARS.r -================================================================================ - -PURPOSE: - Recodes present-time Likert scale text responses to numeric values. - -VARIABLES CREATED: 15 total - -SOURCE COLUMNS: - - prePrefItem_1 through prePrefItem_5 (5 columns) - - prePersItem_1 through prePersItem_5 (5 columns) - - preValItem_1 through preValItem_5 (5 columns) - -TARGET VARIABLES: - Present Time (15 variables): - - present_pref_read, present_pref_music, present_pref_tv, present_pref_nap, - present_pref_travel - - present_pers_extravert, present_pers_critical, present_pers_dependable, - present_pers_anxious, present_pers_complex - - present_val_obey, present_val_trad, present_val_opinion, - present_val_performance, present_val_justice - -TRANSFORMATION LOGIC: - Recode text responses to numeric values: - "Strongly Disagree" → -3 - "Disagree" → -2 - "Somewhat Disagree" → -1 - "Neither Agree nor Disagree" → 0 - "Somewhat Agree" → 1 - "Agree" → 2 - "Strongly Agree" → 3 - Empty/Missing → NA - -SPECIAL NOTE: - Present time uses "present_pref_tv" (lowercase) while past/future use - "past_5_pref_TV" (uppercase). 
This is intentional and preserved from the - original data structure. - - -================================================================================ -SCRIPT 03: dataP 03 - recode DGEN vars.r -================================================================================ - -PURPOSE: - Combines DGEN (domain general) responses from two survey versions (01 and 02). - These are single-item measures for each domain/time combination. - NO RECODING - just copies numeric values as-is. - -VARIABLES CREATED: 12 total (3 domains × 4 time periods) - -SOURCE COLUMNS: - - Set A: 01past5PrefDGEN_1, 01past5PersDGEN_1, 01past5ValDGEN_1, etc. - - Set B: 02past5PrefDGEN_1, 02past5PersDGEN_1, 02past5ValDGEN_1, etc. - -TARGET VARIABLES: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TRANSFORMATION LOGIC: - - If Set A (01) has a value, use Set A - - If Set A is empty, use Set B (02) - - NO RECODING: Values are copied directly as numeric - -SPECIAL NOTES: - - Future columns in raw data use "_8" suffix for Pref/Pers items - - Future Val columns use "ValuesDGEN" spelling in Set A, "ValDGEN" in Set B - - -================================================================================ -SCRIPT 04: dataP 04 - DGEN means.r -================================================================================ - -PURPOSE: - Calculates mean DGEN scores by averaging the three domain scores (Preferences, - Personality, Values) for each time period. - -VARIABLES CREATED: 4 total (1 per time period) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - - DGEN_past_5_mean - - DGEN_past_10_mean - - DGEN_fut_5_mean - - DGEN_fut_10_mean - -TRANSFORMATION LOGIC: - Each mean = (Pref + Pers + Val) / 3 - - NA values are excluded from calculation (na.rm = TRUE) - - -================================================================================ -SCRIPT 05: dataP 05 - recode scales VARS.r -================================================================================ - -PURPOSE: - Processes two cognitive scales: - 1. AOT (Actively Open-minded Thinking): 8-item scale with reverse coding - 2. CRT (Cognitive Reflection Test): 3-item test with correct/intuitive scoring - -VARIABLES CREATED: 3 total - -SOURCE COLUMNS: - AOT Scale: - - aot_1, aot_2, aot_3, aot_4, aot_5, aot_6, aot_7, aot_8 - - CRT Test: - - crt_1, crt_2, crt_3 - -TARGET VARIABLES: - - aot_total (mean of 8 items with reverse coding) - - crt_correct (proportion of correct answers) - - crt_int (proportion of intuitive/incorrect answers) - -TRANSFORMATION LOGIC: - - AOT Scale (aot_total): - 1. Items 4, 5, 6, 7 are reverse coded by multiplying by -1 - 2. Calculate mean of all 8 items (with reverse coding applied) - 3. Original source values are NOT modified in the dataframe - 4. 
NA values excluded from calculation (na.rm = TRUE) - - CRT Correct (crt_correct): - Correct answers: - - crt_1: "5 cents" - - crt_2: "5 minutes" - - crt_3: "47 days" - Calculation: (Number of correct answers) / (Number of non-missing answers) - - CRT Intuitive (crt_int): - Intuitive (common incorrect) answers: - - crt_1: "10 cents" - - crt_2: "100 minutes" - - crt_3: "24 days" - Calculation: (Number of intuitive answers) / (Number of non-missing answers) - -SPECIAL NOTES: - - CRT scoring is case-insensitive and trims whitespace - - Both CRT scores are proportions (0.00 to 1.00) - - Empty/missing CRT responses are excluded from denominator - - -================================================================================ -SCRIPT 06: dataP 06 - time interval differences.r -================================================================================ - -PURPOSE: - Calculates absolute differences between time intervals to measure perceived - change across time periods for all 15 items. - -VARIABLES CREATED: 90 total (6 difference types × 15 items) - -SOURCE COLUMNS: - - present_pref_read through present_val_justice (15 columns) - - past_5_pref_read through past_5_val_justice (15 columns) - - past_10_pref_read through past_10_val_justice (15 columns) - - fut_5_pref_read through fut_5_val_justice (15 columns) - - fut_10_pref_read through fut_10_val_justice (15 columns) - -TARGET VARIABLES (by difference type): - - NPast_5 (Present vs Past 5 years) - 15 variables: - Formula: |present - past_5| - - NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, NPast_5_pref_nap, - NPast_5_pref_travel - - NPast_5_pers_extravert, NPast_5_pers_critical, NPast_5_pers_dependable, - NPast_5_pers_anxious, NPast_5_pers_complex - - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice - - NPast_10 (Present vs Past 10 years) - 15 variables: - Formula: |present - past_10| - - NPast_10_pref_read, NPast_10_pref_music, NPast_10_pref_TV, - NPast_10_pref_nap, NPast_10_pref_travel - - NPast_10_pers_extravert, NPast_10_pers_critical, NPast_10_pers_dependable, - NPast_10_pers_anxious, NPast_10_pers_complex - - NPast_10_val_obey, NPast_10_val_trad, NPast_10_val_opinion, - NPast_10_val_performance, NPast_10_val_justice - - NFut_5 (Present vs Future 5 years) - 15 variables: - Formula: |present - fut_5| - - NFut_5_pref_read, NFut_5_pref_music, NFut_5_pref_TV, NFut_5_pref_nap, - NFut_5_pref_travel - - NFut_5_pers_extravert, NFut_5_pers_critical, NFut_5_pers_dependable, - NFut_5_pers_anxious, NFut_5_pers_complex - - NFut_5_val_obey, NFut_5_val_trad, NFut_5_val_opinion, - NFut_5_val_performance, NFut_5_val_justice - - NFut_10 (Present vs Future 10 years) - 15 variables: - Formula: |present - fut_10| - - NFut_10_pref_read, NFut_10_pref_music, NFut_10_pref_TV, NFut_10_pref_nap, - NFut_10_pref_travel - - NFut_10_pers_extravert, NFut_10_pers_critical, NFut_10_pers_dependable, - NFut_10_pers_anxious, NFut_10_pers_complex - - NFut_10_val_obey, NFut_10_val_trad, NFut_10_val_opinion, - NFut_10_val_performance, NFut_10_val_justice - - 5.10past (Past 5 vs Past 10 years) - 15 variables: - Formula: |past_5 - past_10| - - 5.10past_pref_read, 5.10past_pref_music, 5.10past_pref_TV, - 5.10past_pref_nap, 5.10past_pref_travel - - 5.10past_pers_extravert, 5.10past_pers_critical, 5.10past_pers_dependable, - 5.10past_pers_anxious, 5.10past_pers_complex - - 5.10past_val_obey, 5.10past_val_trad, 5.10past_val_opinion, - 5.10past_val_performance, 5.10past_val_justice - - 5.10fut (Future 5 vs Future 
10 years) - 15 variables: - Formula: |fut_5 - fut_10| - - 5.10fut_pref_read, 5.10fut_pref_music, 5.10fut_pref_TV, 5.10fut_pref_nap, - 5.10fut_pref_travel - - 5.10fut_pers_extravert, 5.10fut_pers_critical, 5.10fut_pers_dependable, - 5.10fut_pers_anxious, 5.10fut_pers_complex - - 5.10fut_val_obey, 5.10fut_val_trad, 5.10fut_val_opinion, - 5.10fut_val_performance, 5.10fut_val_justice - -TRANSFORMATION LOGIC: - All calculations use absolute differences: - - NPast_5: |present_[item] - past_5_[item]| - - NPast_10: |present_[item] - past_10_[item]| - - NFut_5: |present_[item] - fut_5_[item]| - - NFut_10: |present_[item] - fut_10_[item]| - - 5.10past: |past_5_[item] - past_10_[item]| - - 5.10fut: |fut_5_[item] - fut_10_[item]| - - Result: Always positive values representing magnitude of change - Missing values in either source column result in NA - -SPECIAL NOTES: - - Present time uses "pref_tv" (lowercase) while past/future use "pref_TV" - (uppercase), so script handles this naming inconsistency - - All values are absolute differences (non-negative) - - -================================================================================ -SCRIPT 07: dataP 07 - domain means.r -================================================================================ - -PURPOSE: - Calculates domain-level means by averaging the 5 items within each domain - (Preferences, Personality, Values) for each of the 6 time interval difference - types. - -VARIABLES CREATED: 18 total (6 time intervals × 3 domains) - -SOURCE COLUMNS: - - NPast_5_pref_read through NPast_5_val_justice (15 columns) - - NPast_10_pref_read through NPast_10_val_justice (15 columns) - - NFut_5_pref_read through NFut_5_val_justice (15 columns) - - NFut_10_pref_read through NFut_10_val_justice (15 columns) - - 5.10past_pref_read through 5.10past_val_justice (15 columns) - - 5.10fut_pref_read through 5.10fut_val_justice (15 columns) - Total: 90 difference columns (created in Script 06) - -TARGET VARIABLES: - NPast_5 Domain Means (3 variables): - - NPast_5_pref_MEAN (mean of 5 preference items) - - NPast_5_pers_MEAN (mean of 5 personality items) - - NPast_5_val_MEAN (mean of 5 values items) - - NPast_10 Domain Means (3 variables): - - NPast_10_pref_MEAN - - NPast_10_pers_MEAN - - NPast_10_val_MEAN - - NFut_5 Domain Means (3 variables): - - NFut_5_pref_MEAN - - NFut_5_pers_MEAN - - NFut_5_val_MEAN - - NFut_10 Domain Means (3 variables): - - NFut_10_pref_MEAN - - NFut_10_pers_MEAN - - NFut_10_val_MEAN - - 5.10past Domain Means (3 variables): - - 5.10past_pref_MEAN - - 5.10past_pers_MEAN - - 5.10past_val_MEAN - - 5.10fut Domain Means (3 variables): - - 5.10fut_pref_MEAN - - 5.10fut_pers_MEAN - - 5.10fut_val_MEAN - -TRANSFORMATION LOGIC: - Each domain mean = average of 5 items within that domain - - Example for NPast_5_pref_MEAN: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel) - - Example for NFut_10_pers_MEAN: - = mean(NFut_10_pers_extravert, NFut_10_pers_critical, - NFut_10_pers_dependable, NFut_10_pers_anxious, - NFut_10_pers_complex) - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF DOMAIN MEANS: - - Provides higher-level summary of perceived change by domain - - Reduces item-level noise by aggregating across related items - - Enables domain-level comparisons across time intervals - - Parallel to Script 04 (DGEN means) but for difference scores instead of - raw DGEN ratings - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - Creates domain-level aggregates 
of absolute difference scores - - All means are averages of non-negative values (absolute differences) - - -================================================================================ -SCRIPT 08: dataP 08 - DGEN 510 vars.r -================================================================================ - -PURPOSE: - Calculates absolute differences between 5-year and 10-year DGEN ratings for - both Past and Future time directions. These variables measure the perceived - difference in domain-general change between the two time intervals. - -VARIABLES CREATED: 6 total (3 domains × 2 time directions) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - Total: 12 DGEN columns (created in Script 03) - -TARGET VARIABLES: - Past Direction (3 variables): - - X5_10DGEN_past_pref (|DGEN_past_5_Pref - DGEN_past_10_Pref|) - - X5_10DGEN_past_pers (|DGEN_past_5_Pers - DGEN_past_10_Pers|) - - X5_10DGEN_past_val (|DGEN_past_5_Val - DGEN_past_10_Val|) - - Future Direction (3 variables): - - X5_10DGEN_fut_pref (|DGEN_fut_5_Pref - DGEN_fut_10_Pref|) - - X5_10DGEN_fut_pers (|DGEN_fut_5_Pers - DGEN_fut_10_Pers|) - - X5_10DGEN_fut_val (|DGEN_fut_5_Val - DGEN_fut_10_Val|) - -TRANSFORMATION LOGIC: - Formula: |DGEN_5 - DGEN_10| - - All calculations use absolute differences: - - Past Preferences: |DGEN_past_5_Pref - DGEN_past_10_Pref| - - Past Personality: |DGEN_past_5_Pers - DGEN_past_10_Pers| - - Past Values: |DGEN_past_5_Val - DGEN_past_10_Val| - - Future Preferences: |DGEN_fut_5_Pref - DGEN_fut_10_Pref| - - Future Personality: |DGEN_fut_5_Pers - DGEN_fut_10_Pers| - - Future Values: |DGEN_fut_5_Val - DGEN_fut_10_Val| - - Result: Always positive values representing magnitude of difference - Missing values in either source column result in NA - -SPECIAL NOTES: - - Variable names use "X" prefix because R automatically adds it to column - names starting with numbers (5_10 becomes X5_10) - - This script depends on Script 03 being run first - - Measures interval effects within time direction (past vs future) - - Parallel to Script 06's 5.10past and 5.10fut variables but for DGEN scores - - -================================================================================ -SCRIPT 09: dataP 09 - interval x direction means.r -================================================================================ - -PURPOSE: - Calculates comprehensive mean scores by averaging item-level differences - across intervals and directions. Creates both narrow-scope means (single - time interval) and broad-scope global means (combining multiple intervals). 
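-
-ILLUSTRATIVE SKETCH (R):
-  A minimal sketch of the narrow-scope and global-scope means detailed
-  below (not the script's verbatim code; `df` is a hypothetical data frame;
-  the setdiff() guard keeps the Script 07 *_MEAN domain means, and any
-  previously created *_mean summaries, out of the item sets):
-
-    means   <- grep("MEAN$", names(df), value = TRUE, ignore.case = TRUE)
-    npast5  <- setdiff(grep("^NPast_5_",  names(df), value = TRUE), means)
-    npast10 <- setdiff(grep("^NPast_10_", names(df), value = TRUE), means)
-    # Narrow scope: all 15 item-level differences within one interval
-    df$NPast_5_mean <- rowMeans(df[, npast5], na.rm = TRUE)
-    # Global scope: 30 items spanning both past intervals
-    df$NPast_global_mean <- rowMeans(df[, c(npast5, npast10)], na.rm = TRUE)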
- -VARIABLES CREATED: 11 total (6 narrow-scope + 5 global-scope) - -SOURCE COLUMNS: - All 90 difference variables created in Script 06: - - NPast_5_[domain]_[item] (15 variables) - - NPast_10_[domain]_[item] (15 variables) - - NFut_5_[domain]_[item] (15 variables) - - NFut_10_[domain]_[item] (15 variables) - - X5.10past_[domain]_[item] (15 variables) - - X5.10fut_[domain]_[item] (15 variables) - -TARGET VARIABLES: - - Narrow-Scope Means (15 source items each): - - NPast_5_mean (mean across all 15 NPast_5 items) - - NPast_10_mean (mean across all 15 NPast_10 items) - - NFut_5_mean (mean across all 15 NFut_5 items) - - NFut_10_mean (mean across all 15 NFut_10 items) - - X5.10past_mean (mean across all 15 X5.10past items) - - X5.10fut_mean (mean across all 15 X5.10fut items) - - Global-Scope Means (30 source items each): - - NPast_global_mean (NPast_5 + NPast_10: all past intervals) - - NFut_global_mean (NFut_5 + NFut_10: all future intervals) - - X5.10_global_mean (X5.10past + X5.10fut: all 5-vs-10 intervals) - - N5_global_mean (NPast_5 + NFut_5: all 5-year intervals) - - N10_global_mean (NPast_10 + NFut_10: all 10-year intervals) - -TRANSFORMATION LOGIC: - - Narrow-Scope Means (15 items each): - Each mean averages all 15 difference items within one time interval - - Example for NPast_5_mean: - = mean(NPast_5_pref_read, NPast_5_pref_music, NPast_5_pref_TV, - NPast_5_pref_nap, NPast_5_pref_travel, - NPast_5_pers_extravert, NPast_5_pers_critical, - NPast_5_pers_dependable, NPast_5_pers_anxious, - NPast_5_pers_complex, - NPast_5_val_obey, NPast_5_val_trad, NPast_5_val_opinion, - NPast_5_val_performance, NPast_5_val_justice) - - Global-Scope Means (30 items each): - Each mean averages 30 difference items across two related intervals - - Example for NPast_global_mean: - = mean(all 15 NPast_5 items + all 15 NPast_10 items) - Represents overall perceived change from present to any past timepoint - - Example for N5_global_mean: - = mean(all 15 NPast_5 items + all 15 NFut_5 items) - Represents overall perceived change at 5-year interval regardless of - direction - - NA values excluded from calculation (na.rm = TRUE) - -PURPOSE OF INTERVAL × DIRECTION MEANS: - - Narrow-scope means: Single-interval summaries across all domains and items - - Global-scope means: Cross-interval summaries for testing: - * Direction effects (past vs future) - * Interval effects (5-year vs 10-year) - * Combined temporal distance effects - - Enables comprehensive analysis of temporal self-perception patterns - - Reduces item-level and domain-level noise through broad aggregation - -QUALITY ASSURANCE: - - Script includes automated QA checks for first 5 rows - - Manually recalculates each mean and verifies against stored values - - Prints TRUE/FALSE match status for each variable - - Ensures calculation accuracy before further analysis - -SPECIAL NOTES: - - This script depends on Script 06 being run first - - All means are averages of absolute difference scores (non-negative) - - Global means provide the broadest temporal self-perception summaries - - Naming convention uses "global" for 30-item means, no suffix for 15-item - - -================================================================================ -SCRIPT 10: dataP 10 - DGEN mean vars.r -================================================================================ - -PURPOSE: - Calculates mean DGEN scores by averaging across different time combinations. - Creates means for Past, Future, and interval-based (5-year, 10-year) groupings. 
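-
-ILLUSTRATIVE SKETCH (R):
-  A minimal sketch of the aggregation detailed below (not the script's
-  verbatim code). Assumes the 12 DGEN domain columns (Script 03) and the
-  four DGEN time-period means (Script 04) are present; `df` is a
-  hypothetical data frame holding eohi2.csv:
-
-    # Direction-based: average the two interval means within each direction
-    df$DGEN_past_mean <- rowMeans(df[, c("DGEN_past_5_mean", "DGEN_past_10_mean")], na.rm = TRUE)
-    df$DGEN_fut_mean  <- rowMeans(df[, c("DGEN_fut_5_mean", "DGEN_fut_10_mean")], na.rm = TRUE)
-    # Interval-based: average past and future means within each interval
-    df$DGEN_5_mean  <- rowMeans(df[, c("DGEN_past_5_mean", "DGEN_fut_5_mean")], na.rm = TRUE)
-    df$DGEN_10_mean <- rowMeans(df[, c("DGEN_past_10_mean", "DGEN_fut_10_mean")], na.rm = TRUE)
-    # Domain-based: average one domain's raw scores across all 4 time periods
-    df$DGEN_pref_mean <- rowMeans(df[, c("DGEN_past_5_Pref", "DGEN_past_10_Pref",
-                                         "DGEN_fut_5_Pref", "DGEN_fut_10_Pref")], na.rm = TRUE)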
- -VARIABLES CREATED: 6 total - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - - DGEN_past_5_mean, DGEN_past_10_mean, DGEN_fut_5_mean, DGEN_fut_10_mean - (time-period means created in Script 04; inputs to the direction- and - interval-based means below) - -TARGET VARIABLES: - Direction-Based Means (2 variables): - - DGEN_past_mean (mean of past_5_mean and past_10_mean) - - DGEN_fut_mean (mean of fut_5_mean and fut_10_mean) - - Interval-Based Means (2 variables): - - DGEN_5_mean (mean of past_5_mean and fut_5_mean) - - DGEN_10_mean (mean of past_10_mean and fut_10_mean) - - Domain-Based Means (2 variables): - - DGEN_pref_mean (mean across all 4 time periods for Preferences) - - DGEN_pers_mean (mean across all 4 time periods for Personality) - -TRANSFORMATION LOGIC: - Direction-based: - - DGEN_past_mean = mean(DGEN_past_5_mean, DGEN_past_10_mean) - - DGEN_fut_mean = mean(DGEN_fut_5_mean, DGEN_fut_10_mean) - - Interval-based: - - DGEN_5_mean = mean(DGEN_past_5_mean, DGEN_fut_5_mean) - - DGEN_10_mean = mean(DGEN_past_10_mean, DGEN_fut_10_mean) - - Domain-based: - - DGEN_pref_mean = mean across all 4 Pref scores - - DGEN_pers_mean = mean across all 4 Pers scores - - NA values excluded from calculation (na.rm = TRUE) - - -================================================================================ -SCRIPT 11: dataP 11 - CORRECT ehi vars.r -================================================================================ - -PURPOSE: - Creates Enduring Hedonic Impact (EHI) variables by calculating differences - between Past and Future responses for each item across different time intervals. - Formula: NPast - NFut (positive values indicate greater past-present change) - -VARIABLES CREATED: 45 total (15 items × 3 time intervals) - -SOURCE COLUMNS: - 5-year intervals: - - NPast_5_pref_read through NPast_5_val_justice (15 columns) - - NFut_5_pref_read through NFut_5_val_justice (15 columns) - - 10-year intervals: - - NPast_10_pref_read through NPast_10_val_justice (15 columns) - - NFut_10_pref_read through NFut_10_val_justice (15 columns) - - 5-10 year change: - - X5.10past_pref_read through X5.10past_val_justice (15 columns) - - X5.10fut_pref_read through X5.10fut_val_justice (15 columns) - -TARGET VARIABLES: - 5-Year EHI Variables (15 variables): - - ehi5_pref_read, ehi5_pref_music, ehi5_pref_TV, ehi5_pref_nap, - ehi5_pref_travel - - ehi5_pers_extravert, ehi5_pers_critical, ehi5_pers_dependable, - ehi5_pers_anxious, ehi5_pers_complex - - ehi5_val_obey, ehi5_val_trad, ehi5_val_opinion, ehi5_val_performance, - ehi5_val_justice - - 10-Year EHI Variables (15 variables): - - ehi10_pref_read, ehi10_pref_music, ehi10_pref_TV, ehi10_pref_nap, - ehi10_pref_travel - - ehi10_pers_extravert, ehi10_pers_critical, ehi10_pers_dependable, - ehi10_pers_anxious, ehi10_pers_complex - - ehi10_val_obey, ehi10_val_trad, ehi10_val_opinion, ehi10_val_performance, - ehi10_val_justice - - 5-10 Year Change EHI Variables (15 variables): - - ehi5.10_pref_read, ehi5.10_pref_music, ehi5.10_pref_TV, ehi5.10_pref_nap, - ehi5.10_pref_travel - - ehi5.10_pers_extravert, ehi5.10_pers_critical, ehi5.10_pers_dependable, - ehi5.10_pers_anxious, ehi5.10_pers_complex - - ehi5.10_val_obey, ehi5.10_val_trad, ehi5.10_val_opinion, - ehi5.10_val_performance, ehi5.10_val_justice - -TRANSFORMATION LOGIC: - Formula: NPast - NFut - - All calculations use signed differences: - - ehi5_[item] = NPast_5_[item] - NFut_5_[item] - - ehi10_[item] = NPast_10_[item] - NFut_10_[item] - - ehi5.10_[item] = X5.10past_[item] - 
X5.10fut_[item] - - Result: Positive = greater past change, Negative = greater future change - Missing values in either source column result in NA - -QUALITY ASSURANCE: - - Comprehensive QA checks for all 45 variables across all rows - - First 5 rows displayed with detailed calculations showing source values, - computed differences, and stored values - - Pass/Fail status for each variable reported - - -================================================================================ -SCRIPT 12: dataP 12 - CORRECT DGEN ehi vars.r -================================================================================ - -PURPOSE: - Creates domain-general EHI variables by calculating differences between Past - and Future DGEN responses. These are the domain-general parallel to Script 11's - domain-specific EHI variables. - -VARIABLES CREATED: 6 total (3 domains × 2 time intervals) - -SOURCE COLUMNS: - - DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val - - DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val - - DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val - - DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -TARGET VARIABLES: - 5-Year DGEN EHI (3 variables): - - ehiDGEN_5_Pref - - ehiDGEN_5_Pers - - ehiDGEN_5_Val - - 10-Year DGEN EHI (3 variables): - - ehiDGEN_10_Pref - - ehiDGEN_10_Pers - - ehiDGEN_10_Val - -TRANSFORMATION LOGIC: - Formula: DGEN_past - DGEN_fut - - All calculations use signed differences: - - ehiDGEN_5_Pref = DGEN_past_5_Pref - DGEN_fut_5_Pref - - ehiDGEN_5_Pers = DGEN_past_5_Pers - DGEN_fut_5_Pers - - ehiDGEN_5_Val = DGEN_past_5_Val - DGEN_fut_5_Val - - ehiDGEN_10_Pref = DGEN_past_10_Pref - DGEN_fut_10_Pref - - ehiDGEN_10_Pers = DGEN_past_10_Pers - DGEN_fut_10_Pers - - ehiDGEN_10_Val = DGEN_past_10_Val - DGEN_fut_10_Val - - Result: Positive = greater past change, Negative = greater future change - -QUALITY ASSURANCE: - - QA checks for all 6 variables across all rows - - First 5 rows displayed with detailed calculations - - Pass/Fail status for each variable reported - - -================================================================================ -SCRIPT 13: datap 13 - ehi domain specific means.r -================================================================================ - -PURPOSE: - Calculates domain-level mean EHI scores by averaging the 5 items within each - domain (Preferences, Personality, Values) for each time interval. 
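-
-ILLUSTRATIVE SKETCH (R):
-  A minimal sketch of the domain-mean computation detailed below (not the
-  script's verbatim code; `df` is a hypothetical data frame holding
-  eohi2.csv, with the Script 11 EHI item columns already present):
-
-    pref5 <- c("ehi5_pref_read", "ehi5_pref_music", "ehi5_pref_TV",
-               "ehi5_pref_nap", "ehi5_pref_travel")
-    # Mean of the 5 preference items at the 5-year interval
-    df$ehi5_pref_MEAN <- rowMeans(df[, pref5], na.rm = TRUE)
-    # The same pattern repeats for the pers/val item sets and for the
-    # ehi10_ and ehi5.10_ intervals (9 domain means in all)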
- -VARIABLES CREATED: 9 total (3 domains × 3 time intervals) - -SOURCE COLUMNS: - - ehi5_pref_read through ehi5_val_justice (15 columns) - - ehi10_pref_read through ehi10_val_justice (15 columns) - - ehi5.10_pref_read through ehi5.10_val_justice (15 columns) - -TARGET VARIABLES: - 5-Year Domain Means (3 variables): - - ehi5_pref_MEAN (mean of 5 preference items) - - ehi5_pers_MEAN (mean of 5 personality items) - - ehi5_val_MEAN (mean of 5 values items) - - 10-Year Domain Means (3 variables): - - ehi10_pref_MEAN - - ehi10_pers_MEAN - - ehi10_val_MEAN - - 5-10 Year Change Domain Means (3 variables): - - ehi5.10_pref_MEAN - - ehi5.10_pers_MEAN - - ehi5.10_val_MEAN - -TRANSFORMATION LOGIC: - Each domain mean = average of 5 items within that domain - - Example for ehi5_pref_MEAN: - = mean(ehi5_pref_read, ehi5_pref_music, ehi5_pref_TV, - ehi5_pref_nap, ehi5_pref_travel) - - NA values excluded from calculation (na.rm = TRUE) - -QUALITY ASSURANCE: - - Comprehensive QA for all 9 variables across all rows - - First 5 rows displayed for multiple domain means - - Pass/Fail status for each variable - - -================================================================================ -SCRIPT 14: datap 14 - all ehi global means.r -================================================================================ - -PURPOSE: - Calculates global EHI means by averaging domain-level means. Creates the - highest-level summary scores for EHI across both domain-general and - domain-specific measures. - -VARIABLES CREATED: 5 total - -SOURCE COLUMNS: - - ehiDGEN_5_Pref, ehiDGEN_5_Pers, ehiDGEN_5_Val - - ehiDGEN_10_Pref, ehiDGEN_10_Pers, ehiDGEN_10_Val - - ehi5_pref_MEAN, ehi5_pers_MEAN, ehi5_val_MEAN - - ehi10_pref_MEAN, ehi10_pers_MEAN, ehi10_val_MEAN - - ehi5.10_pref_MEAN, ehi5.10_pers_MEAN, ehi5.10_val_MEAN - -TARGET VARIABLES: - DGEN Global Means (2 variables): - - ehiDGEN_5_mean (mean of 3 DGEN domains for 5-year) - - ehiDGEN_10_mean (mean of 3 DGEN domains for 10-year) - - Domain-Specific Global Means (3 variables): - - ehi5_global_mean (mean of 3 domain means for 5-year) - - ehi10_global_mean (mean of 3 domain means for 10-year) - - ehi5.10_global_mean (mean of 3 domain means for 5-10 change) - -TRANSFORMATION LOGIC: - Each global mean = average of 3 domain-level scores - - Example for ehiDGEN_5_mean: - = mean(ehiDGEN_5_Pref, ehiDGEN_5_Pers, ehiDGEN_5_Val) - - Example for ehi5_global_mean: - = mean(ehi5_pref_MEAN, ehi5_pers_MEAN, ehi5_val_MEAN) - - NA values excluded from calculation (na.rm = TRUE) - -QUALITY ASSURANCE: - - QA for all 5 global means across all rows - - First 5 rows displayed with detailed calculations - - Values shown with 5 decimal precision - - Pass/Fail status for each variable - - -================================================================================ -SCRIPT 15: datap 15 - education recoded ordinal 3.r -================================================================================ - -PURPOSE: - Recodes raw education categories (`demo_edu`) into an ordered 3-level factor - for analyses requiring an ordinal education variable. - -VARIABLES CREATED: 1 total - -SOURCE COLUMNS: - - demo_edu - -TARGET VARIABLES: - - edu3 (ordered factor with 3 levels) - -TRANSFORMATION LOGIC: - Map `demo_edu` to 3 ordered levels and store as an ordered factor: - - "HS_TS": High School (or equivalent), Trade School (non-military) - - "C_Ug": College Diploma/Certificate, University - Undergraduate - - "grad_prof": University - Graduate (Masters), University - PhD, Professional Degree (ex. 
JD/MD) - - Levels and order: - edu3 = factor(edu3, levels = c("HS_TS", "C_Ug", "grad_prof"), ordered = TRUE) - -QUALITY ASSURANCE: - - Prints frequency table for `edu3` and a cross-tab of `demo_edu` × `edu3` to - verify correct mapping and absence of unintended NAs. - - Saves updated dataset to `eohi2.csv`. - - -================================================================================ -SCRIPT 16: datap 16 - ehi vars standardized .r -================================================================================ - -PURPOSE: - Standardizes key EHI summary variables (z-scores) and creates a composite - standardized EHI mean (`stdEHI_mean`) for use in correlational and regression - analyses. - -VARIABLES CREATED: 5 total - -SOURCE COLUMNS: - - ehiDGEN_5_mean, ehiDGEN_10_mean - - ehi5_global_mean, ehi10_global_mean - -TARGET VARIABLES: - - stdDGEN_5 = z(ehiDGEN_5_mean) - - stdDGEN_10 = z(ehiDGEN_10_mean) - - stdDS_5 = z(ehi5_global_mean) - - stdDS_10 = z(ehi10_global_mean) - - stdEHI_mean = mean(stdDGEN_5, stdDGEN_10, stdDS_5, stdDS_10), row-wise - -TRANSFORMATION LOGIC: - Standardize each source variable using sample mean and SD (na.rm = TRUE): - stdX = (X - mean(X)) / sd(X) - - Then compute row-wise average across the four standardized variables: - stdEHI_mean = rowMeans(cbind(stdDGEN_5, stdDGEN_10, stdDS_5, stdDS_10), - na.rm = TRUE) - -QUALITY ASSURANCE: - - Prints pre-standardization means/SDs and post-standardization means/SDs to - confirm ~0 mean and ~1 SD for each standardized variable (allowing for NAs). - - Spot-checks random rows by recomputing standardized values and comparing to - stored columns. - - Saves updated dataset to `eohi2.csv`. - - -================================================================================ -SUMMARY OF ALL CREATED VARIABLES -================================================================================ - -Total Variables Created: 296 - -By Script: - - Script 01: 60 variables (past/future recoded items) - - Script 02: 15 variables (present recoded items) - - Script 03: 12 variables (DGEN domain scores) - - Script 04: 4 variables (DGEN time period means) - - Script 05: 3 variables (AOT & CRT scales) - - Script 06: 90 variables (time interval differences) - - Script 07: 18 variables (domain means for differences) - - Script 08: 6 variables (DGEN 5-vs-10 differences) - - Script 09: 11 variables (interval × direction means) - - Script 10: 6 variables (DGEN combined means) - - Script 11: 45 variables (domain-specific EHI scores) - - Script 12: 6 variables (DGEN EHI scores) - - Script 13: 9 variables (EHI domain means) - - Script 14: 5 variables (EHI global means) - - Script 15: 1 variable (education ordinal factor) - - Script 16: 5 variables (standardized EHI summaries and composite) - -By Category: - - Time Period Items (75 total): - * Present: 15 items - * Past 5: 15 items - * Past 10: 15 items - * Future 5: 15 items - * Future 10: 15 items - - - DGEN Variables (28 total): - * Domain scores: 12 (3 domains × 4 time periods) - * Time period means: 4 (1 per time period) - * 5-vs-10 differences: 6 (3 domains × 2 directions) - * Combined means: 6 (past, future, interval-based, domain-based) - - - Cognitive Scales (3 total): - * AOT total - * CRT correct - * CRT intuitive - - - Time Differences (90 total): - * NPast_5: 15 differences - * NPast_10: 15 differences - * NFut_5: 15 differences - * NFut_10: 15 differences - * 5.10past: 15 differences - * 5.10fut: 15 differences - - - Domain Means for Differences (18 total): - * NPast_5: 3 domain means - * 
NPast_10: 3 domain means - * NFut_5: 3 domain means - * NFut_10: 3 domain means - * 5.10past: 3 domain means - * 5.10fut: 3 domain means - - - Interval × Direction Means (11 total): - * Narrow-scope means: 6 (NPast_5, NPast_10, NFut_5, NFut_10, - X5.10past, X5.10fut) - * Global-scope means: 5 (NPast_global, NFut_global, X5.10_global, - N5_global, N10_global) - - - EHI Variables (65 total): - * Domain-specific EHI: 45 (15 items × 3 time intervals) - * DGEN EHI: 6 (3 domains × 2 time intervals) - * Domain means: 9 (3 domains × 3 time intervals) - * Global means: 5 (2 DGEN + 3 domain-specific) - - Standardized EHI Variables (5 total): - * stdDGEN_5, stdDGEN_10, stdDS_5, stdDS_10, stdEHI_mean - - Demographic Recodes (1 total): - * edu3 (ordered 3-level education factor, Script 15) - - -================================================================================ -DATA PROCESSING NOTES -================================================================================ - -1. PROCESSING ORDER: - Scripts MUST be run in numerical order (01 → 16) as later scripts depend - on variables created by earlier scripts. - - Key Dependencies: - - Script 03 required before Scripts 04, 08, 10, and 12 (DGEN scores) - - Script 04 required before Script 10 (DGEN time period means) - - Script 06 required before Scripts 07, 09, and 11 (time interval differences) - - Script 11 required before Script 13 (domain-specific EHI items) - - Script 12 required before Script 14 (DGEN EHI scores) - - Script 13 required before Script 14 (EHI domain means) - - Script 14 required before Script 16 (uses ehiDGEN_5/10_mean, ehi5/10_global_mean) - - Script 15 can run anytime after raw `demo_edu` is present; run before - analyses needing `edu3` - -2. SURVEY VERSION HANDLING: - - Two survey versions (01 and 02) were used - - Scripts 01 and 03 combine these versions - - Preference given to version 01 when both exist - -3. MISSING DATA: - - Empty cells and NA values are preserved throughout processing - - Calculations use na.rm=TRUE to exclude missing values from means - - Difference calculations result in NA if either source value is missing - -4. QUALITY ASSURANCE: - - Each script includes QA checks with row verification - - Manual calculation checks confirm proper transformations - - Column existence checks prevent errors from missing source data - - Scripts 09-16 include comprehensive QA with first 5 rows displayed - - All EHI scripts (11-14, 16) verify calculations against stored values - - Pass/Fail status reported for all variables in QA-enabled scripts - -5. FILE SAVING: - - Most scripts save directly to eohi2.csv - - Scripts 04, 06, and 07 have commented-out write commands for review - - Scripts 08 and 09, by contrast, save directly to eohi2.csv - - Each script overwrites existing target columns if present - -6. SPECIAL NAMING CONVENTIONS: - - "pref_tv" vs "pref_TV" inconsistency maintained from source data - - DGEN variables use underscores (DGEN_past_5_Pref) - - Difference variables use descriptive prefixes (NPast_5_, 5.10past_) - - "X" prefix added to variables starting with numbers (X5.10past_mean) - - Global means use "_global_" to distinguish from narrow-scope means - - -================================================================================ -ITEM REFERENCE GUIDE -================================================================================ - -15 Core Items (Used across all time periods): - -PREFERENCES (5 items): - 1. pref_read - Reading preferences - 2. pref_music - Music preferences - 3. pref_TV/tv - TV watching preferences (note case variation) - 4. pref_nap - Napping preferences - 5. 
pref_travel - Travel preferences - -PERSONALITY (5 items): - 6. pers_extravert - Extraverted personality - 7. pers_critical - Critical thinking personality - 8. pers_dependable - Dependable personality - 9. pers_anxious - Anxious personality - 10. pers_complex - Complex personality - -VALUES (5 items): - 11. val_obey - Value of obedience - 12. val_trad - Value of tradition - 13. val_opinion - Value of expressing opinions - 14. val_performance - Value of performance - 15. val_justice - Value of justice - - -================================================================================ -EHI CONCEPT AND INTERPRETATION -================================================================================ - -ENDURING HEDONIC IMPACT (EHI): - EHI measures the asymmetry between perceived past and future change in - psychological attributes. The concept is based on the premise that people - may perceive their past and future selves differently, even when considering - equivalent time distances. - -KEY EHI VARIABLES: - - Domain-Specific EHI (Scripts 11, 13, 14): - Calculated from item-level differences between past and future responses - Formula: NPast - NFut - * Positive values: Greater perceived change from past to present - * Negative values: Greater perceived change from present to future - * Zero: Symmetric perception of past and future change - - - Domain-General EHI (Scripts 12, 14): - Calculated from DGEN single-item responses - Formula: DGEN_past - DGEN_fut - * Measures broader temporal self-perception without item-level detail - -HIERARCHICAL STRUCTURE: - Level 1: Item-level EHI (45 domain-specific, 6 DGEN) - Level 2: Domain means (9 domain-specific, combining 5 items each) - Level 3: Global means (5 highest-level summaries) - -INTERPRETATION: - - EHI > 0: "Past asymmetry" - Person perceives greater change from past - - EHI < 0: "Future asymmetry" - Person perceives greater change to future - - EHI ≈ 0: "Temporal symmetry" - Balanced perception of past/future change - - -================================================================================ -END OF DOCUMENTATION -================================================================================ -Last Updated: October 29, 2025 -Processing Pipeline: Scripts 01-16 - diff --git a/.history/eohi2/RMD - mixed anova DGEN_20251003190744.rmd b/.history/eohi2/RMD - mixed anova DGEN_20251003190744.rmd deleted file mode 100644 index e69de29..0000000 diff --git a/.history/eohi2/RMD - mixed anova DGEN_20251006125956.rmd b/.history/eohi2/RMD - mixed anova DGEN_20251006125956.rmd deleted file mode 100644 index e69de29..0000000 diff --git a/.history/eohi2/correlation matrix 2 - std ehi_20251029124228.r b/.history/eohi2/correlation matrix 2 - std ehi_20251029124228.r deleted file mode 100644 index e69de29..0000000 diff --git a/.history/eohi2/correlation matrix 2 - std ehi_20251029124229.r b/.history/eohi2/correlation matrix 2 - std ehi_20251029124229.r deleted file mode 100644 index 755cd29..0000000 --- a/.history/eohi2/correlation matrix 2 - std ehi_20251029124229.r +++ /dev/null @@ -1,100 +0,0 @@ -options(scipen = 999) - -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -df <- read.csv("eohi2.csv") - -data <- df %>% - select(ehiDGEN_5_mean, ehiDGEN_10_mean, ehi5_global_mean, ehi10_global_mean, demo_sex, demo_age_1, edu3, aot_total, crt_correct, crt_int) %>% - filter(demo_sex != "Prefer not to say") - -print(colSums(is.na(data))) -print(sapply(data, class)) - -# Create dummy variable for sex (0 = Male, 1 = Female) -data$sex_dummy <- 
ifelse(data$demo_sex == "Female", 1, 0) - -# Verify the dummy coding -print(table(data$demo_sex, data$sex_dummy)) - -#descriptives - -# Descriptives for age -print(summary(data$demo_age_1)) -print(sd(data$demo_age_1, na.rm = TRUE)) - - -# Descriptives for sex (frequency table) -print(table(data$demo_sex)) -print(prop.table(table(data$demo_sex))) - -# Descriptives for sex dummy variable -print(table(data$sex_dummy)) - -# Convert edu3 to numeric factor for correlations (1, 2, 3) -# First ensure edu3 is a factor, then convert to numeric -data$edu3 <- factor(data$edu3, levels = c("HS_TS", "C_Ug", "grad_prof"), ordered = TRUE) -data$edu_num <- as.numeric(data$edu3) - -# Check the numeric conversion -print(table(data$edu_num, useNA = "ifany")) - -# Verify the conversion -print(table(data$edu3, data$edu_num, useNA = "ifany")) - -####correlation matrix #### - -# Select numeric variables for correlation matrix -numeric_vars <- data %>% - select(ehiDGEN_5_mean, ehiDGEN_10_mean, ehi5_global_mean, ehi10_global_mean, sex_dummy, demo_age_1, edu_num, aot_total, crt_correct, crt_int) - -# Create Spearman correlation matrix -cor_matrix <- cor(numeric_vars, use = "complete.obs", method = "spearman") - -# Print correlation matrix -print(round(cor_matrix, 3)) - -# Get significance tests for correlations using psych package -library(psych) - -# Create correlation matrix with significance tests -cor_test <- corr.test(numeric_vars, method = "spearman", adjust = "none") - -# Print correlation matrix -print(round(cor_test$r, 3)) - -# Print p-values -print(round(cor_test$p, 3)) - -# Print all correlations with r and p values (for reporting) -for(i in 1:nrow(cor_test$r)) { - for(j in 1:ncol(cor_test$r)) { - if(i != j) { # Skip diagonal - cat(colnames(numeric_vars)[i], "vs", colnames(numeric_vars)[j], - ": r =", round(cor_test$r[i, j], 3), - ", p =", round(cor_test$p[i, j], 3), "\n") - } - } -} - -# Also print significant correlations summary -sig_cors <- which(cor_test$p < 0.05 & cor_test$p != 0, arr.ind = TRUE) -if(nrow(sig_cors) > 0) { - for(i in 1:nrow(sig_cors)) { - row_idx <- sig_cors[i, 1] - col_idx <- sig_cors[i, 2] - if(row_idx != col_idx) { # Skip diagonal - cat(colnames(numeric_vars)[row_idx], "vs", colnames(numeric_vars)[col_idx], - ": r =", round(cor_test$r[row_idx, col_idx], 3), - ", p =", round(cor_test$p[row_idx, col_idx], 3), "\n") - } - } -} - -# Save correlation matrix and p-values to CSV files -write.csv(cor_test$r, "correlation_matrix.csv", row.names = TRUE) -write.csv(cor_test$p, "correlation_pvalues.csv", row.names = TRUE) -print("Correlation matrix saved to correlation_matrix.csv") -print("P-values saved to correlation_pvalues.csv") \ No newline at end of file diff --git a/.history/eohi2/correlation matrix 2 - std ehi_20251029124329.r b/.history/eohi2/correlation matrix 2 - std ehi_20251029124329.r deleted file mode 100644 index 4e5e2c6..0000000 --- a/.history/eohi2/correlation matrix 2 - std ehi_20251029124329.r +++ /dev/null @@ -1,100 +0,0 @@ -options(scipen = 999) - -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -df <- read.csv("eohi2.csv") - -data <- df %>% - select(stdEHI_mean, demo_sex, demo_age_1, edu3, aot_total, crt_correct, crt_int) %>% - filter(demo_sex != "Prefer not to say") - -print(colSums(is.na(data))) -print(sapply(data, class)) - -# Create dummy variable for sex (0 = Male, 1 = Female) -data$sex_dummy <- ifelse(data$demo_sex == "Female", 1, 0) - -# Verify the dummy coding -print(table(data$demo_sex, data$sex_dummy)) - -#descriptives - -# Descriptives for 
age -print(summary(data$demo_age_1)) -print(sd(data$demo_age_1, na.rm = TRUE)) - - -# Descriptives for sex (frequency table) -print(table(data$demo_sex)) -print(prop.table(table(data$demo_sex))) - -# Descriptives for sex dummy variable -print(table(data$sex_dummy)) - -# Convert edu3 to numeric factor for correlations (1, 2, 3) -# First ensure edu3 is a factor, then convert to numeric -data$edu3 <- factor(data$edu3, levels = c("HS_TS", "C_Ug", "grad_prof"), ordered = TRUE) -data$edu_num <- as.numeric(data$edu3) - -# Check the numeric conversion -print(table(data$edu_num, useNA = "ifany")) - -# Verify the conversion -print(table(data$edu3, data$edu_num, useNA = "ifany")) - -####correlation matrix #### - -# Select numeric variables for correlation matrix -numeric_vars <- data %>% - select(stdEHI_mean, sex_dummy, demo_age_1, edu_num, aot_total, crt_correct, crt_int) - -# Create Spearman correlation matrix -cor_matrix <- cor(numeric_vars, use = "complete.obs", method = "spearman") - -# Print correlation matrix -print(round(cor_matrix, 3)) - -# Get significance tests for correlations using psych package -library(psych) - -# Create correlation matrix with significance tests -cor_test <- corr.test(numeric_vars, method = "spearman", adjust = "none") - -# Print correlation matrix -print(round(cor_test$r, 3)) - -# Print p-values -print(round(cor_test$p, 3)) - -# Print all correlations with r and p values (for reporting) -for(i in 1:nrow(cor_test$r)) { - for(j in 1:ncol(cor_test$r)) { - if(i != j) { # Skip diagonal - cat(colnames(numeric_vars)[i], "vs", colnames(numeric_vars)[j], - ": r =", round(cor_test$r[i, j], 3), - ", p =", round(cor_test$p[i, j], 3), "\n") - } - } -} - -# Also print significant correlations summary -sig_cors <- which(cor_test$p < 0.05 & cor_test$p != 0, arr.ind = TRUE) -if(nrow(sig_cors) > 0) { - for(i in 1:nrow(sig_cors)) { - row_idx <- sig_cors[i, 1] - col_idx <- sig_cors[i, 2] - if(row_idx != col_idx) { # Skip diagonal - cat(colnames(numeric_vars)[row_idx], "vs", colnames(numeric_vars)[col_idx], - ": r =", round(cor_test$r[row_idx, col_idx], 3), - ", p =", round(cor_test$p[row_idx, col_idx], 3), "\n") - } - } -} - -# Save correlation matrix and p-values to CSV files -write.csv(cor_test$r, "STD_EHI_correlation_matrix.csv", row.names = TRUE) -write.csv(cor_test$p, "STD_EHI_correlation_pvalues.csv", row.names = TRUE) -print("Correlation matrix saved to STD_EHI_correlation_matrix.csv") -print("P-values saved to STD_EHI_correlation_pvalues.csv") \ No newline at end of file diff --git a/.history/eohi2/correlations - domain general vars_20251008122234.r b/.history/eohi2/correlations - domain general vars_20251008122234.r deleted file mode 100644 index 37bfde1..0000000 --- a/.history/eohi2/correlations - domain general vars_20251008122234.r +++ /dev/null @@ -1,184 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp2_data <- read.csv("eohi2.csv") - -# Define the two sets of variables -set1_vars <- c("DGEN_past_5_mean", "DGEN_past_10_mean", "DGEN_fut_5_mean", "DGEN_fut_10_mean", - "DGEN_past_5.10_mean", "DGEN_fut_5.10_mean", "DGENpast_global_mean", - "DGENfut_global_mean", "DGEN_5_global_mean", "DGEN_10_global_mean", "DGEN_5.10_global_mean") -set2_vars <- c("aot_total", "crt_correct", "crt_int") - -# Create subset with only the variables of interest -correlation_data <- exp2_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk 
tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } -} - -# Kolmogorov-Smirnov test for normality -for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots_domain_general_vars.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots_domain_general_vars.pdf", width = 15, height = 10) -par(mfrow = c(4, 3)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 21, cex = 0.6, bg = "lightblue", col = "black") - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.5f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots_domain_general_vars.pdf", width = 15, height = 10) -par(mfrow = c(4, 3)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 21, cex = 0.6, bg = "lightblue", col = "black") - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# Calculate Spearman correlation matrix only -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Print correlation matrix with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate 
correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plot for Spearman only -pdf("correlation_plot_domain_general_vars_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: Domain-General Vars vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -# Save results to CSV files -write.csv(round(cor_matrix_spearman, 5), "spearman_correlations_domain_general_vars.csv") -write.csv(round(desc_stats, 5), "descriptive_statistics_domain_general_vars.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract correlations between sets -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlations - domain general vars.csv", row.names = FALSE)
"descriptive_statistics_domain_general_vars.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract correlations between sets -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlations - domain general vars.csv", row.names = FALSE) diff --git a/.history/eohi2/correlations - domain general vars_20251008122540.r b/.history/eohi2/correlations - domain general vars_20251008122540.r deleted file mode 100644 index e56d8aa..0000000 --- a/.history/eohi2/correlations - domain general vars_20251008122540.r +++ /dev/null @@ -1,175 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp2_data <- read.csv("eohi2.csv") - -# Define the two sets of variables -set1_vars <- c("DGEN_past_5_mean", "DGEN_past_10_mean", "DGEN_fut_5_mean", "DGEN_fut_10_mean", - "DGEN_past_5.10_mean", "DGEN_fut_5.10_mean", "DGENpast_global_mean", - "DGENfut_global_mean", "DGEN_5_global_mean", "DGEN_10_global_mean", "DGEN_5.10_global_mean") -set2_vars <- c("aot_total", "crt_correct", "crt_int") - -# Create subset with only the variables of interest -correlation_data <- exp2_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } -} - - -# Visual normality checks -pdf("normality_plots_domain_general_vars.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots_domain_general_vars.pdf", width = 15, height = 10) -par(mfrow = c(4, 3)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 21, cex = 0.6, bg = "lightblue", col = "black") - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- 
seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.5f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots_domain_general_vars.pdf", width = 15, height = 10) -par(mfrow = c(4, 3)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 21, cex = 0.6, bg = "lightblue", col = "black") - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# Calculate Spearman correlation matrix only -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Print correlation matrix with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plot for Spearman only -pdf("correlation_plot_domain_general_vars_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: Domain-General Vars vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -# Save results to CSV files -write.csv(round(cor_matrix_spearman, 5), "spearman_correlations_domain_general_vars.csv") -write.csv(round(desc_stats, 5), "descriptive_statistics_domain_general_vars.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract correlations between sets -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlations - domain general vars.csv", row.names = FALSE)
diff --git a/.history/eohi2/correlations - domain specific vars_20251008115022.r b/.history/eohi2/correlations - domain specific vars_20251008115022.r deleted file mode 100644 index 545e8f3..0000000 --- a/.history/eohi2/correlations - domain specific vars_20251008115022.r +++ /dev/null @@ -1,197 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp2_data <- read.csv("eohi2.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_5_mean", "NPast_10_mean", "NFut_5_mean", "NFut_10_mean", - "X5.10past_mean", "X5.10fut_mean", "NPast_global_mean", - "Nfut_global_mean", "X5.10_global_mean", "N5_global_mean", "N10_global_mean") -set2_vars <- c("aot_total", "crt_correct", "crt_int") - -# Create subset with only the variables of interest -correlation_data <- exp2_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } -} - -# Kolmogorov-Smirnov test for normality -for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots_domain_vars.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots_domain_vars.pdf", width = 15, height = 10) -par(mfrow = c(4, 3)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 21, cex = 0.6, bg = "lightblue", col = "black") - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <-
seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.5f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots_domain_vars.pdf", width = 15, height = 10) -par(mfrow = c(4, 3)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 21, cex = 0.6, bg = "lightblue", col = "black") - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_domain_vars_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: Domain-Specific Vars vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_domain_vars_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: Domain-Specific Vars vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -# Save results to CSV files -write.csv(round(cor_matrix_spearman, 5), "spearman_correlations_domain_vars.csv") -write.csv(round(cor_matrix_pearson, 5), "pearson_correlations_domain_vars.csv") -write.csv(round(desc_stats, 5), "descriptive_statistics_domain_vars.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract significant correlations between sets -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- 
set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlations - domain specific vars.csv", row.names = FALSE) -
cor_results <- rbind(cor_results, data.frame( - Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlations - domain specific vars.csv", row.names = FALSE) - diff --git a/.history/eohi2/correlations - domain specific vars_20251008115036.r b/.history/eohi2/correlations - domain specific vars_20251008115036.r deleted file mode 100644 index 545e8f3..0000000 --- a/.history/eohi2/correlations - domain specific vars_20251008115036.r +++ /dev/null @@ -1,197 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp2_data <- read.csv("eohi2.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_5_mean", "NPast_10_mean", "NFut_5_mean", "NFut_10_mean", - "X5.10past_mean", "X5.10fut_mean", "NPast_global_mean", - "Nfut_global_mean", "X5.10_global_mean", "N5_global_mean", "N10_global_mean") -set2_vars <- c("aot_total", "crt_correct", "crt_int") - -# Create subset with only the variables of interest -correlation_data <- exp2_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } -} - -# Kolmogorov-Smirnov test for normality -for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots_domain_vars.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots_domain_vars.pdf", width = 15, height = 10) -par(mfrow = c(4, 3)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 21, cex = 0.6, bg = "lightblue", col = "black") - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # 
Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.5f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots_domain_vars.pdf", width = 15, height = 10) -par(mfrow = c(4, 3)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 21, cex = 0.6, bg = "lightblue", col = "black") - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_domain_vars_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: Domain-Specific Vars vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_domain_vars_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: Domain-Specific Vars vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -# Save results to CSV files -write.csv(round(cor_matrix_spearman, 5), "spearman_correlations_domain_vars.csv") -write.csv(round(cor_matrix_pearson, 5), "pearson_correlations_domain_vars.csv") -write.csv(round(desc_stats, 5), "descriptive_statistics_domain_vars.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract significant correlations between sets -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, 
- P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlations - domain specific vars.csv", row.names = FALSE) - diff --git a/.history/eohi2/correlations - domain specific vars_20251008115149.r b/.history/eohi2/correlations - domain specific vars_20251008115149.r deleted file mode 100644 index cf25054..0000000 --- a/.history/eohi2/correlations - domain specific vars_20251008115149.r +++ /dev/null @@ -1,197 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp2_data <- read.csv("eohi2.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_5_mean", "NPast_10_mean", "NFut_5_mean", "NFut_10_mean", - "X5.10past_mean", "X5.10fut_mean", "NPast_global_mean", - "NFut_global_mean", "X5.10_global_mean", "N5_global_mean", "N10_global_mean") -set2_vars <- c("aot_total", "crt_correct", "crt_int") - -# Create subset with only the variables of interest -correlation_data <- exp2_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } -} - -# Kolmogorov-Smirnov test for normality -for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots_domain_vars.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots_domain_vars.pdf", width = 15, height = 10) -par(mfrow = c(4, 3)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 21, cex = 0.6, bg = "lightblue", col = "black") - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.5f\n", 
var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots_domain_vars.pdf", width = 15, height = 10) -par(mfrow = c(4, 3)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 21, cex = 0.6, bg = "lightblue", col = "black") - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_domain_vars_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: Domain-Specific Vars vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_domain_vars_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: Domain-Specific Vars vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -# Save results to CSV files -write.csv(round(cor_matrix_spearman, 5), "spearman_correlations_domain_vars.csv") -write.csv(round(cor_matrix_pearson, 5), "pearson_correlations_domain_vars.csv") -write.csv(round(desc_stats, 5), "descriptive_statistics_domain_vars.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract significant correlations between sets -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlations - domain specific 
vars.csv", row.names = FALSE) - diff --git a/.history/eohi2/correlations - domain specific vars_20251008115152.r b/.history/eohi2/correlations - domain specific vars_20251008115152.r deleted file mode 100644 index cf25054..0000000 --- a/.history/eohi2/correlations - domain specific vars_20251008115152.r +++ /dev/null @@ -1,197 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp2_data <- read.csv("eohi2.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_5_mean", "NPast_10_mean", "NFut_5_mean", "NFut_10_mean", - "X5.10past_mean", "X5.10fut_mean", "NPast_global_mean", - "NFut_global_mean", "X5.10_global_mean", "N5_global_mean", "N10_global_mean") -set2_vars <- c("aot_total", "crt_correct", "crt_int") - -# Create subset with only the variables of interest -correlation_data <- exp2_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } -} - -# Kolmogorov-Smirnov test for normality -for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots_domain_vars.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots_domain_vars.pdf", width = 15, height = 10) -par(mfrow = c(4, 3)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 21, cex = 0.6, bg = "lightblue", col = "black") - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.5f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity 
-pdf("residual_plots_domain_vars.pdf", width = 15, height = 10) -par(mfrow = c(4, 3)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 21, cex = 0.6, bg = "lightblue", col = "black") - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_domain_vars_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: Domain-Specific Vars vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_domain_vars_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: Domain-Specific Vars vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -# Save results to CSV files -write.csv(round(cor_matrix_spearman, 5), "spearman_correlations_domain_vars.csv") -write.csv(round(cor_matrix_pearson, 5), "pearson_correlations_domain_vars.csv") -write.csv(round(desc_stats, 5), "descriptive_statistics_domain_vars.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract significant correlations between sets -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlations - domain specific vars.csv", row.names = FALSE) - diff --git a/.history/eohi2/correlations - 
domain specific vars_20251008115154.r b/.history/eohi2/correlations - domain specific vars_20251008115154.r deleted file mode 100644 index cf25054..0000000 --- a/.history/eohi2/correlations - domain specific vars_20251008115154.r +++ /dev/null @@ -1,197 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp2_data <- read.csv("eohi2.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_5_mean", "NPast_10_mean", "NFut_5_mean", "NFut_10_mean", - "X5.10past_mean", "X5.10fut_mean", "NPast_global_mean", - "NFut_global_mean", "X5.10_global_mean", "N5_global_mean", "N10_global_mean") -set2_vars <- c("aot_total", "crt_correct", "crt_int") - -# Create subset with only the variables of interest -correlation_data <- exp2_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } -} - -# Kolmogorov-Smirnov test for normality -for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots_domain_vars.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots_domain_vars.pdf", width = 15, height = 10) -par(mfrow = c(4, 3)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 21, cex = 0.6, bg = "lightblue", col = "black") - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.5f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots_domain_vars.pdf", width = 15, height = 10) -par(mfrow = c(4, 3)) -for(i in 
1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 21, cex = 0.6, bg = "lightblue", col = "black") - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_domain_vars_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: Domain-Specific Vars vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_domain_vars_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: Domain-Specific Vars vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -# Save results to CSV files -write.csv(round(cor_matrix_spearman, 5), "spearman_correlations_domain_vars.csv") -write.csv(round(cor_matrix_pearson, 5), "pearson_correlations_domain_vars.csv") -write.csv(round(desc_stats, 5), "descriptive_statistics_domain_vars.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract significant correlations between sets -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlations - domain specific vars.csv", row.names = FALSE) - diff --git a/.history/eohi2/correlations - domain specific vars_20251008121216.r b/.history/eohi2/correlations - domain specific 
vars_20251008121216.r deleted file mode 100644 index cf25054..0000000 --- a/.history/eohi2/correlations - domain specific vars_20251008121216.r +++ /dev/null @@ -1,197 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp2_data <- read.csv("eohi2.csv") - -# Define the two sets of variables -set1_vars <- c("NPast_5_mean", "NPast_10_mean", "NFut_5_mean", "NFut_10_mean", - "X5.10past_mean", "X5.10fut_mean", "NPast_global_mean", - "NFut_global_mean", "X5.10_global_mean", "N5_global_mean", "N10_global_mean") -set2_vars <- c("aot_total", "crt_correct", "crt_int") - -# Create subset with only the variables of interest -correlation_data <- exp2_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality (n < 5000) -for(var in names(correlation_data)) { - if(length(na.omit(correlation_data[[var]])) <= 5000) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) - } -} - -# Kolmogorov-Smirnov test for normality -for(var in names(correlation_data)) { - ks_result <- ks.test(correlation_data[[var]], "pnorm", - mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)) - cat(sprintf("%s: KS p = %.5f %s\n", - var, ks_result$p.value, - ifelse(ks_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots_domain_vars.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots_domain_vars.pdf", width = 15, height = 10) -par(mfrow = c(4, 3)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 21, cex = 0.6, bg = "lightblue", col = "black") - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.5f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots_domain_vars.pdf", width = 15, height = 10) -par(mfrow = c(4, 3)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] 
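# ---- [editor's sketch; not part of the deleted snapshot] --------------------
# The hand-rolled residuals-vs-fitted panel that follows duplicates what base
# R's plot.lm() already draws; which = 1 produces the same plot, zero line and
# smoothed trend included, in one call per pair:
d <- data.frame(x = rnorm(40), y = rnorm(40))
lm_fit <- lm(y ~ x, data = d)
plot(lm_fit, which = 1)   # residuals vs fitted, with a lowess-style smoother
# -----------------------------------------------------------------------------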
- - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 21, cex = 0.6, bg = "lightblue", col = "black") - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# Calculate correlation matrices (both Pearson and Spearman) -cor_matrix_pearson <- cor(correlation_data, method = "pearson") -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Use Spearman as primary method -cor_matrix <- cor_matrix_spearman - -# Print correlation matrices with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plots for both methods -pdf("correlation_plot_domain_vars_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: Domain-Specific Vars vs Cognitive Measures") -dev.off() - -pdf("correlation_plot_domain_vars_pearson.pdf", width = 10, height = 8) -corrplot(cor_matrix_pearson, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Pearson Correlation Matrix: Domain-Specific Vars vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -# Save results to CSV files -write.csv(round(cor_matrix_spearman, 5), "spearman_correlations_domain_vars.csv") -write.csv(round(cor_matrix_pearson, 5), "pearson_correlations_domain_vars.csv") -write.csv(round(desc_stats, 5), "descriptive_statistics_domain_vars.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract significant correlations between sets -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlations - domain specific vars.csv", row.names = FALSE) - diff --git a/.history/eohi2/correlations CORRECT - ehi + DGEN x scales_20251008171931.r b/.history/eohi2/correlations CORRECT - ehi + DGEN x scales_20251008171931.r deleted file mode 100644 index 4e31353..0000000 --- 
a/.history/eohi2/correlations CORRECT - ehi + DGEN x scales_20251008171931.r +++ /dev/null @@ -1,176 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("eohi2.csv") - -# Define the two sets of variables -set1_vars <- c("ehiDGEN_5_Pref", "ehiDGEN_5_Pers", "ehiDGEN_5_Val", - "ehiDGEN_10_Pref", "ehiDGEN_10_Pers", "ehiDGEN_10_Val", - "ehi5_pref_MEAN", "ehi5_pers_MEAN", "ehi5_val_MEAN", - "ehi10_pref_MEAN", "ehi10_pers_MEAN", "ehi10_val_MEAN", - "ehi5.10_pref_MEAN", "ehi5.10_pers_MEAN", "ehi5.10_val_MEAN", - "ehiDGEN_5_mean", "ehiDGEN_10_mean", - "ehi5_global_mean", "ehi10_global_mean", "ehi5.10_global_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality -for(var in names(correlation_data)) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 16, cex = 0.6) - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6) - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = 
"blue", lwd = 2) - } -} -dev.off() - - -# Calculate correlation matrix (Spearman only) -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Print correlation matrix with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plot (Spearman only) -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -# Save results to CSV files -write.csv(round(cor_matrix_spearman, 5), "spearman_correlations.csv") -write.csv(round(desc_stats, 5), "descriptive_statistics.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract significant correlations between sets -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlation_exp1.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi2/correlations CORRECT - ehi + DGEN x scales_20251008171942.r b/.history/eohi2/correlations CORRECT - ehi + DGEN x scales_20251008171942.r deleted file mode 100644 index 4e31353..0000000 --- a/.history/eohi2/correlations CORRECT - ehi + DGEN x scales_20251008171942.r +++ /dev/null @@ -1,176 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("eohi2.csv") - -# Define the two sets of variables -set1_vars <- c("ehiDGEN_5_Pref", "ehiDGEN_5_Pers", "ehiDGEN_5_Val", - "ehiDGEN_10_Pref", "ehiDGEN_10_Pers", "ehiDGEN_10_Val", - "ehi5_pref_MEAN", "ehi5_pers_MEAN", "ehi5_val_MEAN", - "ehi10_pref_MEAN", "ehi10_pers_MEAN", "ehi10_val_MEAN", - "ehi5.10_pref_MEAN", "ehi5.10_pers_MEAN", "ehi5.10_val_MEAN", - "ehiDGEN_5_mean", "ehiDGEN_10_mean", - "ehi5_global_mean", "ehi10_global_mean", "ehi5.10_global_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality -for(var in names(correlation_data)) { - 
shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 16, cex = 0.6) - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6) - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# Calculate correlation matrix (Spearman only) -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Print correlation matrix with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plot (Spearman only) -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 
8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -# Save results to CSV files -write.csv(round(cor_matrix_spearman, 5), "spearman_correlations.csv") -write.csv(round(desc_stats, 5), "descriptive_statistics.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract significant correlations between sets -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlation_exp1.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi2/correlations CORRECT - ehi + DGEN x scales_20251008171945.r b/.history/eohi2/correlations CORRECT - ehi + DGEN x scales_20251008171945.r deleted file mode 100644 index 4e31353..0000000 --- a/.history/eohi2/correlations CORRECT - ehi + DGEN x scales_20251008171945.r +++ /dev/null @@ -1,176 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("eohi2.csv") - -# Define the two sets of variables -set1_vars <- c("ehiDGEN_5_Pref", "ehiDGEN_5_Pers", "ehiDGEN_5_Val", - "ehiDGEN_10_Pref", "ehiDGEN_10_Pers", "ehiDGEN_10_Val", - "ehi5_pref_MEAN", "ehi5_pers_MEAN", "ehi5_val_MEAN", - "ehi10_pref_MEAN", "ehi10_pers_MEAN", "ehi10_val_MEAN", - "ehi5.10_pref_MEAN", "ehi5.10_pers_MEAN", "ehi5.10_val_MEAN", - "ehiDGEN_5_mean", "ehiDGEN_10_mean", - "ehi5_global_mean", "ehi10_global_mean", "ehi5.10_global_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality -for(var in names(correlation_data)) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- 
set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 16, cex = 0.6) - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6) - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# Calculate correlation matrix (Spearman only) -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Print correlation matrix with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plot (Spearman only) -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -# Save results to CSV files -write.csv(round(cor_matrix_spearman, 5), "spearman_correlations.csv") -write.csv(round(desc_stats, 5), "descriptive_statistics.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract significant correlations between sets -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - Variable1 = 
var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlation_exp1.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi2/correlations CORRECT - ehi + DGEN x scales_20251008172011.r b/.history/eohi2/correlations CORRECT - ehi + DGEN x scales_20251008172011.r deleted file mode 100644 index 9a71216..0000000 --- a/.history/eohi2/correlations CORRECT - ehi + DGEN x scales_20251008172011.r +++ /dev/null @@ -1,176 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("eohi2.csv") - -# Define the two sets of variables -set1_vars <- c("ehiDGEN_5_Pref", "ehiDGEN_5_Pers", "ehiDGEN_5_Val", - "ehiDGEN_10_Pref", "ehiDGEN_10_Pers", "ehiDGEN_10_Val", - "ehi5_pref_MEAN", "ehi5_pers_MEAN", "ehi5_val_MEAN", - "ehi10_pref_MEAN", "ehi10_pers_MEAN", "ehi10_val_MEAN", - "ehi5.10_pref_MEAN", "ehi5.10_pers_MEAN", "ehi5.10_val_MEAN", - "ehiDGEN_5_mean", "ehiDGEN_10_mean", - "ehi5_global_mean", "ehi10_global_mean", "ehi5.10_global_mean") -set2_vars <- c("AOT_total", "CRT_correct", "CRT_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality -for(var in names(correlation_data)) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 16, cex = 0.6) - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- 
set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6) - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# Calculate correlation matrix (Spearman only) -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Print correlation matrix with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plot (Spearman only) -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -# Save results to CSV files -write.csv(round(cor_matrix_spearman, 5), "spearman_correlations.csv") -write.csv(round(desc_stats, 5), "descriptive_statistics.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract significant correlations between sets -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlationCORRECT_exp2.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi2/correlations CORRECT - ehi + DGEN x scales_20251008172056.r b/.history/eohi2/correlations CORRECT - ehi + DGEN x scales_20251008172056.r deleted file mode 100644 index 2c5729e..0000000 --- a/.history/eohi2/correlations CORRECT - ehi + DGEN x scales_20251008172056.r +++ /dev/null @@ -1,176 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("eohi2.csv") - -# Define the two sets of variables -set1_vars <- c("ehiDGEN_5_Pref", "ehiDGEN_5_Pers", "ehiDGEN_5_Val", - "ehiDGEN_10_Pref", "ehiDGEN_10_Pers", "ehiDGEN_10_Val", - "ehi5_pref_MEAN", "ehi5_pers_MEAN", "ehi5_val_MEAN", - "ehi10_pref_MEAN", "ehi10_pers_MEAN", "ehi10_val_MEAN", - 
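# ---- [editor's sketch; not part of the deleted snapshot] --------------------
# These snapshots differ mainly in column-name case (AOT_total vs aot_total,
# and earlier Nfut_global_mean vs NFut_global_mean). A guard before the
# exp1_data[, c(set1_vars, set2_vars)] subset fails fast and names the
# offending columns instead of R's generic "undefined columns selected":
check_vars <- function(df, vars) {
  missing <- setdiff(vars, names(df))
  if (length(missing) > 0)
    stop("Columns not found: ", paste(missing, collapse = ", "), call. = FALSE)
  invisible(TRUE)
}
check_vars(data.frame(aot_total = 1, crt_correct = 1), c("aot_total", "crt_correct"))
# -----------------------------------------------------------------------------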
"ehi5.10_pref_MEAN", "ehi5.10_pers_MEAN", "ehi5.10_val_MEAN", - "ehiDGEN_5_mean", "ehiDGEN_10_mean", - "ehi5_global_mean", "ehi10_global_mean", "ehi5.10_global_mean") -set2_vars <- c("aot_total", "crt_correct", "crt_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality -for(var in names(correlation_data)) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], na.rm = TRUE)), - add = TRUE, col = "red", lwd = 2) - - # Q-Q plot - qqnorm(correlation_data[[var]], main = paste("Q-Q Plot:", var)) - qqline(correlation_data[[var]], col = "red", lwd = 2) -} -dev.off() - -# ===== LINEARITY CHECKS ===== -# Check linearity between variable pairs -pdf("linearity_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - # Scatter plot with regression line - plot(correlation_data[[var1]], correlation_data[[var2]], - main = paste(var1, "vs", var2), - xlab = var1, ylab = var2, pch = 16, cex = 0.6) - - # Add linear regression line - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - abline(lm_fit, col = "red", lwd = 2) - - # Add LOESS smooth line for non-linear pattern detection - loess_fit <- loess(correlation_data[[var2]] ~ correlation_data[[var1]]) - x_seq <- seq(min(correlation_data[[var1]], na.rm = TRUE), - max(correlation_data[[var1]], na.rm = TRUE), length = 100) - loess_pred <- predict(loess_fit, x_seq) - lines(x_seq, loess_pred, col = "blue", lwd = 2, lty = 2) - - # Calculate R-squared for linear fit - r_squared <- summary(lm_fit)$r.squared - cat(sprintf("%s vs %s: R² = %.4f\n", var1, var2, r_squared)) - } -} -dev.off() - -# Residual analysis for linearity -pdf("residual_plots.pdf", width = 15, height = 10) -par(mfrow = c(3, 5)) -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - - lm_fit <- lm(correlation_data[[var2]] ~ correlation_data[[var1]]) - residuals <- residuals(lm_fit) - fitted <- fitted(lm_fit) - - plot(fitted, residuals, - main = paste("Residuals:", var1, "vs", var2), - xlab = "Fitted Values", ylab = "Residuals", pch = 16, cex = 0.6) - abline(h = 0, col = "red", lwd = 2) - - # Add smooth line to residuals - lines(lowess(fitted, residuals), col = "blue", lwd = 2) - } -} -dev.off() - - -# Calculate correlation matrix (Spearman only) -cor_matrix_spearman <- cor(correlation_data, method = "spearman") - -# Print correlation matrix with 5 decimal places -print(round(cor_matrix_spearman, 5)) - -# Separate correlations between the two sets (Spearman) -set1_set2_cor <- cor_matrix_spearman[set1_vars, set2_vars] -print(round(set1_set2_cor, 5)) - -# Calculate correlations within each set (Spearman) -set1_within_cor <- cor_matrix_spearman[set1_vars, set1_vars] -set2_within_cor <- cor_matrix_spearman[set2_vars, set2_vars] - -# Statistical 
significance tests (Spearman) -cor_test_results_spearman <- rcorr(as.matrix(correlation_data), type = "spearman") - -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - p_val <- cor_test_results_spearman$P[var1, var2] - cat(sprintf("%s vs %s: p = %.5f\n", var1, var2, p_val)) - } -} - -# Create correlation plot (Spearman only) -pdf("correlation_plot_scales_spearman.pdf", width = 10, height = 8) -corrplot(cor_matrix_spearman, method = "color", type = "upper", - order = "hclust", tl.cex = 0.8, tl.col = "black", - addCoef.col = "black", number.cex = 0.7, - title = "Spearman Correlation Matrix: EOHI/DGEN vs Cognitive Measures") -dev.off() - -# Summary statistics -desc_stats <- describe(correlation_data) -print(round(desc_stats, 5)) - -# Save results to CSV files -write.csv(round(cor_matrix_spearman, 5), "spearman_correlations.csv") -write.csv(round(desc_stats, 5), "descriptive_statistics.csv") - -# Save correlation results in a formatted table -cor_results <- data.frame( - Variable1 = character(), - Variable2 = character(), - Spearman_r = numeric(), - P_value = numeric(), - stringsAsFactors = FALSE -) - -# Extract significant correlations between sets -for(i in 1:length(set1_vars)) { - for(j in 1:length(set2_vars)) { - var1 <- set1_vars[i] - var2 <- set2_vars[j] - r_val <- cor_matrix_spearman[var1, var2] - p_val <- cor_test_results_spearman$P[var1, var2] - - cor_results <- rbind(cor_results, data.frame( - Variable1 = var1, - Variable2 = var2, - Spearman_r = r_val, - P_value = p_val, - stringsAsFactors = FALSE - )) - } -} - -write.csv(cor_results, "correlationCORRECT_exp2.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi2/correlations CORRECT - ehi + DGEN x scales_20251008185510.r b/.history/eohi2/correlations CORRECT - ehi + DGEN x scales_20251008185510.r deleted file mode 100644 index 2c5729e..0000000 --- a/.history/eohi2/correlations CORRECT - ehi + DGEN x scales_20251008185510.r +++ /dev/null @@ -1,176 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load required libraries -library(corrplot) -library(Hmisc) -library(psych) - -# Load the data -exp1_data <- read.csv("eohi2.csv") - -# Define the two sets of variables -set1_vars <- c("ehiDGEN_5_Pref", "ehiDGEN_5_Pers", "ehiDGEN_5_Val", - "ehiDGEN_10_Pref", "ehiDGEN_10_Pers", "ehiDGEN_10_Val", - "ehi5_pref_MEAN", "ehi5_pers_MEAN", "ehi5_val_MEAN", - "ehi10_pref_MEAN", "ehi10_pers_MEAN", "ehi10_val_MEAN", - "ehi5.10_pref_MEAN", "ehi5.10_pers_MEAN", "ehi5.10_val_MEAN", - "ehiDGEN_5_mean", "ehiDGEN_10_mean", - "ehi5_global_mean", "ehi10_global_mean", "ehi5.10_global_mean") -set2_vars <- c("aot_total", "crt_correct", "crt_int") - -# Create subset with only the variables of interest -correlation_data <- exp1_data[, c(set1_vars, set2_vars)] - -# ===== NORMALITY CHECKS ===== -# Shapiro-Wilk tests for normality -for(var in names(correlation_data)) { - shapiro_result <- shapiro.test(correlation_data[[var]]) - cat(sprintf("%s: Shapiro-Wilk p = %.5f %s\n", - var, shapiro_result$p.value, - ifelse(shapiro_result$p.value < 0.05, "(NOT normal)", "(normal)"))) -} - -# Visual normality checks -pdf("normality_plots.pdf", width = 12, height = 8) -par(mfrow = c(2, 4)) -for(var in names(correlation_data)) { - # Histogram with normal curve overlay - hist(correlation_data[[var]], main = paste("Histogram:", var), - xlab = var, freq = FALSE) - curve(dnorm(x, mean = mean(correlation_data[[var]], na.rm = TRUE), - sd = sd(correlation_data[[var]], 
diff --git a/.history/eohi2/dataP - DGEN means_20251001122522.r b/.history/eohi2/dataP - DGEN means_20251001122522.r
deleted file mode 100644
index 3d454c8..0000000
--- a/.history/eohi2/dataP - DGEN means_20251001122522.r
+++ /dev/null
@@ -1,183 +0,0 @@
-# Script to calculate DGEN means by time period in eohi2.csv
-# Averages the 3 domain scores (Pref, Pers, Val) for each time period
-
-# Load necessary library
-library(dplyr)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-# Read the data (with check.names=FALSE to preserve original column names)
-# na.strings=NULL keeps empty cells as empty strings instead of converting to NA
-df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL)
-
-# Define source columns (12 total)
-source_cols <- c(
-  "DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val",
-  "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val",
-  "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val",
-  "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val"
-)
-
-# Define target columns (4 total)
-target_cols <- c(
-  "DGEN_past_5_mean",
-  "DGEN_past_10_mean",
-  "DGEN_fut_5_mean",
-  "DGEN_fut_10_mean"
-)
-
-# Define groupings: each target gets 3 source columns
-source_groups <- list(
-  DGEN_past_5_mean = c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val"),
-  DGEN_past_10_mean = c("DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val"),
-  DGEN_fut_5_mean = c("DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val"),
-  DGEN_fut_10_mean = c("DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val")
-)
-
-# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE =============
-cat("\n=== COLUMN EXISTENCE CHECK ===\n\n")
-
-# Get actual column names from dataframe (trimmed)
-df_cols <- trimws(names(df))
-
-# Check Source columns
-missing_source <- source_cols[!source_cols %in% df_cols]
-existing_source <- source_cols[source_cols %in% df_cols]
-
-cat("Source Columns:\n")
-cat(" Expected: 12 columns\n")
-cat(" Found:", length(existing_source), "columns\n")
-cat(" Missing:", length(missing_source), "columns\n")
-
-if (length(missing_source) > 0) {
-  cat("\n Missing source columns:\n")
-  for (col in missing_source) {
-    cat(" -", col, "\n")
-  }
-}
-
-# Check Target columns
-missing_targets <- target_cols[!target_cols %in% df_cols]
-existing_targets <- target_cols[target_cols %in% df_cols]
-
-cat("\nTarget Columns:\n")
-cat(" Expected: 4 columns\n")
-cat(" Found:", length(existing_targets), "columns\n")
-cat(" Missing:", length(missing_targets), "columns\n")
-
-if (length(missing_targets) > 0) {
-  cat("\n Target columns do NOT exist yet - will create them.\n")
-  if (length(existing_targets) > 0) {
-    cat(" WARNING: Some target columns already exist and will be overwritten.\n")
-  }
-} else {
-  cat(" All target columns exist - will overwrite with calculated values.\n")
-}
-
-cat("\n=== END CHECK ===\n\n")
-
-# Stop if critical columns are missing
-if (length(missing_source) > 6) {
-  stop("ERROR: Too many source columns missing! Please check column names in CSV file.")
-}
-
-cat("Proceeding with processing...\n\n")
-
-# ============= CALCULATE MEANS =============
-cat("Calculating DGEN means by time period...\n")
-
-# Convert source columns to numeric
-for (col in source_cols) {
-  if (col %in% names(df)) {
-    df[[col]] <- as.numeric(df[[col]])
-  }
-}
-
-# Calculate each target as the mean of its 3 source columns
-for (target in target_cols) {
-  source_group <- source_groups[[target]]
-
-  # Get the columns that exist
-  existing_cols <- source_group[source_group %in% names(df)]
-
-  if (length(existing_cols) > 0) {
-    # Calculate row means across the 3 domain columns
-    df[[target]] <- rowMeans(df[, existing_cols, drop = FALSE], na.rm = TRUE)
-    cat(" Processed:", target, "\n")
-  } else {
-    cat(" WARNING: No source columns found for", target, "\n")
-  }
-}
-
-cat("\n=== CALCULATION COMPLETE ===\n\n")
-
-
-# ============= QUALITY ASSURANCE: RANDOM ROW CHECK =============
-# This function can be run multiple times to check different random rows
-
-qa_check_random_row <- function() {
-  # Pick a random row
-  random_row <- sample(1:nrow(df), 1)
-
-  cat("\n========================================\n")
-  cat("QA CHECK: Random Row #", random_row, "\n")
-  cat("========================================\n\n")
-
-  # Check each of the 4 target columns
-  for (target in target_cols) {
-    source_group <- source_groups[[target]]
-
-    cat(sprintf("Target: %s\n", target))
-    cat(" Source columns:\n")
-
-    # Get values from source columns
-    values <- numeric(3)
-    for (i in 1:3) {
-      col <- source_group[i]
-      val <- if (col %in% names(df)) df[random_row, col] else NA
-      values[i] <- val
-      cat(sprintf(" %s: %s\n", col, ifelse(is.na(val), "NA", as.character(val))))
-    }
-
-    # Calculate expected mean
-    valid_values <- values[!is.na(values)]
-    if (length(valid_values) > 0) {
-      expected_mean <- mean(valid_values)
-      actual_value <- df[random_row, target]
-
-      cat(sprintf("\n Calculation:\n"))
-      cat(sprintf(" Sum: %s = %.5f\n", paste(valid_values, collapse = " + "), sum(valid_values)))
-      cat(sprintf(" Average of %d values: %.5f\n", length(valid_values), expected_mean))
-      cat(sprintf(" Target value: %.5f\n", actual_value))
-      cat(sprintf(" Match: %s\n", ifelse(abs(expected_mean - actual_value) < 0.0001, "YES ✓", "NO ✗")))
-    } else {
-      cat(" No valid values to calculate mean.\n")
-    }
-    cat("\n")
-  }
-
-  cat("========================================\n")
-  cat("END QA CHECK\n")
-  cat("========================================\n\n")
-}
-
-# Run QA check on first random row
-cat("\n\n")
-qa_check_random_row()
-
-# Instructions for running additional checks
-cat("\n")
-cat("*** TO CHECK ANOTHER RANDOM ROW ***\n")
-cat("Run this command in R console:\n")
-cat(" qa_check_random_row()\n")
-cat("\n")
-
-
-# Save the modified dataframe back to CSV
-# na="" writes NA values as empty cells instead of "NA" text
-# COMMENTED OUT FOR REVIEW - Uncomment when ready to save
-# write.csv(df, "eohi2.csv", row.names = FALSE, na = "")
-
-cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n")
-cat("Review the output above, then uncomment line 163 to save changes.\n")
-cat("\nProcessing complete! 4 DGEN mean columns calculated (not yet saved to file).\n")
diff --git a/.history/eohi2/dataP - DGEN means_20251001122534.r b/.history/eohi2/dataP - DGEN means_20251001122534.r
deleted file mode 100644
index 3d454c8..0000000
diff --git a/.history/eohi2/dataP - DGEN means_20251001122539.r b/.history/eohi2/dataP - DGEN means_20251001122539.r
deleted file mode 100644
index 3d454c8..0000000
diff --git a/.history/eohi2/dataP - DGEN means_20251001124312.r b/.history/eohi2/dataP - DGEN means_20251001124312.r
deleted file mode 100644
index d72f308..0000000
--- a/.history/eohi2/dataP - DGEN means_20251001124312.r
+++ /dev/null
@@ -1,183 +0,0 @@
-# Script to calculate DGEN means by time period in eohi2.csv
-# Averages the 3 domain scores (Pref, Pers, Val) for each time period
-
-# Load necessary library
-library(dplyr)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-# Read the data (with check.names=FALSE to preserve original column names)
-# na.strings=NULL keeps empty cells as empty strings instead of converting to NA
-df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL)
-
-# Define source columns (12 total)
-source_cols <- c(
-  "DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val",
-  "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val",
-  "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val",
-  "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val"
-)
-
-# Define target columns (4 total)
-target_cols <- c(
-  "DGEN_past_5_mean",
-  "DGEN_past_10_mean",
-  "DGEN_fut_5_mean",
-  "DGEN_fut_10_mean"
-)
-
-# Define groupings: each target gets 3 source columns
-source_groups <- list(
-  DGEN_past_5_mean = c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val"),
-  DGEN_past_10_mean = c("DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val"),
-  DGEN_fut_5_mean = c("DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val"),
-  DGEN_fut_10_mean = c("DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val")
-)
-
-# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE =============
-cat("\n=== COLUMN EXISTENCE CHECK ===\n\n")
-
-# Get actual column names from dataframe (trimmed)
-df_cols <- trimws(names(df))
-
-# Check Source columns
-missing_source <- source_cols[!source_cols %in% df_cols]
-existing_source <- source_cols[source_cols %in% df_cols]
-
-cat("Source Columns:\n")
-cat(" Expected: 12 columns\n")
-cat(" Found:", length(existing_source), "columns\n")
-cat(" Missing:", length(missing_source), "columns\n")
-
-if (length(missing_source) > 0) {
-  cat("\n Missing source columns:\n")
-  for (col in missing_source) {
-    cat(" -", col, "\n")
-  }
-}
-
-# Check Target columns
-missing_targets <- target_cols[!target_cols %in% df_cols]
-existing_targets <- target_cols[target_cols %in% df_cols]
-
-cat("\nTarget Columns:\n")
-cat(" Expected: 4 columns\n")
-cat(" Found:", length(existing_targets), "columns\n")
-cat(" Missing:", length(missing_targets), "columns\n")
-
-if (length(missing_targets) > 0) {
-  cat("\n Target columns do NOT exist yet - will create them.\n")
-  if (length(existing_targets) > 0) {
-    cat(" WARNING: Some target columns already exist and will be overwritten.\n")
-  }
-} else {
-  cat(" All target columns exist - will overwrite with calculated values.\n")
-}
-
-cat("\n=== END CHECK ===\n\n")
-
-# Stop if critical columns are missing
-if (length(missing_source) > 6) {
-  stop("ERROR: Too many source columns missing! Please check column names in CSV file.")
-}
-
-cat("Proceeding with processing...\n\n")
-
-# ============= CALCULATE MEANS =============
-cat("Calculating DGEN means by time period...\n")
-
-# Convert source columns to numeric
-for (col in source_cols) {
-  if (col %in% names(df)) {
-    df[[col]] <- as.numeric(df[[col]])
-  }
-}
-
-# Calculate each target as the mean of its 3 source columns
-for (target in target_cols) {
-  source_group <- source_groups[[target]]
-
-  # Get the columns that exist
-  existing_cols <- source_group[source_group %in% names(df)]
-
-  if (length(existing_cols) > 0) {
-    # Calculate row means across the 3 domain columns
-    df[[target]] <- rowMeans(df[, existing_cols, drop = FALSE], na.rm = TRUE)
-    cat(" Processed:", target, "\n")
-  } else {
-    cat(" WARNING: No source columns found for", target, "\n")
-  }
-}
-
-cat("\n=== CALCULATION COMPLETE ===\n\n")
-
-
-# ============= QUALITY ASSURANCE: RANDOM ROW CHECK =============
-# This function can be run multiple times to check different random rows
-
-qa_check_random_row <- function() {
-  # Pick a random row
-  random_row <- sample(1:nrow(df), 1)
-
-  cat("\n========================================\n")
-  cat("QA CHECK: Random Row #", random_row, "\n")
-  cat("========================================\n\n")
-
-  # Check each of the 4 target columns
-  for (target in target_cols) {
-    source_group <- source_groups[[target]]
-
-    cat(sprintf("Target: %s\n", target))
-    cat(" Source columns:\n")
-
-    # Get values from source columns
-    values <- numeric(3)
-    for (i in 1:3) {
-      col <- source_group[i]
-      val <- if (col %in% names(df)) df[random_row, col] else NA
-      values[i] <- val
-      cat(sprintf(" %s: %s\n", col, ifelse(is.na(val), "NA", as.character(val))))
-    }
-
-    # Calculate expected mean
-    valid_values <- values[!is.na(values)]
-    if (length(valid_values) > 0) {
-      expected_mean <- mean(valid_values)
-      actual_value <- df[random_row, target]
-
-      cat(sprintf("\n Calculation:\n"))
-      cat(sprintf(" Sum: %s = %.5f\n", paste(valid_values, collapse = " + "), sum(valid_values)))
-      cat(sprintf(" Average of %d values: %.5f\n", length(valid_values), expected_mean))
-      cat(sprintf(" Target value: %.5f\n", actual_value))
-      cat(sprintf(" Match: %s\n", ifelse(abs(expected_mean - actual_value) < 0.0001, "YES ✓", "NO ✗")))
-    } else {
-      cat(" No valid values to calculate mean.\n")
-    }
-    cat("\n")
-  }
-
-  cat("========================================\n")
-  cat("END QA CHECK\n")
-  cat("========================================\n\n")
-}
-
-# Run QA check on first random row
-cat("\n\n")
-qa_check_random_row()
-
-# Instructions for running additional checks
-cat("\n")
-cat("*** TO CHECK ANOTHER RANDOM ROW ***\n")
-cat("Run this command in R console:\n")
-cat(" qa_check_random_row()\n")
-cat("\n")
-
-
-# Save the modified dataframe back to CSV
-# na="" writes NA values as empty cells instead of "NA" text
-# COMMENTED OUT FOR REVIEW - Uncomment when ready to save
-write.csv(df, "eohi2.csv", row.names = FALSE, na = "")
-
-cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n")
-cat("Review the output above, then uncomment line 163 to save changes.\n")
-cat("\nProcessing complete! 4 DGEN mean columns calculated (not yet saved to file).\n")
diff --git a/.history/eohi2/dataP - DGEN means_20251001130521.r b/.history/eohi2/dataP - DGEN means_20251001130521.r
deleted file mode 100644
index 3d454c8..0000000
diff --git a/.history/eohi2/dataP - recode DGEN vars_20251001100032.r b/.history/eohi2/dataP - recode DGEN vars_20251001100032.r
deleted file mode 100644
index 82e358a..0000000
--- a/.history/eohi2/dataP - recode DGEN vars_20251001100032.r
+++ /dev/null
@@ -1,255 +0,0 @@
-# Script to combine DGEN variables in eohi2.csv
-# Combines 01 and 02 versions of DGEN items (no recoding, just copying values)
-
-# Load necessary library
-library(dplyr)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-# Read the data (with check.names=FALSE to preserve original column names)
-# na.strings=NULL keeps empty cells as empty strings instead of converting to NA
-df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL)
-
-# Define source column pairs (Set A and Set B)
-source_cols_A <- c(
-  "01past5PrefDGEN_1",
-  "01past5PersDGEN_1",
-  "01past5ValDGEN_1",
-  "01past10PrefDGEN_1",
-  "01past10PersDGEN_1",
-  "01past10ValDGEN_1",
-  "01fut5PrefDGEN_8",
-  "01fut5PersDGEN_8",
-  "01fut5ValuesDGEN_1",
-  "01fut10PrefDGEN_8",
-  "01fut10PersDGEN_8",
-  "01fut10ValuesDGEN_1"
-)
-
-source_cols_B <- c(
-  "02past5PrefDGEN_1",
-  "02past5PersDGEN_1",
-  "02past5ValDGEN_1",
-  "02past10PrefDGEN_1",
-  "02past10PersDGEN_1",
-  "02past10ValDGEN_1",
-  "02fut5PrefDGEN_8",
-  "02fut5PersDGEN_8",
-  "02fut5ValDGEN_1",
-  "02fut10PrefDGEN_8",
-  "02fut10PersDGEN_8",
-  "02fut10ValDGEN_1"
-)
-
-# Define target column names
-target_cols <- c(
-  "DGEN_past_5_Pref",
-  "DGEN_past_5_Pers",
-  "DGEN_past_5_Val",
-  "DGEN_past_10_Pref",
-  "DGEN_past_10_Pers",
-  "DGEN_past_10_Val",
-  "DGEN_fut_5_Pref",
-  "DGEN_fut_5_Pers",
-  "DGEN_fut_5_Val",
-  "DGEN_fut_10_Pref",
-  "DGEN_fut_10_Pers",
-  "DGEN_fut_10_Val"
-)
-
-# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE =============
-cat("\n=== COLUMN EXISTENCE CHECK ===\n\n")
-
-# Get actual column names from dataframe (trimmed)
-df_cols <- trimws(names(df))
-
-# Print first 30 actual column names for debugging
-cat("First 30 actual column names in CSV:\n")
-for (i in 1:min(30, length(df_cols))) {
-  cat(sprintf(" %2d. '%s' (length: %d)\n", i, df_cols[i], nchar(df_cols[i])))
-}
-cat("\n")
-
-# Check Source A columns
-missing_A <- source_cols_A[!source_cols_A %in% df_cols]
-existing_A <- source_cols_A[source_cols_A %in% df_cols]
-
-cat("Source Set A:\n")
-cat(" Expected: 12 columns\n")
-cat(" Found:", length(existing_A), "columns\n")
-cat(" Missing:", length(missing_A), "columns\n")
-
-if (length(missing_A) > 0) {
-  cat("\n Missing columns from Set A:\n")
-  for (col in missing_A) {
-    cat(" -", col, "\n")
-  }
-}
-
-# Check Source B columns
-missing_B <- source_cols_B[!source_cols_B %in% df_cols]
-existing_B <- source_cols_B[source_cols_B %in% df_cols]
-
-cat("\nSource Set B:\n")
-cat(" Expected: 12 columns\n")
-cat(" Found:", length(existing_B), "columns\n")
-cat(" Missing:", length(missing_B), "columns\n")
-
-if (length(missing_B) > 0) {
-  cat("\n Missing columns from Set B:\n")
-  for (col in missing_B) {
-    cat(" -", col, "\n")
-  }
-}
-
-# Check for columns with similar names (potential typos/spaces)
-if (length(missing_A) > 0 || length(missing_B) > 0) {
-  cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n")
-  all_missing <- c(missing_A, missing_B)
-  for (miss_col in all_missing) {
-    # Find columns that start with similar pattern
-    pattern <- substr(miss_col, 1, 10)
-    similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE)
-    if (length(similar) > 0) {
-      cat("\nLooking for:", miss_col)
-      cat("\n Similar columns found:\n")
-      for (sim in similar) {
-        cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "")
-      }
-    }
-  }
-}
-
-cat("\n=== END CHECK ===\n\n")
-
-# Stop if critical columns are missing
-if (length(missing_A) > 6 || length(missing_B) > 6) {
-  stop("ERROR: Too many columns missing! Please check column names in CSV file.")
-}
-
-cat("Proceeding with processing...\n\n")
-
-# Process each pair of columns (just copy values, no recoding)
-for (i in 1:12) {
-  col_A <- source_cols_A[i]
-  col_B <- source_cols_B[i]
-  target_col <- target_cols[i]
-
-  # Get values from columns, handling missing columns
-  vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df))
-  vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df))
-
-  # Coalesce: take value from vals_A if present, otherwise from vals_B
-  # No recoding - just copy the value directly
-  combined <- ifelse(!is.na(vals_A) & vals_A != "",
-                     vals_A,
-                     vals_B)
-
-  # Copy directly to target column (no recoding)
-  df[[target_col]] <- combined
-
-  # Print progress
-  cat("Processed:", target_col, "\n")
-}
-
-# ============= VERIFY TARGET COLUMNS WERE CREATED =============
-cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n")
-
-# Get updated column names
-df_cols_after <- trimws(names(df))
-
-# Check which target columns exist
-existing_targets <- target_cols[target_cols %in% df_cols_after]
-missing_targets <- target_cols[!target_cols %in% df_cols_after]
-
-cat("Target Columns:\n")
-cat(" Expected: 12 columns\n")
-cat(" Created:", length(existing_targets), "columns\n")
-cat(" Missing:", length(missing_targets), "columns\n")
-
-if (length(missing_targets) > 0) {
-  cat("\n WARNING: The following target columns were NOT created:\n")
-  for (col in missing_targets) {
-    cat(" -", col, "\n")
-  }
-  stop("\nERROR: Not all target columns were created successfully!")
-} else {
-  cat("\n SUCCESS: All 12 target columns created successfully!\n")
-}
-
-cat("\n=== END VERIFICATION ===\n\n")
-
-
-# ============= QUALITY ASSURANCE: RANDOM ROW CHECK =============
-# This function can be run multiple times to check different random rows
-
-qa_check_random_row <- function() {
-  # Pick a random row
-  random_row <- sample(1:nrow(df), 1)
-
-  cat("\n========================================\n")
-  cat("QA CHECK: Random Row #", random_row, "\n")
-  cat("========================================\n\n")
-
-  # Check each of the 12 pairs
-  for (i in 1:12) {
-    col_A <- source_cols_A[i]
-    col_B <- source_cols_B[i]
-    target_col <- target_cols[i]
-
-    # Get values
-    val_A <- if (col_A %in% names(df)) df[random_row, col_A] else ""
-    val_B <- if (col_B %in% names(df)) df[random_row, col_B] else ""
-    target_val <- df[random_row, target_col]
-
-    # Determine which source had the value
-    has_val_A <- !is.na(val_A) && val_A != ""
-    has_val_B <- !is.na(val_B) && val_B != ""
-
-    if (has_val_A) {
-      source_used <- "A"
-      original_value <- val_A
-    } else if (has_val_B) {
-      source_used <- "B"
-      original_value <- val_B
-    } else {
-      source_used <- "NONE"
-      original_value <- "(empty)"
-    }
-
-    # Print the info
-    cat(sprintf("Pair %2d:\n", i))
-    cat(sprintf(" Source A: %-30s\n", col_A))
-    cat(sprintf(" Source B: %-30s\n", col_B))
-    cat(sprintf(" Target: %-30s\n", target_col))
-    cat(sprintf(" Value found in: Source %s\n", source_used))
-    cat(sprintf(" Original value: '%s'\n", original_value))
-    cat(sprintf(" Target value: '%s'\n", ifelse(is.na(target_val), "NA", as.character(target_val))))
-    cat("\n")
-  }
-
-  cat("========================================\n")
-  cat("END QA CHECK\n")
-  cat("========================================\n\n")
-}
-
-# Run QA check on first random row
-cat("\n\n")
-qa_check_random_row()
-
-# Instructions for running additional checks
-cat("\n")
-cat("*** TO CHECK ANOTHER RANDOM ROW ***\n")
-cat("Run this command in R console:\n")
-cat(" qa_check_random_row()\n")
-cat("\n")
-
-
-# Save the modified dataframe back to CSV
-# na="" writes NA values as empty cells instead of "NA" text
-# COMMENTED OUT FOR REVIEW - Uncomment when ready to save
-# write.csv(df, "eohi2.csv", row.names = FALSE, na = "")
-
-cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n")
-cat("Review the output above, then uncomment line 267 to save changes.\n")
-cat("\nProcessing complete! 12 new columns created (not yet saved to file).\n")
diff --git a/.history/eohi2/dataP - recode DGEN vars_20251001100044.r b/.history/eohi2/dataP - recode DGEN vars_20251001100044.r
deleted file mode 100644
index 9dc2e59..0000000
--- a/.history/eohi2/dataP - recode DGEN vars_20251001100044.r
+++ /dev/null
@@ -1,256 +0,0 @@
-# Script to combine DGEN variables in eohi2.csv
-# Combines 01 and 02 versions of DGEN items (no recoding, just copying values)
-
-# Load necessary library
-library(dplyr)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-# Read the data (with check.names=FALSE to preserve original column names)
-# na.strings=NULL keeps empty cells as empty strings instead of converting to NA
-df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL)
-
-# Define source column pairs (Set A and Set B)
-# NOTE: fut5/fut10 columns use _8 suffix and "Values" spelling based on CSV header
-source_cols_A <- c(
-  "01past5PrefDGEN_1",
-  "01past5PersDGEN_1",
-  "01past5ValDGEN_1",
-  "01past10PrefDGEN_1",
-  "01past10PersDGEN_1",
-  "01past10ValDGEN_1",
-  "01fut5PrefDGEN_8",
-  "01fut5PersDGEN_8",
-  "01fut5ValuesDGEN_1",
-  "01fut10PrefDGEN_8",
-  "01fut10PersDGEN_8",
-  "01fut10ValuesDGEN_1"
-)
-
-source_cols_B <- c(
-  "02past5PrefDGEN_1",
-  "02past5PersDGEN_1",
-  "02past5ValDGEN_1",
-  "02past10PrefDGEN_1",
-  "02past10PersDGEN_1",
-  "02past10ValDGEN_1",
-  "02fut5PrefDGEN_8",
-  "02fut5PersDGEN_8",
-  "02fut5ValDGEN_1",
-  "02fut10PrefDGEN_8",
-  "02fut10PersDGEN_8",
-  "02fut10ValDGEN_1"
-)
-
-# Define target column names
-target_cols <- c(
-  "DGEN_past_5_Pref",
-  "DGEN_past_5_Pers",
-  "DGEN_past_5_Val",
-  "DGEN_past_10_Pref",
-  "DGEN_past_10_Pers",
-  "DGEN_past_10_Val",
-  "DGEN_fut_5_Pref",
-  "DGEN_fut_5_Pers",
-  "DGEN_fut_5_Val",
-  "DGEN_fut_10_Pref",
-  "DGEN_fut_10_Pers",
-  "DGEN_fut_10_Val"
-)
-
-# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE =============
-cat("\n=== COLUMN EXISTENCE CHECK ===\n\n")
-
-# Get actual column names from dataframe (trimmed)
-df_cols <- trimws(names(df))
-
-# Print first 30 actual column names for debugging
-cat("First 30 actual column names in CSV:\n")
-for (i in 1:min(30, length(df_cols))) {
-  cat(sprintf(" %2d. '%s' (length: %d)\n", i, df_cols[i], nchar(df_cols[i])))
-}
-cat("\n")
-
-# Check Source A columns
-missing_A <- source_cols_A[!source_cols_A %in% df_cols]
-existing_A <- source_cols_A[source_cols_A %in% df_cols]
-
-cat("Source Set A:\n")
-cat(" Expected: 12 columns\n")
-cat(" Found:", length(existing_A), "columns\n")
-cat(" Missing:", length(missing_A), "columns\n")
-
-if (length(missing_A) > 0) {
-  cat("\n Missing columns from Set A:\n")
-  for (col in missing_A) {
-    cat(" -", col, "\n")
-  }
-}
-
-# Check Source B columns
-missing_B <- source_cols_B[!source_cols_B %in% df_cols]
-existing_B <- source_cols_B[source_cols_B %in% df_cols]
-
-cat("\nSource Set B:\n")
-cat(" Expected: 12 columns\n")
-cat(" Found:", length(existing_B), "columns\n")
-cat(" Missing:", length(missing_B), "columns\n")
-
-if (length(missing_B) > 0) {
-  cat("\n Missing columns from Set B:\n")
-  for (col in missing_B) {
-    cat(" -", col, "\n")
-  }
-}
-
-# Check for columns with similar names (potential typos/spaces)
-if (length(missing_A) > 0 || length(missing_B) > 0) {
-  cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n")
-  all_missing <- c(missing_A, missing_B)
-  for (miss_col in all_missing) {
-    # Find columns that start with similar pattern
-    pattern <- substr(miss_col, 1, 10)
-    similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE)
-    if (length(similar) > 0) {
-      cat("\nLooking for:", miss_col)
-      cat("\n Similar columns found:\n")
-      for (sim in similar) {
-        cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "")
-      }
-    }
-  }
-}
-
-cat("\n=== END CHECK ===\n\n")
-
-# Stop if critical columns are missing
-if (length(missing_A) > 6 || length(missing_B) > 6) {
-  stop("ERROR: Too many columns missing! Please check column names in CSV file.")
-}
-
-cat("Proceeding with processing...\n\n")
-
-# Process each pair of columns (just copy values, no recoding)
-for (i in 1:12) {
-  col_A <- source_cols_A[i]
-  col_B <- source_cols_B[i]
-  target_col <- target_cols[i]
-
-  # Get values from columns, handling missing columns
-  vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df))
-  vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df))
-
-  # Coalesce: take value from vals_A if present, otherwise from vals_B
-  # No recoding - just copy the value directly
-  combined <- ifelse(!is.na(vals_A) & vals_A != "",
-                     vals_A,
-                     vals_B)
-
-  # Copy directly to target column (no recoding)
-  df[[target_col]] <- combined
-
-  # Print progress
-  cat("Processed:", target_col, "\n")
-}
-
-# ============= VERIFY TARGET COLUMNS WERE CREATED =============
-cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n")
-
-# Get updated column names
-df_cols_after <- trimws(names(df))
-
-# Check which target columns exist
-existing_targets <- target_cols[target_cols %in% df_cols_after]
-missing_targets <- target_cols[!target_cols %in% df_cols_after]
-
-cat("Target Columns:\n")
-cat(" Expected: 12 columns\n")
-cat(" Created:", length(existing_targets), "columns\n")
-cat(" Missing:", length(missing_targets), "columns\n")
-
-if (length(missing_targets) > 0) {
-  cat("\n WARNING: The following target columns were NOT created:\n")
-  for (col in missing_targets) {
-    cat(" -", col, "\n")
-  }
-  stop("\nERROR: Not all target columns were created successfully!")
-} else {
-  cat("\n SUCCESS: All 12 target columns created successfully!\n")
-}
-
-cat("\n=== END VERIFICATION ===\n\n")
-
-
-# ============= QUALITY ASSURANCE: RANDOM ROW CHECK =============
-# This function can be run multiple times to check different random rows
-
-qa_check_random_row <- function() {
-  # Pick a random row
-  random_row <- sample(1:nrow(df), 1)
-
-  cat("\n========================================\n")
-  cat("QA CHECK: Random Row #", random_row, "\n")
-  cat("========================================\n\n")
-
-  # Check each of the 12 pairs
-  for (i in 1:12) {
-    col_A <- source_cols_A[i]
-    col_B <- source_cols_B[i]
-    target_col <- target_cols[i]
-
-    # Get values
-    val_A <- if (col_A %in% names(df)) df[random_row, col_A] else ""
-    val_B <- if (col_B %in% names(df)) df[random_row, col_B] else ""
-    target_val <- df[random_row, target_col]
-
-    # Determine which source had the value
-    has_val_A <- !is.na(val_A) && val_A != ""
-    has_val_B <- !is.na(val_B) && val_B != ""
-
-    if (has_val_A) {
-      source_used <- "A"
-      original_value <- val_A
-    } else if (has_val_B) {
-      source_used <- "B"
-      original_value <- val_B
-    } else {
-      source_used <- "NONE"
-      original_value <- "(empty)"
-    }
-
-    # Print the info
-    cat(sprintf("Pair %2d:\n", i))
-    cat(sprintf(" Source A: %-30s\n", col_A))
-    cat(sprintf(" Source B: %-30s\n", col_B))
-    cat(sprintf(" Target: %-30s\n", target_col))
-    cat(sprintf(" Value found in: Source %s\n", source_used))
-    cat(sprintf(" Original value: '%s'\n", original_value))
-    cat(sprintf(" Target value: '%s'\n", ifelse(is.na(target_val), "NA", as.character(target_val))))
-    cat("\n")
-  }
-
-  cat("========================================\n")
-  cat("END QA CHECK\n")
-  cat("========================================\n\n")
-}
-
-# Run QA check on first random row
-cat("\n\n")
-qa_check_random_row()
-
-# Instructions for running additional checks
-cat("\n")
-cat("*** TO CHECK ANOTHER RANDOM ROW ***\n")
-cat("Run this command in R console:\n")
-cat(" qa_check_random_row()\n")
-cat("\n")
-
-
-# Save the modified dataframe back to CSV
-# na="" writes NA values as empty cells instead of "NA" text
-# COMMENTED OUT FOR REVIEW - Uncomment when ready to save
-# write.csv(df, "eohi2.csv", row.names = FALSE, na = "")
-
-cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n")
-cat("Review the output above, then uncomment line 267 to save changes.\n")
-cat("\nProcessing complete! 
12 new columns created (not yet saved to file).\n") diff --git a/.history/eohi2/dataP - recode DGEN vars_20251001100344.r b/.history/eohi2/dataP - recode DGEN vars_20251001100344.r deleted file mode 100644 index 29ed2af..0000000 --- a/.history/eohi2/dataP - recode DGEN vars_20251001100344.r +++ /dev/null @@ -1,256 +0,0 @@ -# Script to combine DGEN variables in eohi2.csv -# Combines 01 and 02 versions of DGEN items (no recoding, just copying values) - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define source column pairs (Set A and Set B) -# NOTE: fut5/fut10 columns use _8 suffix and "Values" spelling based on CSV header -source_cols_A <- c( - "01past5PrefDGEN_1", - "01past5PersDGEN_1", - "01past5ValDGEN_1", - "01past10PrefDGEN_1", - "01past10PersDGEN_1", - "01past10ValDGEN_1", - "01fut5PrefDGEN_8", - "01fut5PersDGEN_8", - "01fut5ValuesDGEN_1", - "01fut10PrefDGEN_8", - "01fut10PersDGEN_8", - "01fut10ValuesDGEN_1" -) - -source_cols_B <- c( - "02past5PrefDGEN_1", - "02past5PersDGEN_1", - "02past5ValDGEN_1", - "02past10PrefDGEN_1", - "02past10PersDGEN_1", - "02past10ValDGEN_1", - "02fut5PrefDGEN_8", - "02fut5PersDGEN_8", - "02fut5ValDGEN_1", - "02fut10PrefDGEN_8", - "02fut10PersDGEN_8", - "02fut10ValDGEN_1" -) - -# Define target column names -target_cols <- c( - "DGEN_past_5_Pref", - "DGEN_past_5_Pers", - "DGEN_past_5_Val", - "DGEN_past_10_Pref", - "DGEN_past_10_Pers", - "DGEN_past_10_Val", - "DGEN_fut_5_Pref", - "DGEN_fut_5_Pers", - "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", - "DGEN_fut_10_Pers", - "DGEN_fut_10_Val" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Print first 30 actual column names for debugging -cat("First 30 actual column names in CSV:\n") -for (i in 1:min(30, length(df_cols))) { - cat(sprintf(" %2d. 
'%s' (length: %d)\n", i, df_cols[i], nchar(df_cols[i]))) -} -cat("\n") - -# Check Source A columns -missing_A <- source_cols_A[!source_cols_A %in% df_cols] -existing_A <- source_cols_A[source_cols_A %in% df_cols] - -cat("Source Set A:\n") -cat(" Expected: 12 columns\n") -cat(" Found:", length(existing_A), "columns\n") -cat(" Missing:", length(missing_A), "columns\n") - -if (length(missing_A) > 0) { - cat("\n Missing columns from Set A:\n") - for (col in missing_A) { - cat(" -", col, "\n") - } -} - -# Check Source B columns -missing_B <- source_cols_B[!source_cols_B %in% df_cols] -existing_B <- source_cols_B[source_cols_B %in% df_cols] - -cat("\nSource Set B:\n") -cat(" Expected: 12 columns\n") -cat(" Found:", length(existing_B), "columns\n") -cat(" Missing:", length(missing_B), "columns\n") - -if (length(missing_B) > 0) { - cat("\n Missing columns from Set B:\n") - for (col in missing_B) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_A) > 0 || length(missing_B) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - all_missing <- c(missing_A, missing_B) - for (miss_col in all_missing) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_A) > 6 || length(missing_B) > 6) { - stop("ERROR: Too many columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Process each pair of columns (just copy values, no recoding) -for (i in 1:12) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values from columns, handling missing columns - vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df)) - vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df)) - - # Coalesce: take value from vals_A if present, otherwise from vals_B - # No recoding - just copy the value directly - combined <- ifelse(!is.na(vals_A) & vals_A != "", - vals_A, - vals_B) - - # Copy directly to target column (no recoding) - df[[target_col]] <- combined - - # Print progress - cat("Processed:", target_col, "\n") -} - -# ============= VERIFY TARGET COLUMNS WERE CREATED ============= -cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n") - -# Get updated column names -df_cols_after <- trimws(names(df)) - -# Check which target columns exist -existing_targets <- target_cols[target_cols %in% df_cols_after] -missing_targets <- target_cols[!target_cols %in% df_cols_after] - -cat("Target Columns:\n") -cat(" Expected: 12 columns\n") -cat(" Created:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n WARNING: The following target columns were NOT created:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } - stop("\nERROR: Not all target columns were created successfully!") -} else { - cat("\n SUCCESS: All 12 target columns created successfully!\n") -} - -cat("\n=== END VERIFICATION ===\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - 
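# --- Sketch (not from the original script): the similar-name check above
# matches only on a shared 10-character prefix, so a typo later in the name
# slips through; base R's agrep() does approximate matching over the whole
# string and could be dropped in instead:
for (miss_col in c(missing_A, missing_B)) {
  hits <- agrep(miss_col, df_cols, max.distance = 0.2,
                value = TRUE, ignore.case = TRUE)
  if (length(hits) > 0) cat("Near matches for", miss_col, ":", hits, "\n")
}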
-qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - - cat("\n========================================\n") - cat("QA CHECK - CHECKING ROW #", random_row, " (out of ", nrow(df), " total rows)\n", sep = "") - cat("========================================\n\n") - - # Check each of the 12 pairs - for (i in 1:12) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values - val_A <- if (col_A %in% names(df)) df[random_row, col_A] else "" - val_B <- if (col_B %in% names(df)) df[random_row, col_B] else "" - target_val <- df[random_row, target_col] - - # Determine which source had the value - has_val_A <- !is.na(val_A) && val_A != "" - has_val_B <- !is.na(val_B) && val_B != "" - - if (has_val_A) { - source_used <- "A" - original_value <- val_A - } else if (has_val_B) { - source_used <- "B" - original_value <- val_B - } else { - source_used <- "NONE" - original_value <- "(empty)" - } - - # Print the info - cat(sprintf("Pair %2d:\n", i)) - cat(sprintf(" Source A: %-30s\n", col_A)) - cat(sprintf(" Source B: %-30s\n", col_B)) - cat(sprintf(" Target: %-30s\n", target_col)) - cat(sprintf(" Value found in: Source %s\n", source_used)) - cat(sprintf(" Original value: '%s'\n", original_value)) - cat(sprintf(" Target value: '%s'\n", ifelse(is.na(target_val), "NA", as.character(target_val)))) - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -# write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment the write.csv line above to save changes.\n") -cat("\nProcessing complete! 
12 new columns created (not yet saved to file).\n") diff --git a/.history/eohi2/dataP - recode DGEN vars_20251001100532.r b/.history/eohi2/dataP - recode DGEN vars_20251001100532.r deleted file mode 100644 index 66f25ab..0000000 --- a/.history/eohi2/dataP - recode DGEN vars_20251001100532.r +++ /dev/null @@ -1,255 +0,0 @@ -# Script to combine DGEN variables in eohi2.csv -# Combines 01 and 02 versions of DGEN items (no recoding, just copying values) - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define source column pairs (Set A and Set B) -# NOTE: fut5/fut10 columns use _8 suffix and "Values" spelling based on CSV header -source_cols_A <- c( - "01past5PrefDGEN_1", - "01past5PersDGEN_1", - "01past5ValDGEN_1", - "01past10PrefDGEN_1", - "01past10PersDGEN_1", - "01past10ValDGEN_1", - "01fut5PrefDGEN_8", - "01fut5PersDGEN_8", - "01fut5ValuesDGEN_1", - "01fut10PrefDGEN_8", - "01fut10PersDGEN_8", - "01fut10ValuesDGEN_1" -) - -source_cols_B <- c( - "02past5PrefDGEN_1", - "02past5PersDGEN_1", - "02past5ValDGEN_1", - "02past10PrefDGEN_1", - "02past10PersDGEN_1", - "02past10ValDGEN_1", - "02fut5PrefDGEN_8", - "02fut5PersDGEN_8", - "02fut5ValDGEN_1", - "02fut10PrefDGEN_8", - "02fut10PersDGEN_8", - "02fut10ValDGEN_1" -) - -# Define target column names -target_cols <- c( - "DGEN_past_5_Pref", - "DGEN_past_5_Pers", - "DGEN_past_5_Val", - "DGEN_past_10_Pref", - "DGEN_past_10_Pers", - "DGEN_past_10_Val", - "DGEN_fut_5_Pref", - "DGEN_fut_5_Pers", - "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", - "DGEN_fut_10_Pers", - "DGEN_fut_10_Val" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Print first 30 actual column names for debugging -cat("First 30 actual column names in CSV:\n") -for (i in 1:min(30, length(df_cols))) { - cat(sprintf(" %2d. 
'%s' (length: %d)\n", i, df_cols[i], nchar(df_cols[i]))) -} -cat("\n") - -# Check Source A columns -missing_A <- source_cols_A[!source_cols_A %in% df_cols] -existing_A <- source_cols_A[source_cols_A %in% df_cols] - -cat("Source Set A:\n") -cat(" Expected: 12 columns\n") -cat(" Found:", length(existing_A), "columns\n") -cat(" Missing:", length(missing_A), "columns\n") - -if (length(missing_A) > 0) { - cat("\n Missing columns from Set A:\n") - for (col in missing_A) { - cat(" -", col, "\n") - } -} - -# Check Source B columns -missing_B <- source_cols_B[!source_cols_B %in% df_cols] -existing_B <- source_cols_B[source_cols_B %in% df_cols] - -cat("\nSource Set B:\n") -cat(" Expected: 12 columns\n") -cat(" Found:", length(existing_B), "columns\n") -cat(" Missing:", length(missing_B), "columns\n") - -if (length(missing_B) > 0) { - cat("\n Missing columns from Set B:\n") - for (col in missing_B) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_A) > 0 || length(missing_B) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - all_missing <- c(missing_A, missing_B) - for (miss_col in all_missing) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_A) > 6 || length(missing_B) > 6) { - stop("ERROR: Too many columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Process each pair of columns (just copy values, no recoding) -for (i in 1:12) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values from columns, handling missing columns - vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df)) - vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df)) - - # Coalesce: take value from vals_A if present, otherwise from vals_B - # No recoding - just copy the value directly - combined <- ifelse(!is.na(vals_A) & vals_A != "", - vals_A, - vals_B) - - # Copy directly to target column (no recoding) - df[[target_col]] <- combined - - # Print progress - cat("Processed:", target_col, "\n") -} - -# ============= VERIFY TARGET COLUMNS WERE CREATED ============= -cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n") - -# Get updated column names -df_cols_after <- trimws(names(df)) - -# Check which target columns exist -existing_targets <- target_cols[target_cols %in% df_cols_after] -missing_targets <- target_cols[!target_cols %in% df_cols_after] - -cat("Target Columns:\n") -cat(" Expected: 12 columns\n") -cat(" Created:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n WARNING: The following target columns were NOT created:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } - stop("\nERROR: Not all target columns were created successfully!") -} else { - cat("\n SUCCESS: All 12 target columns created successfully!\n") -} - -cat("\n=== END VERIFICATION ===\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - 
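# --- Sketch (not from the original script): the spot-check function defined
# below samples one row at a time; auditing every row in one pass is also
# cheap, assuming all 24 source columns exist in df and using the same
# empty-string-as-missing convention as the processing loop:
for (i in 1:12) {
  expected <- ifelse(!is.na(df[[source_cols_A[i]]]) & df[[source_cols_A[i]]] != "",
                     df[[source_cols_A[i]]], df[[source_cols_B[i]]])
  stopifnot(identical(df[[target_cols[i]]], expected))
}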
-qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # Check each of the 12 pairs - for (i in 1:12) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values - val_A <- if (col_A %in% names(df)) df[random_row, col_A] else "" - val_B <- if (col_B %in% names(df)) df[random_row, col_B] else "" - target_val <- df[random_row, target_col] - - # Determine which source had the value - has_val_A <- !is.na(val_A) && val_A != "" - has_val_B <- !is.na(val_B) && val_B != "" - - if (has_val_A) { - source_used <- "A" - original_value <- val_A - } else if (has_val_B) { - source_used <- "B" - original_value <- val_B - } else { - source_used <- "NONE" - original_value <- "(empty)" - } - - # Print the info - cat(sprintf("Pair %2d:\n", i)) - cat(sprintf(" Source A: %-30s\n", col_A)) - cat(sprintf(" Source B: %-30s\n", col_B)) - cat(sprintf(" Target: %-30s\n", target_col)) - cat(sprintf(" Value found in: Source %s\n", source_used)) - cat(sprintf(" Original value: '%s'\n", original_value)) - cat(sprintf(" Target value: '%s'\n", ifelse(is.na(target_val), "NA", as.character(target_val)))) - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** OUTPUT WRITTEN TO eohi2.csv ***\n") -cat("Review the output above to confirm the merged columns look correct.\n") -cat("\nProcessing complete! 
12 new columns created and saved to eohi2.csv.\n") diff --git a/.history/eohi2/dataP - recode DGEN vars_20251001100534.r b/.history/eohi2/dataP - recode DGEN vars_20251001100534.r deleted file mode 100644 index 66f25ab..0000000 --- a/.history/eohi2/dataP - recode DGEN vars_20251001100534.r +++ /dev/null @@ -1,255 +0,0 @@ -# Script to combine DGEN variables in eohi2.csv -# Combines 01 and 02 versions of DGEN items (no recoding, just copying values) - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define source column pairs (Set A and Set B) -# NOTE: fut5/fut10 columns use _8 suffix and "Values" spelling based on CSV header -source_cols_A <- c( - "01past5PrefDGEN_1", - "01past5PersDGEN_1", - "01past5ValDGEN_1", - "01past10PrefDGEN_1", - "01past10PersDGEN_1", - "01past10ValDGEN_1", - "01fut5PrefDGEN_8", - "01fut5PersDGEN_8", - "01fut5ValuesDGEN_1", - "01fut10PrefDGEN_8", - "01fut10PersDGEN_8", - "01fut10ValuesDGEN_1" -) - -source_cols_B <- c( - "02past5PrefDGEN_1", - "02past5PersDGEN_1", - "02past5ValDGEN_1", - "02past10PrefDGEN_1", - "02past10PersDGEN_1", - "02past10ValDGEN_1", - "02fut5PrefDGEN_8", - "02fut5PersDGEN_8", - "02fut5ValDGEN_1", - "02fut10PrefDGEN_8", - "02fut10PersDGEN_8", - "02fut10ValDGEN_1" -) - -# Define target column names -target_cols <- c( - "DGEN_past_5_Pref", - "DGEN_past_5_Pers", - "DGEN_past_5_Val", - "DGEN_past_10_Pref", - "DGEN_past_10_Pers", - "DGEN_past_10_Val", - "DGEN_fut_5_Pref", - "DGEN_fut_5_Pers", - "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", - "DGEN_fut_10_Pers", - "DGEN_fut_10_Val" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Print first 30 actual column names for debugging -cat("First 30 actual column names in CSV:\n") -for (i in 1:min(30, length(df_cols))) { - cat(sprintf(" %2d. 
'%s' (length: %d)\n", i, df_cols[i], nchar(df_cols[i]))) -} -cat("\n") - -# Check Source A columns -missing_A <- source_cols_A[!source_cols_A %in% df_cols] -existing_A <- source_cols_A[source_cols_A %in% df_cols] - -cat("Source Set A:\n") -cat(" Expected: 12 columns\n") -cat(" Found:", length(existing_A), "columns\n") -cat(" Missing:", length(missing_A), "columns\n") - -if (length(missing_A) > 0) { - cat("\n Missing columns from Set A:\n") - for (col in missing_A) { - cat(" -", col, "\n") - } -} - -# Check Source B columns -missing_B <- source_cols_B[!source_cols_B %in% df_cols] -existing_B <- source_cols_B[source_cols_B %in% df_cols] - -cat("\nSource Set B:\n") -cat(" Expected: 12 columns\n") -cat(" Found:", length(existing_B), "columns\n") -cat(" Missing:", length(missing_B), "columns\n") - -if (length(missing_B) > 0) { - cat("\n Missing columns from Set B:\n") - for (col in missing_B) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_A) > 0 || length(missing_B) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - all_missing <- c(missing_A, missing_B) - for (miss_col in all_missing) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_A) > 6 || length(missing_B) > 6) { - stop("ERROR: Too many columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Process each pair of columns (just copy values, no recoding) -for (i in 1:12) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values from columns, handling missing columns - vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df)) - vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df)) - - # Coalesce: take value from vals_A if present, otherwise from vals_B - # No recoding - just copy the value directly - combined <- ifelse(!is.na(vals_A) & vals_A != "", - vals_A, - vals_B) - - # Copy directly to target column (no recoding) - df[[target_col]] <- combined - - # Print progress - cat("Processed:", target_col, "\n") -} - -# ============= VERIFY TARGET COLUMNS WERE CREATED ============= -cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n") - -# Get updated column names -df_cols_after <- trimws(names(df)) - -# Check which target columns exist -existing_targets <- target_cols[target_cols %in% df_cols_after] -missing_targets <- target_cols[!target_cols %in% df_cols_after] - -cat("Target Columns:\n") -cat(" Expected: 12 columns\n") -cat(" Created:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n WARNING: The following target columns were NOT created:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } - stop("\nERROR: Not all target columns were created successfully!") -} else { - cat("\n SUCCESS: All 12 target columns created successfully!\n") -} - -cat("\n=== END VERIFICATION ===\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - 
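For reference, the combining step above is a plain coalesce; once blanks are normalized to NA it could equally be written with dplyr's coalesce(). A minimal sketch, assuming character columns in which "" marks a missing response (the helper name is illustrative, not part of the original script):

library(dplyr)

# Take the value from `a` unless it is NA/blank, otherwise fall back to `b`
combine_pair <- function(a, b) coalesce(na_if(a, ""), na_if(b, ""))

combine_pair(c("x", "", NA), c("1", "2", "3"))
# -> "x" "2" "3"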
-qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # Check each of the 12 pairs - for (i in 1:12) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values - val_A <- if (col_A %in% names(df)) df[random_row, col_A] else "" - val_B <- if (col_B %in% names(df)) df[random_row, col_B] else "" - target_val <- df[random_row, target_col] - - # Determine which source had the value - has_val_A <- !is.na(val_A) && val_A != "" - has_val_B <- !is.na(val_B) && val_B != "" - - if (has_val_A) { - source_used <- "A" - original_value <- val_A - } else if (has_val_B) { - source_used <- "B" - original_value <- val_B - } else { - source_used <- "NONE" - original_value <- "(empty)" - } - - # Print the info - cat(sprintf("Pair %2d:\n", i)) - cat(sprintf(" Source A: %-30s\n", col_A)) - cat(sprintf(" Source B: %-30s\n", col_B)) - cat(sprintf(" Target: %-30s\n", target_col)) - cat(sprintf(" Value found in: Source %s\n", source_used)) - cat(sprintf(" Original value: '%s'\n", original_value)) - cat(sprintf(" Target value: '%s'\n", ifelse(is.na(target_val), "NA", as.character(target_val)))) - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -# write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 267 to save changes.\n") -cat("\nProcessing complete! 
12 new columns created (not yet saved to file).\n") diff --git a/.history/eohi2/dataP - recode DGEN vars_20251001100537.r b/.history/eohi2/dataP - recode DGEN vars_20251001100537.r deleted file mode 100644 index abcc128..0000000 --- a/.history/eohi2/dataP - recode DGEN vars_20251001100537.r +++ /dev/null @@ -1,253 +0,0 @@ -# Script to combine DGEN variables in eohi2.csv -# Combines 01 and 02 versions of DGEN items (no recoding, just copying values) - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define source column pairs (Set A and Set B) -# NOTE: fut5/fut10 columns use _8 suffix and "Values" spelling based on CSV header -source_cols_A <- c( - "01past5PrefDGEN_1", - "01past5PersDGEN_1", - "01past5ValDGEN_1", - "01past10PrefDGEN_1", - "01past10PersDGEN_1", - "01past10ValDGEN_1", - "01fut5PrefDGEN_8", - "01fut5PersDGEN_8", - "01fut5ValuesDGEN_1", - "01fut10PrefDGEN_8", - "01fut10PersDGEN_8", - "01fut10ValuesDGEN_1" -) - -source_cols_B <- c( - "02past5PrefDGEN_1", - "02past5PersDGEN_1", - "02past5ValDGEN_1", - "02past10PrefDGEN_1", - "02past10PersDGEN_1", - "02past10ValDGEN_1", - "02fut5PrefDGEN_8", - "02fut5PersDGEN_8", - "02fut5ValDGEN_1", - "02fut10PrefDGEN_8", - "02fut10PersDGEN_8", - "02fut10ValDGEN_1" -) - -# Define target column names -target_cols <- c( - "DGEN_past_5_Pref", - "DGEN_past_5_Pers", - "DGEN_past_5_Val", - "DGEN_past_10_Pref", - "DGEN_past_10_Pers", - "DGEN_past_10_Val", - "DGEN_fut_5_Pref", - "DGEN_fut_5_Pers", - "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", - "DGEN_fut_10_Pers", - "DGEN_fut_10_Val" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Print first 30 actual column names for debugging -cat("First 30 actual column names in CSV:\n") -for (i in 1:min(30, length(df_cols))) { - cat(sprintf(" %2d. 
'%s' (length: %d)\n", i, df_cols[i], nchar(df_cols[i]))) -} -cat("\n") - -# Check Source A columns -missing_A <- source_cols_A[!source_cols_A %in% df_cols] -existing_A <- source_cols_A[source_cols_A %in% df_cols] - -cat("Source Set A:\n") -cat(" Expected: 12 columns\n") -cat(" Found:", length(existing_A), "columns\n") -cat(" Missing:", length(missing_A), "columns\n") - -if (length(missing_A) > 0) { - cat("\n Missing columns from Set A:\n") - for (col in missing_A) { - cat(" -", col, "\n") - } -} - -# Check Source B columns -missing_B <- source_cols_B[!source_cols_B %in% df_cols] -existing_B <- source_cols_B[source_cols_B %in% df_cols] - -cat("\nSource Set B:\n") -cat(" Expected: 12 columns\n") -cat(" Found:", length(existing_B), "columns\n") -cat(" Missing:", length(missing_B), "columns\n") - -if (length(missing_B) > 0) { - cat("\n Missing columns from Set B:\n") - for (col in missing_B) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_A) > 0 || length(missing_B) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - all_missing <- c(missing_A, missing_B) - for (miss_col in all_missing) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_A) > 6 || length(missing_B) > 6) { - stop("ERROR: Too many columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Process each pair of columns (just copy values, no recoding) -for (i in 1:12) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values from columns, handling missing columns - vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df)) - vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df)) - - # Coalesce: take value from vals_A if present, otherwise from vals_B - # No recoding - just copy the value directly - combined <- ifelse(!is.na(vals_A) & vals_A != "", - vals_A, - vals_B) - - # Copy directly to target column (no recoding) - df[[target_col]] <- combined - - # Print progress - cat("Processed:", target_col, "\n") -} - -# ============= VERIFY TARGET COLUMNS WERE CREATED ============= -cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n") - -# Get updated column names -df_cols_after <- trimws(names(df)) - -# Check which target columns exist -existing_targets <- target_cols[target_cols %in% df_cols_after] -missing_targets <- target_cols[!target_cols %in% df_cols_after] - -cat("Target Columns:\n") -cat(" Expected: 12 columns\n") -cat(" Created:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n WARNING: The following target columns were NOT created:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } - stop("\nERROR: Not all target columns were created successfully!") -} else { - cat("\n SUCCESS: All 12 target columns created successfully!\n") -} - -cat("\n=== END VERIFICATION ===\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - 
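One caveat about the QA function defined below: sample() draws a fresh row on every call, so a surprising result cannot be re-inspected after the fact. Fixing the RNG seed before the call is one simple option (an addition for illustration, not in the original):

set.seed(123)           # any fixed value; makes the "random" row reproducible
qa_check_random_row()   # now inspects the same row on every re-run of the script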
-qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # Check each of the 12 pairs - for (i in 1:12) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values - val_A <- if (col_A %in% names(df)) df[random_row, col_A] else "" - val_B <- if (col_B %in% names(df)) df[random_row, col_B] else "" - target_val <- df[random_row, target_col] - - # Determine which source had the value - has_val_A <- !is.na(val_A) && val_A != "" - has_val_B <- !is.na(val_B) && val_B != "" - - if (has_val_A) { - source_used <- "A" - original_value <- val_A - } else if (has_val_B) { - source_used <- "B" - original_value <- val_B - } else { - source_used <- "NONE" - original_value <- "(empty)" - } - - # Print the info - cat(sprintf("Pair %2d:\n", i)) - cat(sprintf(" Source A: %-30s\n", col_A)) - cat(sprintf(" Source B: %-30s\n", col_B)) - cat(sprintf(" Target: %-30s\n", target_col)) - cat(sprintf(" Value found in: Source %s\n", source_used)) - cat(sprintf(" Original value: '%s'\n", original_value)) - cat(sprintf(" Target value: '%s'\n", ifelse(is.na(target_val), "NA", as.character(target_val)))) - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\nProcessing complete! 
12 new columns added to eohi2.csv\n") diff --git a/.history/eohi2/dataP - recode DGEN vars_20251001100544.r b/.history/eohi2/dataP - recode DGEN vars_20251001100544.r deleted file mode 100644 index abcc128..0000000 --- a/.history/eohi2/dataP - recode DGEN vars_20251001100544.r +++ /dev/null @@ -1,253 +0,0 @@ -# Script to combine DGEN variables in eohi2.csv -# Combines 01 and 02 versions of DGEN items (no recoding, just copying values) - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define source column pairs (Set A and Set B) -# NOTE: fut5/fut10 columns use _8 suffix and "Values" spelling based on CSV header -source_cols_A <- c( - "01past5PrefDGEN_1", - "01past5PersDGEN_1", - "01past5ValDGEN_1", - "01past10PrefDGEN_1", - "01past10PersDGEN_1", - "01past10ValDGEN_1", - "01fut5PrefDGEN_8", - "01fut5PersDGEN_8", - "01fut5ValuesDGEN_1", - "01fut10PrefDGEN_8", - "01fut10PersDGEN_8", - "01fut10ValuesDGEN_1" -) - -source_cols_B <- c( - "02past5PrefDGEN_1", - "02past5PersDGEN_1", - "02past5ValDGEN_1", - "02past10PrefDGEN_1", - "02past10PersDGEN_1", - "02past10ValDGEN_1", - "02fut5PrefDGEN_8", - "02fut5PersDGEN_8", - "02fut5ValDGEN_1", - "02fut10PrefDGEN_8", - "02fut10PersDGEN_8", - "02fut10ValDGEN_1" -) - -# Define target column names -target_cols <- c( - "DGEN_past_5_Pref", - "DGEN_past_5_Pers", - "DGEN_past_5_Val", - "DGEN_past_10_Pref", - "DGEN_past_10_Pers", - "DGEN_past_10_Val", - "DGEN_fut_5_Pref", - "DGEN_fut_5_Pers", - "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", - "DGEN_fut_10_Pers", - "DGEN_fut_10_Val" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Print first 30 actual column names for debugging -cat("First 30 actual column names in CSV:\n") -for (i in 1:min(30, length(df_cols))) { - cat(sprintf(" %2d. 
'%s' (length: %d)\n", i, df_cols[i], nchar(df_cols[i]))) -} -cat("\n") - -# Check Source A columns -missing_A <- source_cols_A[!source_cols_A %in% df_cols] -existing_A <- source_cols_A[source_cols_A %in% df_cols] - -cat("Source Set A:\n") -cat(" Expected: 12 columns\n") -cat(" Found:", length(existing_A), "columns\n") -cat(" Missing:", length(missing_A), "columns\n") - -if (length(missing_A) > 0) { - cat("\n Missing columns from Set A:\n") - for (col in missing_A) { - cat(" -", col, "\n") - } -} - -# Check Source B columns -missing_B <- source_cols_B[!source_cols_B %in% df_cols] -existing_B <- source_cols_B[source_cols_B %in% df_cols] - -cat("\nSource Set B:\n") -cat(" Expected: 12 columns\n") -cat(" Found:", length(existing_B), "columns\n") -cat(" Missing:", length(missing_B), "columns\n") - -if (length(missing_B) > 0) { - cat("\n Missing columns from Set B:\n") - for (col in missing_B) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_A) > 0 || length(missing_B) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - all_missing <- c(missing_A, missing_B) - for (miss_col in all_missing) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_A) > 6 || length(missing_B) > 6) { - stop("ERROR: Too many columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Process each pair of columns (just copy values, no recoding) -for (i in 1:12) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values from columns, handling missing columns - vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df)) - vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df)) - - # Coalesce: take value from vals_A if present, otherwise from vals_B - # No recoding - just copy the value directly - combined <- ifelse(!is.na(vals_A) & vals_A != "", - vals_A, - vals_B) - - # Copy directly to target column (no recoding) - df[[target_col]] <- combined - - # Print progress - cat("Processed:", target_col, "\n") -} - -# ============= VERIFY TARGET COLUMNS WERE CREATED ============= -cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n") - -# Get updated column names -df_cols_after <- trimws(names(df)) - -# Check which target columns exist -existing_targets <- target_cols[target_cols %in% df_cols_after] -missing_targets <- target_cols[!target_cols %in% df_cols_after] - -cat("Target Columns:\n") -cat(" Expected: 12 columns\n") -cat(" Created:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n WARNING: The following target columns were NOT created:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } - stop("\nERROR: Not all target columns were created successfully!") -} else { - cat("\n SUCCESS: All 12 target columns created successfully!\n") -} - -cat("\n=== END VERIFICATION ===\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - 
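A note on the similar-name search above: matching on a fixed 10-character prefix via grep() misses typos that occur later in the name. Base R's agrep() does approximate (edit-distance) matching and would surface the Val/Values discrepancy directly; a sketch under that assumption (helper name illustrative):

near_matches <- function(name, candidates) {
  # allow an edit distance of up to 20% of the pattern length
  agrep(name, candidates, max.distance = 0.2, value = TRUE, ignore.case = TRUE)
}

near_matches("01fut5ValDGEN_1", c("01fut5ValuesDGEN_1", "01fut5PrefDGEN_8"))
# -> "01fut5ValuesDGEN_1"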
-qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # Check each of the 12 pairs - for (i in 1:12) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values - val_A <- if (col_A %in% names(df)) df[random_row, col_A] else "" - val_B <- if (col_B %in% names(df)) df[random_row, col_B] else "" - target_val <- df[random_row, target_col] - - # Determine which source had the value - has_val_A <- !is.na(val_A) && val_A != "" - has_val_B <- !is.na(val_B) && val_B != "" - - if (has_val_A) { - source_used <- "A" - original_value <- val_A - } else if (has_val_B) { - source_used <- "B" - original_value <- val_B - } else { - source_used <- "NONE" - original_value <- "(empty)" - } - - # Print the info - cat(sprintf("Pair %2d:\n", i)) - cat(sprintf(" Source A: %-30s\n", col_A)) - cat(sprintf(" Source B: %-30s\n", col_B)) - cat(sprintf(" Target: %-30s\n", target_col)) - cat(sprintf(" Value found in: Source %s\n", source_used)) - cat(sprintf(" Original value: '%s'\n", original_value)) - cat(sprintf(" Target value: '%s'\n", ifelse(is.na(target_val), "NA", as.character(target_val)))) - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\nProcessing complete! 
12 new columns added to eohi2.csv\n") diff --git a/.history/eohi2/dataP - recode DGEN vars_20251001100547.r b/.history/eohi2/dataP - recode DGEN vars_20251001100547.r deleted file mode 100644 index abcc128..0000000 --- a/.history/eohi2/dataP - recode DGEN vars_20251001100547.r +++ /dev/null @@ -1,253 +0,0 @@ -# Script to combine DGEN variables in eohi2.csv -# Combines 01 and 02 versions of DGEN items (no recoding, just copying values) - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define source column pairs (Set A and Set B) -# NOTE: fut5/fut10 columns use _8 suffix and "Values" spelling based on CSV header -source_cols_A <- c( - "01past5PrefDGEN_1", - "01past5PersDGEN_1", - "01past5ValDGEN_1", - "01past10PrefDGEN_1", - "01past10PersDGEN_1", - "01past10ValDGEN_1", - "01fut5PrefDGEN_8", - "01fut5PersDGEN_8", - "01fut5ValuesDGEN_1", - "01fut10PrefDGEN_8", - "01fut10PersDGEN_8", - "01fut10ValuesDGEN_1" -) - -source_cols_B <- c( - "02past5PrefDGEN_1", - "02past5PersDGEN_1", - "02past5ValDGEN_1", - "02past10PrefDGEN_1", - "02past10PersDGEN_1", - "02past10ValDGEN_1", - "02fut5PrefDGEN_8", - "02fut5PersDGEN_8", - "02fut5ValDGEN_1", - "02fut10PrefDGEN_8", - "02fut10PersDGEN_8", - "02fut10ValDGEN_1" -) - -# Define target column names -target_cols <- c( - "DGEN_past_5_Pref", - "DGEN_past_5_Pers", - "DGEN_past_5_Val", - "DGEN_past_10_Pref", - "DGEN_past_10_Pers", - "DGEN_past_10_Val", - "DGEN_fut_5_Pref", - "DGEN_fut_5_Pers", - "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", - "DGEN_fut_10_Pers", - "DGEN_fut_10_Val" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Print first 30 actual column names for debugging -cat("First 30 actual column names in CSV:\n") -for (i in 1:min(30, length(df_cols))) { - cat(sprintf(" %2d. 
'%s' (length: %d)\n", i, df_cols[i], nchar(df_cols[i]))) -} -cat("\n") - -# Check Source A columns -missing_A <- source_cols_A[!source_cols_A %in% df_cols] -existing_A <- source_cols_A[source_cols_A %in% df_cols] - -cat("Source Set A:\n") -cat(" Expected: 12 columns\n") -cat(" Found:", length(existing_A), "columns\n") -cat(" Missing:", length(missing_A), "columns\n") - -if (length(missing_A) > 0) { - cat("\n Missing columns from Set A:\n") - for (col in missing_A) { - cat(" -", col, "\n") - } -} - -# Check Source B columns -missing_B <- source_cols_B[!source_cols_B %in% df_cols] -existing_B <- source_cols_B[source_cols_B %in% df_cols] - -cat("\nSource Set B:\n") -cat(" Expected: 12 columns\n") -cat(" Found:", length(existing_B), "columns\n") -cat(" Missing:", length(missing_B), "columns\n") - -if (length(missing_B) > 0) { - cat("\n Missing columns from Set B:\n") - for (col in missing_B) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_A) > 0 || length(missing_B) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - all_missing <- c(missing_A, missing_B) - for (miss_col in all_missing) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_A) > 6 || length(missing_B) > 6) { - stop("ERROR: Too many columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Process each pair of columns (just copy values, no recoding) -for (i in 1:12) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values from columns, handling missing columns - vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df)) - vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df)) - - # Coalesce: take value from vals_A if present, otherwise from vals_B - # No recoding - just copy the value directly - combined <- ifelse(!is.na(vals_A) & vals_A != "", - vals_A, - vals_B) - - # Copy directly to target column (no recoding) - df[[target_col]] <- combined - - # Print progress - cat("Processed:", target_col, "\n") -} - -# ============= VERIFY TARGET COLUMNS WERE CREATED ============= -cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n") - -# Get updated column names -df_cols_after <- trimws(names(df)) - -# Check which target columns exist -existing_targets <- target_cols[target_cols %in% df_cols_after] -missing_targets <- target_cols[!target_cols %in% df_cols_after] - -cat("Target Columns:\n") -cat(" Expected: 12 columns\n") -cat(" Created:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n WARNING: The following target columns were NOT created:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } - stop("\nERROR: Not all target columns were created successfully!") -} else { - cat("\n SUCCESS: All 12 target columns created successfully!\n") -} - -cat("\n=== END VERIFICATION ===\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - 
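Worth noting about the I/O pair these scripts use: reading with na.strings = NULL keeps blank cells as "" while writing with na = "" turns NA into blanks, so NA and "" collapse into the same on-disk value after one pass. A small round-trip illustration (temporary file and toy data, not from the original):

tmp <- tempfile(fileext = ".csv")
write.csv(data.frame(x = c("a", NA, "")), tmp, row.names = FALSE, na = "")
read.csv(tmp, stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL)$x
# -> "a" "" ""   (the NA and the empty string are no longer distinguishable)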
-qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # Check each of the 12 pairs - for (i in 1:12) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values - val_A <- if (col_A %in% names(df)) df[random_row, col_A] else "" - val_B <- if (col_B %in% names(df)) df[random_row, col_B] else "" - target_val <- df[random_row, target_col] - - # Determine which source had the value - has_val_A <- !is.na(val_A) && val_A != "" - has_val_B <- !is.na(val_B) && val_B != "" - - if (has_val_A) { - source_used <- "A" - original_value <- val_A - } else if (has_val_B) { - source_used <- "B" - original_value <- val_B - } else { - source_used <- "NONE" - original_value <- "(empty)" - } - - # Print the info - cat(sprintf("Pair %2d:\n", i)) - cat(sprintf(" Source A: %-30s\n", col_A)) - cat(sprintf(" Source B: %-30s\n", col_B)) - cat(sprintf(" Target: %-30s\n", target_col)) - cat(sprintf(" Value found in: Source %s\n", source_used)) - cat(sprintf(" Original value: '%s'\n", original_value)) - cat(sprintf(" Target value: '%s'\n", ifelse(is.na(target_val), "NA", as.character(target_val)))) - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\nProcessing complete! 
12 new columns added to eohi2.csv\n") diff --git a/.history/eohi2/dataP - recode DGEN vars_20251001100624.r b/.history/eohi2/dataP - recode DGEN vars_20251001100624.r deleted file mode 100644 index abcc128..0000000 --- a/.history/eohi2/dataP - recode DGEN vars_20251001100624.r +++ /dev/null @@ -1,253 +0,0 @@ -# Script to combine DGEN variables in eohi2.csv -# Combines 01 and 02 versions of DGEN items (no recoding, just copying values) - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define source column pairs (Set A and Set B) -# NOTE: fut5/fut10 columns use _8 suffix and "Values" spelling based on CSV header -source_cols_A <- c( - "01past5PrefDGEN_1", - "01past5PersDGEN_1", - "01past5ValDGEN_1", - "01past10PrefDGEN_1", - "01past10PersDGEN_1", - "01past10ValDGEN_1", - "01fut5PrefDGEN_8", - "01fut5PersDGEN_8", - "01fut5ValuesDGEN_1", - "01fut10PrefDGEN_8", - "01fut10PersDGEN_8", - "01fut10ValuesDGEN_1" -) - -source_cols_B <- c( - "02past5PrefDGEN_1", - "02past5PersDGEN_1", - "02past5ValDGEN_1", - "02past10PrefDGEN_1", - "02past10PersDGEN_1", - "02past10ValDGEN_1", - "02fut5PrefDGEN_8", - "02fut5PersDGEN_8", - "02fut5ValDGEN_1", - "02fut10PrefDGEN_8", - "02fut10PersDGEN_8", - "02fut10ValDGEN_1" -) - -# Define target column names -target_cols <- c( - "DGEN_past_5_Pref", - "DGEN_past_5_Pers", - "DGEN_past_5_Val", - "DGEN_past_10_Pref", - "DGEN_past_10_Pers", - "DGEN_past_10_Val", - "DGEN_fut_5_Pref", - "DGEN_fut_5_Pers", - "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", - "DGEN_fut_10_Pers", - "DGEN_fut_10_Val" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Print first 30 actual column names for debugging -cat("First 30 actual column names in CSV:\n") -for (i in 1:min(30, length(df_cols))) { - cat(sprintf(" %2d. 
'%s' (length: %d)\n", i, df_cols[i], nchar(df_cols[i]))) -} -cat("\n") - -# Check Source A columns -missing_A <- source_cols_A[!source_cols_A %in% df_cols] -existing_A <- source_cols_A[source_cols_A %in% df_cols] - -cat("Source Set A:\n") -cat(" Expected: 12 columns\n") -cat(" Found:", length(existing_A), "columns\n") -cat(" Missing:", length(missing_A), "columns\n") - -if (length(missing_A) > 0) { - cat("\n Missing columns from Set A:\n") - for (col in missing_A) { - cat(" -", col, "\n") - } -} - -# Check Source B columns -missing_B <- source_cols_B[!source_cols_B %in% df_cols] -existing_B <- source_cols_B[source_cols_B %in% df_cols] - -cat("\nSource Set B:\n") -cat(" Expected: 12 columns\n") -cat(" Found:", length(existing_B), "columns\n") -cat(" Missing:", length(missing_B), "columns\n") - -if (length(missing_B) > 0) { - cat("\n Missing columns from Set B:\n") - for (col in missing_B) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_A) > 0 || length(missing_B) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - all_missing <- c(missing_A, missing_B) - for (miss_col in all_missing) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_A) > 6 || length(missing_B) > 6) { - stop("ERROR: Too many columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Process each pair of columns (just copy values, no recoding) -for (i in 1:12) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values from columns, handling missing columns - vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df)) - vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df)) - - # Coalesce: take value from vals_A if present, otherwise from vals_B - # No recoding - just copy the value directly - combined <- ifelse(!is.na(vals_A) & vals_A != "", - vals_A, - vals_B) - - # Copy directly to target column (no recoding) - df[[target_col]] <- combined - - # Print progress - cat("Processed:", target_col, "\n") -} - -# ============= VERIFY TARGET COLUMNS WERE CREATED ============= -cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n") - -# Get updated column names -df_cols_after <- trimws(names(df)) - -# Check which target columns exist -existing_targets <- target_cols[target_cols %in% df_cols_after] -missing_targets <- target_cols[!target_cols %in% df_cols_after] - -cat("Target Columns:\n") -cat(" Expected: 12 columns\n") -cat(" Created:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n WARNING: The following target columns were NOT created:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } - stop("\nERROR: Not all target columns were created successfully!") -} else { - cat("\n SUCCESS: All 12 target columns created successfully!\n") -} - -cat("\n=== END VERIFICATION ===\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - 
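The pairwise loop above is index-driven; the same operation can be phrased over the paired name vectors with Map(), which drops the manual 1:12 bookkeeping. A behavior-preserving sketch (the get_col helper is hypothetical):

get_col <- function(d, nm) if (nm %in% names(d)) d[[nm]] else rep(NA_character_, nrow(d))

df[target_cols] <- Map(function(a, b) {
  va <- get_col(df, a)
  vb <- get_col(df, b)
  ifelse(!is.na(va) & va != "", va, vb)   # same coalesce rule as the loop
}, source_cols_A, source_cols_B)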
-qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # Check each of the 12 pairs - for (i in 1:12) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values - val_A <- if (col_A %in% names(df)) df[random_row, col_A] else "" - val_B <- if (col_B %in% names(df)) df[random_row, col_B] else "" - target_val <- df[random_row, target_col] - - # Determine which source had the value - has_val_A <- !is.na(val_A) && val_A != "" - has_val_B <- !is.na(val_B) && val_B != "" - - if (has_val_A) { - source_used <- "A" - original_value <- val_A - } else if (has_val_B) { - source_used <- "B" - original_value <- val_B - } else { - source_used <- "NONE" - original_value <- "(empty)" - } - - # Print the info - cat(sprintf("Pair %2d:\n", i)) - cat(sprintf(" Source A: %-30s\n", col_A)) - cat(sprintf(" Source B: %-30s\n", col_B)) - cat(sprintf(" Target: %-30s\n", target_col)) - cat(sprintf(" Value found in: Source %s\n", source_used)) - cat(sprintf(" Original value: '%s'\n", original_value)) - cat(sprintf(" Target value: '%s'\n", ifelse(is.na(target_val), "NA", as.character(target_val)))) - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\nProcessing complete! 
12 new columns added to eohi2.csv\n") diff --git a/.history/eohi2/dataP - recode DGEN vars_20251001105736.r b/.history/eohi2/dataP - recode DGEN vars_20251001105736.r deleted file mode 100644 index abcc128..0000000 --- a/.history/eohi2/dataP - recode DGEN vars_20251001105736.r +++ /dev/null @@ -1,253 +0,0 @@ -# Script to combine DGEN variables in eohi2.csv -# Combines 01 and 02 versions of DGEN items (no recoding, just copying values) - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define source column pairs (Set A and Set B) -# NOTE: fut5/fut10 columns use _8 suffix and "Values" spelling based on CSV header -source_cols_A <- c( - "01past5PrefDGEN_1", - "01past5PersDGEN_1", - "01past5ValDGEN_1", - "01past10PrefDGEN_1", - "01past10PersDGEN_1", - "01past10ValDGEN_1", - "01fut5PrefDGEN_8", - "01fut5PersDGEN_8", - "01fut5ValuesDGEN_1", - "01fut10PrefDGEN_8", - "01fut10PersDGEN_8", - "01fut10ValuesDGEN_1" -) - -source_cols_B <- c( - "02past5PrefDGEN_1", - "02past5PersDGEN_1", - "02past5ValDGEN_1", - "02past10PrefDGEN_1", - "02past10PersDGEN_1", - "02past10ValDGEN_1", - "02fut5PrefDGEN_8", - "02fut5PersDGEN_8", - "02fut5ValDGEN_1", - "02fut10PrefDGEN_8", - "02fut10PersDGEN_8", - "02fut10ValDGEN_1" -) - -# Define target column names -target_cols <- c( - "DGEN_past_5_Pref", - "DGEN_past_5_Pers", - "DGEN_past_5_Val", - "DGEN_past_10_Pref", - "DGEN_past_10_Pers", - "DGEN_past_10_Val", - "DGEN_fut_5_Pref", - "DGEN_fut_5_Pers", - "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", - "DGEN_fut_10_Pers", - "DGEN_fut_10_Val" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Print first 30 actual column names for debugging -cat("First 30 actual column names in CSV:\n") -for (i in 1:min(30, length(df_cols))) { - cat(sprintf(" %2d. 
'%s' (length: %d)\n", i, df_cols[i], nchar(df_cols[i]))) -} -cat("\n") - -# Check Source A columns -missing_A <- source_cols_A[!source_cols_A %in% df_cols] -existing_A <- source_cols_A[source_cols_A %in% df_cols] - -cat("Source Set A:\n") -cat(" Expected: 12 columns\n") -cat(" Found:", length(existing_A), "columns\n") -cat(" Missing:", length(missing_A), "columns\n") - -if (length(missing_A) > 0) { - cat("\n Missing columns from Set A:\n") - for (col in missing_A) { - cat(" -", col, "\n") - } -} - -# Check Source B columns -missing_B <- source_cols_B[!source_cols_B %in% df_cols] -existing_B <- source_cols_B[source_cols_B %in% df_cols] - -cat("\nSource Set B:\n") -cat(" Expected: 12 columns\n") -cat(" Found:", length(existing_B), "columns\n") -cat(" Missing:", length(missing_B), "columns\n") - -if (length(missing_B) > 0) { - cat("\n Missing columns from Set B:\n") - for (col in missing_B) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_A) > 0 || length(missing_B) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - all_missing <- c(missing_A, missing_B) - for (miss_col in all_missing) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_A) > 6 || length(missing_B) > 6) { - stop("ERROR: Too many columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Process each pair of columns (just copy values, no recoding) -for (i in 1:12) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values from columns, handling missing columns - vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df)) - vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df)) - - # Coalesce: take value from vals_A if present, otherwise from vals_B - # No recoding - just copy the value directly - combined <- ifelse(!is.na(vals_A) & vals_A != "", - vals_A, - vals_B) - - # Copy directly to target column (no recoding) - df[[target_col]] <- combined - - # Print progress - cat("Processed:", target_col, "\n") -} - -# ============= VERIFY TARGET COLUMNS WERE CREATED ============= -cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n") - -# Get updated column names -df_cols_after <- trimws(names(df)) - -# Check which target columns exist -existing_targets <- target_cols[target_cols %in% df_cols_after] -missing_targets <- target_cols[!target_cols %in% df_cols_after] - -cat("Target Columns:\n") -cat(" Expected: 12 columns\n") -cat(" Created:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n WARNING: The following target columns were NOT created:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } - stop("\nERROR: Not all target columns were created successfully!") -} else { - cat("\n SUCCESS: All 12 target columns created successfully!\n") -} - -cat("\n=== END VERIFICATION ===\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - 
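As a complement to the single-row spot check defined below, a whole-column tally would show how many values each target column took from Set A, from Set B, or from neither. A small sketch along those lines (function name hypothetical, not in the original):

qa_source_tally <- function(i) {
  va <- if (source_cols_A[i] %in% names(df)) df[[source_cols_A[i]]] else rep(NA, nrow(df))
  vb <- if (source_cols_B[i] %in% names(df)) df[[source_cols_B[i]]] else rep(NA, nrow(df))
  from_A <- !is.na(va) & va != ""
  from_B <- !from_A & !is.na(vb) & vb != ""
  c(A = sum(from_A), B = sum(from_B), none = sum(!from_A & !from_B))
}

sapply(seq_along(target_cols), qa_source_tally)   # one column of counts per pair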
-qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # Check each of the 12 pairs - for (i in 1:12) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values - val_A <- if (col_A %in% names(df)) df[random_row, col_A] else "" - val_B <- if (col_B %in% names(df)) df[random_row, col_B] else "" - target_val <- df[random_row, target_col] - - # Determine which source had the value - has_val_A <- !is.na(val_A) && val_A != "" - has_val_B <- !is.na(val_B) && val_B != "" - - if (has_val_A) { - source_used <- "A" - original_value <- val_A - } else if (has_val_B) { - source_used <- "B" - original_value <- val_B - } else { - source_used <- "NONE" - original_value <- "(empty)" - } - - # Print the info - cat(sprintf("Pair %2d:\n", i)) - cat(sprintf(" Source A: %-30s\n", col_A)) - cat(sprintf(" Source B: %-30s\n", col_B)) - cat(sprintf(" Target: %-30s\n", target_col)) - cat(sprintf(" Value found in: Source %s\n", source_used)) - cat(sprintf(" Original value: '%s'\n", original_value)) - cat(sprintf(" Target value: '%s'\n", ifelse(is.na(target_val), "NA", as.character(target_val)))) - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\nProcessing complete! 
12 new columns added to eohi2.csv\n") diff --git a/.history/eohi2/dataP - recode present VARS_20251001110617.r b/.history/eohi2/dataP - recode present VARS_20251001110617.r deleted file mode 100644 index da4221c..0000000 --- a/.history/eohi2/dataP - recode present VARS_20251001110617.r +++ /dev/null @@ -1,197 +0,0 @@ -# Script to recode present-time Likert scale items in eohi2.csv -# Recodes prePrefItem, prePersItem, and preValItem to numeric values - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the mapping function -recode_likert <- function(x) { - case_when( - tolower(x) == "strongly disagree" ~ -3, - tolower(x) == "disagree" ~ -2, - tolower(x) == "somewhat disagree" ~ -1, - tolower(x) == "neither agree nor disagree" ~ 0, - tolower(x) == "somewhat agree" ~ 1, - tolower(x) == "agree" ~ 2, - tolower(x) == "strongly agree" ~ 3, - TRUE ~ NA_real_ - ) -} - -# Define source columns (15 columns total) -source_cols <- c( - "prePrefItem_1", "prePrefItem_2", "prePrefItem_3", "prePrefItem_4", "prePrefItem_5", - "prePersItem_1", "prePersItem_2", "prePersItem_3", "prePersItem_4", "prePersItem_5", - "preValItem_1", "preValItem_2", "preValItem_3", "preValItem_4", "preValItem_5" -) - -# Define target column names (15 columns total) -target_cols <- c( - "present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel", - "present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex", - "present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Print first 30 actual column names for debugging -cat("First 30 actual column names in CSV:\n") -for (i in 1:min(30, length(df_cols))) { - cat(sprintf(" %2d. '%s' (length: %d)\n", i, df_cols[i], nchar(df_cols[i]))) -} -cat("\n") - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 15 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0) { - cat("\n Missing columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_source) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - for (miss_col in missing_source) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 7) { - stop("ERROR: Too many columns missing! 
Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Process each column -for (i in 1:15) { - source_col <- source_cols[i] - target_col <- target_cols[i] - - # Get values from source column, handling missing columns - source_vals <- if (source_col %in% names(df)) df[[source_col]] else rep(NA, nrow(df)) - - # Recode to numeric - df[[target_col]] <- recode_likert(source_vals) - - # Print progress - cat("Processed:", target_col, "\n") -} - -# ============= VERIFY TARGET COLUMNS WERE CREATED ============= -cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n") - -# Get updated column names -df_cols_after <- trimws(names(df)) - -# Check which target columns exist -existing_targets <- target_cols[target_cols %in% df_cols_after] -missing_targets <- target_cols[!target_cols %in% df_cols_after] - -cat("Target Columns:\n") -cat(" Expected: 15 columns\n") -cat(" Created:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n WARNING: The following target columns were NOT created:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } - stop("\nERROR: Not all target columns were created successfully!") -} else { - cat("\n SUCCESS: All 15 target columns created successfully!\n") -} - -cat("\n=== END VERIFICATION ===\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # Check each of the 15 columns - for (i in 1:15) { - source_col <- source_cols[i] - target_col <- target_cols[i] - - # Get values - source_val <- if (source_col %in% names(df)) df[random_row, source_col] else "" - target_val <- df[random_row, target_col] - - # Determine if source has a value - has_val <- !is.na(source_val) && source_val != "" - - original_text <- if (has_val) source_val else "(empty)" - - # Print the info - cat(sprintf("Column %2d:\n", i)) - cat(sprintf(" Source: %-30s\n", source_col)) - cat(sprintf(" Target: %-30s\n", target_col)) - cat(sprintf(" Original text: '%s'\n", original_text)) - cat(sprintf(" Numeric value: %s\n", ifelse(is.na(target_val), "NA", as.character(target_val)))) - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -# write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 189 to save changes.\n") -cat("\nProcessing complete! 
15 new columns created (not yet saved to file).\n") diff --git a/.history/eohi2/dataP - recode present VARS_20251001110629.r b/.history/eohi2/dataP - recode present VARS_20251001110629.r deleted file mode 100644 index da4221c..0000000 --- a/.history/eohi2/dataP - recode present VARS_20251001110629.r +++ /dev/null @@ -1,197 +0,0 @@ -# Script to recode present-time Likert scale items in eohi2.csv -# Recodes prePrefItem, prePersItem, and preValItem to numeric values - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the mapping function -recode_likert <- function(x) { - case_when( - tolower(x) == "strongly disagree" ~ -3, - tolower(x) == "disagree" ~ -2, - tolower(x) == "somewhat disagree" ~ -1, - tolower(x) == "neither agree nor disagree" ~ 0, - tolower(x) == "somewhat agree" ~ 1, - tolower(x) == "agree" ~ 2, - tolower(x) == "strongly agree" ~ 3, - TRUE ~ NA_real_ - ) -} - -# Define source columns (15 columns total) -source_cols <- c( - "prePrefItem_1", "prePrefItem_2", "prePrefItem_3", "prePrefItem_4", "prePrefItem_5", - "prePersItem_1", "prePersItem_2", "prePersItem_3", "prePersItem_4", "prePersItem_5", - "preValItem_1", "preValItem_2", "preValItem_3", "preValItem_4", "preValItem_5" -) - -# Define target column names (15 columns total) -target_cols <- c( - "present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel", - "present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex", - "present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Print first 30 actual column names for debugging -cat("First 30 actual column names in CSV:\n") -for (i in 1:min(30, length(df_cols))) { - cat(sprintf(" %2d. '%s' (length: %d)\n", i, df_cols[i], nchar(df_cols[i]))) -} -cat("\n") - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 15 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0) { - cat("\n Missing columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_source) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - for (miss_col in missing_source) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 7) { - stop("ERROR: Too many columns missing! 
Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Process each column -for (i in 1:15) { - source_col <- source_cols[i] - target_col <- target_cols[i] - - # Get values from source column, handling missing columns - source_vals <- if (source_col %in% names(df)) df[[source_col]] else rep(NA, nrow(df)) - - # Recode to numeric - df[[target_col]] <- recode_likert(source_vals) - - # Print progress - cat("Processed:", target_col, "\n") -} - -# ============= VERIFY TARGET COLUMNS WERE CREATED ============= -cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n") - -# Get updated column names -df_cols_after <- trimws(names(df)) - -# Check which target columns exist -existing_targets <- target_cols[target_cols %in% df_cols_after] -missing_targets <- target_cols[!target_cols %in% df_cols_after] - -cat("Target Columns:\n") -cat(" Expected: 15 columns\n") -cat(" Created:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n WARNING: The following target columns were NOT created:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } - stop("\nERROR: Not all target columns were created successfully!") -} else { - cat("\n SUCCESS: All 15 target columns created successfully!\n") -} - -cat("\n=== END VERIFICATION ===\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # Check each of the 15 columns - for (i in 1:15) { - source_col <- source_cols[i] - target_col <- target_cols[i] - - # Get values - source_val <- if (source_col %in% names(df)) df[random_row, source_col] else "" - target_val <- df[random_row, target_col] - - # Determine if source has a value - has_val <- !is.na(source_val) && source_val != "" - - original_text <- if (has_val) source_val else "(empty)" - - # Print the info - cat(sprintf("Column %2d:\n", i)) - cat(sprintf(" Source: %-30s\n", source_col)) - cat(sprintf(" Target: %-30s\n", target_col)) - cat(sprintf(" Original text: '%s'\n", original_text)) - cat(sprintf(" Numeric value: %s\n", ifelse(is.na(target_val), "NA", as.character(target_val)))) - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -# write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 189 to save changes.\n") -cat("\nProcessing complete! 
15 new columns created (not yet saved to file).\n") diff --git a/.history/eohi2/dataP - recode present VARS_20251001110926.r b/.history/eohi2/dataP - recode present VARS_20251001110926.r deleted file mode 100644 index 02a4b9d..0000000 --- a/.history/eohi2/dataP - recode present VARS_20251001110926.r +++ /dev/null @@ -1,192 +0,0 @@ -# Script to recode present-time Likert scale items in eohi2.csv -# Recodes prePrefItem, prePersItem, and preValItem to numeric values - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the mapping function -recode_likert <- function(x) { - case_when( - tolower(x) == "strongly disagree" ~ -3, - tolower(x) == "disagree" ~ -2, - tolower(x) == "somewhat disagree" ~ -1, - tolower(x) == "neither agree nor disagree" ~ 0, - tolower(x) == "somewhat agree" ~ 1, - tolower(x) == "agree" ~ 2, - tolower(x) == "strongly agree" ~ 3, - TRUE ~ NA_real_ - ) -} - -# Define source columns (15 columns total) -source_cols <- c( - "prePrefItem_1", "prePrefItem_2", "prePrefItem_3", "prePrefItem_4", "prePrefItem_5", - "prePersItem_1", "prePersItem_2", "prePersItem_3", "prePersItem_4", "prePersItem_5", - "preValItem_1", "preValItem_2", "preValItem_3", "preValItem_4", "preValItem_5" -) - -# Define target column names (15 columns total) -target_cols <- c( - "present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel", - "present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex", - "present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Print first 30 actual column names for debugging -cat("First 30 actual column names in CSV:\n") -for (i in 1:min(30, length(df_cols))) { - cat(sprintf(" %2d. '%s' (length: %d)\n", i, df_cols[i], nchar(df_cols[i]))) -} -cat("\n") - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 15 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0) { - cat("\n Missing columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_source) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - for (miss_col in missing_source) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 7) { - stop("ERROR: Too many columns missing! 
Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Check that target columns exist in the dataframe -cat("\n=== CHECKING TARGET COLUMNS ===\n") -existing_targets <- target_cols[target_cols %in% df_cols] -missing_targets <- target_cols[!target_cols %in% df_cols] - -cat("Target Columns (should already exist):\n") -cat(" Expected: 15 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n WARNING: The following target columns do NOT exist:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } - stop("\nERROR: Target columns missing! Cannot overwrite.") -} - -cat(" All target columns found. Will overwrite with recoded values.\n\n") - -# Process each column (overwrite existing target columns with recoded values) -for (i in 1:15) { - source_col <- source_cols[i] - target_col <- target_cols[i] - - # Get values from source column, handling missing columns - source_vals <- if (source_col %in% names(df)) df[[source_col]] else rep(NA, nrow(df)) - - # Recode to numeric and overwrite existing target column - df[[target_col]] <- recode_likert(source_vals) - - # Print progress - cat("Processed:", target_col, "\n") -} - -cat("\n=== RECODING COMPLETE ===\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # Check each of the 15 columns - for (i in 1:15) { - source_col <- source_cols[i] - target_col <- target_cols[i] - - # Get values - source_val <- if (source_col %in% names(df)) df[random_row, source_col] else "" - target_val <- df[random_row, target_col] - - # Determine if source has a value - has_val <- !is.na(source_val) && source_val != "" - - original_text <- if (has_val) source_val else "(empty)" - - # Print the info - cat(sprintf("Column %2d:\n", i)) - cat(sprintf(" Source: %-30s\n", source_col)) - cat(sprintf(" Target: %-30s\n", target_col)) - cat(sprintf(" Original text: '%s'\n", original_text)) - cat(sprintf(" Numeric value: %s\n", ifelse(is.na(target_val), "NA", as.character(target_val)))) - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -# write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 189 to save changes.\n") -cat("\nProcessing complete! 
15 new columns created (not yet saved to file).\n") diff --git a/.history/eohi2/dataP - recode present VARS_20251001111008.r b/.history/eohi2/dataP - recode present VARS_20251001111008.r deleted file mode 100644 index 2ee7d61..0000000 --- a/.history/eohi2/dataP - recode present VARS_20251001111008.r +++ /dev/null @@ -1,192 +0,0 @@ -# Script to recode present-time Likert scale items in eohi2.csv -# Recodes prePrefItem, prePersItem, and preValItem to numeric values - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the mapping function -recode_likert <- function(x) { - case_when( - tolower(x) == "strongly disagree" ~ -3, - tolower(x) == "disagree" ~ -2, - tolower(x) == "somewhat disagree" ~ -1, - tolower(x) == "neither agree nor disagree" ~ 0, - tolower(x) == "somewhat agree" ~ 1, - tolower(x) == "agree" ~ 2, - tolower(x) == "strongly agree" ~ 3, - TRUE ~ NA_real_ - ) -} - -# Define source columns (15 columns total) -source_cols <- c( - "prePrefItem_1", "prePrefItem_2", "prePrefItem_3", "prePrefItem_4", "prePrefItem_5", - "prePersItem_1", "prePersItem_2", "prePersItem_3", "prePersItem_4", "prePersItem_5", - "preValItem_1", "preValItem_2", "preValItem_3", "preValItem_4", "preValItem_5" -) - -# Define target column names (15 columns total) -target_cols <- c( - "present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel", - "present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex", - "present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Print first 30 actual column names for debugging -cat("First 30 actual column names in CSV:\n") -for (i in 1:min(30, length(df_cols))) { - cat(sprintf(" %2d. '%s' (length: %d)\n", i, df_cols[i], nchar(df_cols[i]))) -} -cat("\n") - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 15 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0) { - cat("\n Missing columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_source) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - for (miss_col in missing_source) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 7) { - stop("ERROR: Too many columns missing! 
Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Check if target columns exist in the dataframe -cat("\n=== CHECKING TARGET COLUMNS ===\n") -existing_targets <- target_cols[target_cols %in% df_cols] -missing_targets <- target_cols[!target_cols %in% df_cols] - -cat("Target Columns:\n") -cat(" Expected: 15 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Target columns do NOT exist yet - will create them.\n") - if (length(existing_targets) > 0) { - cat(" WARNING: Some target columns already exist and will be overwritten.\n") - } -} else { - cat(" All target columns exist - will overwrite with recoded values.\n") -} -cat("\n") - -# Process each column (overwrite existing target columns with recoded values) -for (i in 1:15) { - source_col <- source_cols[i] - target_col <- target_cols[i] - - # Get values from source column, handling missing columns - source_vals <- if (source_col %in% names(df)) df[[source_col]] else rep(NA, nrow(df)) - - # Recode to numeric and overwrite existing target column - df[[target_col]] <- recode_likert(source_vals) - - # Print progress - cat("Processed:", target_col, "\n") -} - -cat("\n=== RECODING COMPLETE ===\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # Check each of the 15 columns - for (i in 1:15) { - source_col <- source_cols[i] - target_col <- target_cols[i] - - # Get values - source_val <- if (source_col %in% names(df)) df[random_row, source_col] else "" - target_val <- df[random_row, target_col] - - # Determine if source has a value - has_val <- !is.na(source_val) && source_val != "" - - original_text <- if (has_val) source_val else "(empty)" - - # Print the info - cat(sprintf("Column %2d:\n", i)) - cat(sprintf(" Source: %-30s\n", source_col)) - cat(sprintf(" Target: %-30s\n", target_col)) - cat(sprintf(" Original text: '%s'\n", original_text)) - cat(sprintf(" Numeric value: %s\n", ifelse(is.na(target_val), "NA", as.character(target_val)))) - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -# write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 189 to save changes.\n") -cat("\nProcessing complete! 
15 new columns created (not yet saved to file).\n") diff --git a/.history/eohi2/dataP - recode present VARS_20251001112101.r b/.history/eohi2/dataP - recode present VARS_20251001112101.r deleted file mode 100644 index 6adcf6d..0000000 --- a/.history/eohi2/dataP - recode present VARS_20251001112101.r +++ /dev/null @@ -1,192 +0,0 @@ -# Script to recode present-time Likert scale items in eohi2.csv -# Recodes prePrefItem, prePersItem, and preValItem to numeric values - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the mapping function -recode_likert <- function(x) { - case_when( - tolower(x) == "strongly disagree" ~ -3, - tolower(x) == "disagree" ~ -2, - tolower(x) == "somewhat disagree" ~ -1, - tolower(x) == "neither agree nor disagree" ~ 0, - tolower(x) == "somewhat agree" ~ 1, - tolower(x) == "agree" ~ 2, - tolower(x) == "strongly agree" ~ 3, - TRUE ~ NA_real_ - ) -} - -# Define source columns (15 columns total) -source_cols <- c( - "prePrefItem_1", "prePrefItem_2", "prePrefItem_3", "prePrefItem_4", "prePrefItem_5", - "prePersItem_1", "prePersItem_2", "prePersItem_3", "prePersItem_4", "prePersItem_5", - "preValItem_1", "preValItem_2", "preValItem_3", "preValItem_4", "preValItem_5" -) - -# Define target column names (15 columns total) -target_cols <- c( - "present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel", - "present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex", - "present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Print first 30 actual column names for debugging -cat("First 30 actual column names in CSV:\n") -for (i in 1:min(30, length(df_cols))) { - cat(sprintf(" %2d. '%s' (length: %d)\n", i, df_cols[i], nchar(df_cols[i]))) -} -cat("\n") - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 15 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0) { - cat("\n Missing columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_source) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - for (miss_col in missing_source) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 7) { - stop("ERROR: Too many columns missing! 
Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Check if target columns exist in the dataframe -cat("\n=== CHECKING TARGET COLUMNS ===\n") -existing_targets <- target_cols[target_cols %in% df_cols] -missing_targets <- target_cols[!target_cols %in% df_cols] - -cat("Target Columns:\n") -cat(" Expected: 15 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Target columns do NOT exist yet - will create them.\n") - if (length(existing_targets) > 0) { - cat(" WARNING: Some target columns already exist and will be overwritten.\n") - } -} else { - cat(" All target columns exist - will overwrite with recoded values.\n") -} -cat("\n") - -# Process each column (overwrite existing target columns with recoded values) -for (i in 1:15) { - source_col <- source_cols[i] - target_col <- target_cols[i] - - # Get values from source column, handling missing columns - source_vals <- if (source_col %in% names(df)) df[[source_col]] else rep(NA, nrow(df)) - - # Recode to numeric and overwrite existing target column - df[[target_col]] <- recode_likert(source_vals) - - # Print progress - cat("Processed:", target_col, "\n") -} - -cat("\n=== RECODING COMPLETE ===\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # Check each of the 15 columns - for (i in 1:15) { - source_col <- source_cols[i] - target_col <- target_cols[i] - - # Get values - source_val <- if (source_col %in% names(df)) df[random_row, source_col] else "" - target_val <- df[random_row, target_col] - - # Determine if source has a value - has_val <- !is.na(source_val) && source_val != "" - - original_text <- if (has_val) source_val else "(empty)" - - # Print the info - cat(sprintf("Column %2d:\n", i)) - cat(sprintf(" Source: %-30s\n", source_col)) - cat(sprintf(" Target: %-30s\n", target_col)) - cat(sprintf(" Original text: '%s'\n", original_text)) - cat(sprintf(" Numeric value: %s\n", ifelse(is.na(target_val), "NA", as.character(target_val)))) - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 189 to save changes.\n") -cat("\nProcessing complete! 
15 new columns created (not yet saved to file).\n") diff --git a/.history/eohi2/dataP - recode scales VARS_20251001113407.r b/.history/eohi2/dataP - recode scales VARS_20251001113407.r deleted file mode 100644 index e69de29..0000000 diff --git a/.history/eohi2/dataP - recode scales VARS_20251001113436.r b/.history/eohi2/dataP - recode scales VARS_20251001113436.r deleted file mode 100644 index 979b420..0000000 --- a/.history/eohi2/dataP - recode scales VARS_20251001113436.r +++ /dev/null @@ -1,4 +0,0 @@ -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") \ No newline at end of file diff --git a/.history/eohi2/dataP - recode scales VARS_20251001114903.r b/.history/eohi2/dataP - recode scales VARS_20251001114903.r deleted file mode 100644 index d2611e3..0000000 --- a/.history/eohi2/dataP - recode scales VARS_20251001114903.r +++ /dev/null @@ -1,281 +0,0 @@ -# Script to compute AOT and CRT scales in eohi2.csv -# AOT: Reverse codes items 4-7, then averages all 8 items -# CRT: Calculates proportion of correct and intuitive responses - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define source columns -aot_cols <- c("aot_1", "aot_2", "aot_3", "aot_4", "aot_5", "aot_6", "aot_7", "aot_8") -crt_cols <- c("crt_1", "crt_2", "crt_3") - -# Define target columns -target_cols <- c("aot_total", "crt_correct", "crt_int") - -# Define correct and intuitive CRT answers -crt_correct_answers <- c("5 cents", "5 minutes", "47 days") -crt_intuitive_answers <- c("10 cents", "100 minutes", "24 days") - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check AOT columns -missing_aot <- aot_cols[!aot_cols %in% df_cols] -existing_aot <- aot_cols[aot_cols %in% df_cols] - -cat("AOT Source Columns:\n") -cat(" Expected: 8 columns\n") -cat(" Found:", length(existing_aot), "columns\n") -cat(" Missing:", length(missing_aot), "columns\n") - -if (length(missing_aot) > 0) { - cat("\n Missing AOT columns:\n") - for (col in missing_aot) { - cat(" -", col, "\n") - } -} - -# Check CRT columns -missing_crt <- crt_cols[!crt_cols %in% df_cols] -existing_crt <- crt_cols[crt_cols %in% df_cols] - -cat("\nCRT Source Columns:\n") -cat(" Expected: 3 columns\n") -cat(" Found:", length(existing_crt), "columns\n") -cat(" Missing:", length(missing_crt), "columns\n") - -if (length(missing_crt) > 0) { - cat("\n Missing CRT columns:\n") - for (col in missing_crt) { - cat(" -", col, "\n") - } -} - -# Check target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 3 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Missing target columns:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_aot) > 4 || length(missing_crt) > 1 || length(missing_targets) > 1) { - stop("ERROR: Too many columns missing! 
Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= PROCESS AOT SCALE ============= -cat("Processing AOT scale...\n") - -# Convert AOT columns to numeric (handling any non-numeric values) -for (col in aot_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Reverse code items 4, 5, 6, 7 (multiply by -1) -reverse_items <- c("aot_4", "aot_5", "aot_6", "aot_7") -for (col in reverse_items) { - if (col %in% names(df)) { - df[[col]] <- df[[col]] * -1 - } -} - -# Calculate average of all 8 AOT items -df$aot_total <- rowMeans(df[, aot_cols[aot_cols %in% names(df)]], na.rm = TRUE) - -cat(" AOT total scores calculated (items 4-7 reverse coded).\n\n") - -# ============= PROCESS CRT SCALES ============= -cat("Processing CRT scales...\n") - -# Initialize CRT columns -df$crt_correct <- NA -df$crt_int <- NA - -# Process each row -for (i in 1:nrow(df)) { - # CRT Correct - crt_correct_count <- 0 - crt_correct_n <- 0 - - for (j in 1:3) { - col <- crt_cols[j] - if (col %in% names(df)) { - response <- trimws(tolower(as.character(df[i, col]))) - correct_answer <- tolower(crt_correct_answers[j]) - - if (!is.na(response) && response != "") { - crt_correct_n <- crt_correct_n + 1 - if (response == correct_answer) { - crt_correct_count <- crt_correct_count + 1 - } - } - } - } - - # Calculate proportion correct - if (crt_correct_n > 0) { - df$crt_correct[i] <- crt_correct_count / crt_correct_n - } - - # CRT Intuitive - crt_int_count <- 0 - crt_int_n <- 0 - - for (j in 1:3) { - col <- crt_cols[j] - if (col %in% names(df)) { - response <- trimws(tolower(as.character(df[i, col]))) - intuitive_answer <- tolower(crt_intuitive_answers[j]) - - if (!is.na(response) && response != "") { - crt_int_n <- crt_int_n + 1 - if (response == intuitive_answer) { - crt_int_count <- crt_int_count + 1 - } - } - } - } - - # Calculate proportion intuitive - if (crt_int_n > 0) { - df$crt_int[i] <- crt_int_count / crt_int_n - } -} - -cat(" CRT correct and intuitive scores calculated.\n\n") - -cat("=== PROCESSING COMPLETE ===\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # AOT Check - cat("--- AOT SCALE ---\n") - cat("Source values:\n") - aot_values <- numeric(8) - for (i in 1:8) { - col <- aot_cols[i] - val <- if (col %in% names(df)) df[random_row, col] else NA - aot_values[i] <- val - reverse_note <- if (i %in% 4:7) " (REVERSED)" else "" - cat(sprintf(" %s: %s%s\n", col, ifelse(is.na(val), "NA", as.character(val)), reverse_note)) - } - - # Manual calculation check - valid_aot <- aot_values[!is.na(aot_values)] - if (length(valid_aot) > 0) { - expected_mean <- mean(valid_aot) - actual_value <- df$aot_total[random_row] - cat(sprintf("\nCalculation check:\n")) - cat(sprintf(" Average of %d valid items: %.5f\n", length(valid_aot), expected_mean)) - cat(sprintf(" Target value (aot_total): %.5f\n", actual_value)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_mean - actual_value) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat("\n No valid AOT values to calculate.\n") - } - - # CRT Check - cat("\n--- CRT SCALE ---\n") - cat("Source values:\n") - crt_correct_count <- 0 - crt_int_count <- 0 - crt_n <- 
0 - - for (i in 1:3) { - col <- crt_cols[i] - val <- if (col %in% names(df)) as.character(df[random_row, col]) else "" - val_trimmed <- trimws(tolower(val)) - - correct_ans <- crt_correct_answers[i] - intuitive_ans <- crt_intuitive_answers[i] - - is_correct <- val_trimmed == tolower(correct_ans) - is_intuitive <- val_trimmed == tolower(intuitive_ans) - - if (val_trimmed != "" && !is.na(val_trimmed)) { - crt_n <- crt_n + 1 - if (is_correct) crt_correct_count <- crt_correct_count + 1 - if (is_intuitive) crt_int_count <- crt_int_count + 1 - } - - cat(sprintf(" %s: '%s'\n", col, val)) - cat(sprintf(" Correct answer: '%s' -> %s\n", correct_ans, ifelse(is_correct, "CORRECT ✓", "Not correct"))) - cat(sprintf(" Intuitive answer: '%s' -> %s\n", intuitive_ans, ifelse(is_intuitive, "INTUITIVE ✓", "Not intuitive"))) - } - - cat("\nCalculation check:\n") - if (crt_n > 0) { - expected_correct <- crt_correct_count / crt_n - expected_int <- crt_int_count / crt_n - actual_correct <- df$crt_correct[random_row] - actual_int <- df$crt_int[random_row] - - cat(sprintf(" Correct: %d out of %d = %.5f\n", crt_correct_count, crt_n, expected_correct)) - cat(sprintf(" Target value (crt_correct): %.5f\n", actual_correct)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_correct - actual_correct) < 0.0001, "YES ✓", "NO ✗"))) - - cat(sprintf("\n Intuitive: %d out of %d = %.5f\n", crt_int_count, crt_n, expected_int)) - cat(sprintf(" Target value (crt_int): %.5f\n", actual_int)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_int - actual_int) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" No valid CRT responses to calculate.\n") - } - - cat("\n========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -# write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 253 to save changes.\n") -cat("\nProcessing complete! 
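One caveat with the reversal in the snapshot above: multiplying aot_4 through aot_7 by -1 in place mutates the stored item values, so re-running the block flips them back again. The later snapshots in this patch switch to a copy-based calculation via apply(); an equivalent vectorized sketch, assuming all eight aot_cols are present and numeric:

    # Reverse items 4-7 on a matrix copy so the item columns stay untouched.
    aot_mat <- as.matrix(df[, aot_cols])
    aot_mat[, 4:7] <- -aot_mat[, 4:7]
    df$aot_total <- rowMeans(aot_mat, na.rm = TRUE)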
AOT and CRT scales calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP - recode scales VARS_20251001114914.r b/.history/eohi2/dataP - recode scales VARS_20251001114914.r deleted file mode 100644 index d2611e3..0000000 --- a/.history/eohi2/dataP - recode scales VARS_20251001114914.r +++ /dev/null @@ -1,281 +0,0 @@ -# Script to compute AOT and CRT scales in eohi2.csv -# AOT: Reverse codes items 4-7, then averages all 8 items -# CRT: Calculates proportion of correct and intuitive responses - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define source columns -aot_cols <- c("aot_1", "aot_2", "aot_3", "aot_4", "aot_5", "aot_6", "aot_7", "aot_8") -crt_cols <- c("crt_1", "crt_2", "crt_3") - -# Define target columns -target_cols <- c("aot_total", "crt_correct", "crt_int") - -# Define correct and intuitive CRT answers -crt_correct_answers <- c("5 cents", "5 minutes", "47 days") -crt_intuitive_answers <- c("10 cents", "100 minutes", "24 days") - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check AOT columns -missing_aot <- aot_cols[!aot_cols %in% df_cols] -existing_aot <- aot_cols[aot_cols %in% df_cols] - -cat("AOT Source Columns:\n") -cat(" Expected: 8 columns\n") -cat(" Found:", length(existing_aot), "columns\n") -cat(" Missing:", length(missing_aot), "columns\n") - -if (length(missing_aot) > 0) { - cat("\n Missing AOT columns:\n") - for (col in missing_aot) { - cat(" -", col, "\n") - } -} - -# Check CRT columns -missing_crt <- crt_cols[!crt_cols %in% df_cols] -existing_crt <- crt_cols[crt_cols %in% df_cols] - -cat("\nCRT Source Columns:\n") -cat(" Expected: 3 columns\n") -cat(" Found:", length(existing_crt), "columns\n") -cat(" Missing:", length(missing_crt), "columns\n") - -if (length(missing_crt) > 0) { - cat("\n Missing CRT columns:\n") - for (col in missing_crt) { - cat(" -", col, "\n") - } -} - -# Check target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 3 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Missing target columns:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_aot) > 4 || length(missing_crt) > 1 || length(missing_targets) > 1) { - stop("ERROR: Too many columns missing! 
Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= PROCESS AOT SCALE ============= -cat("Processing AOT scale...\n") - -# Convert AOT columns to numeric (handling any non-numeric values) -for (col in aot_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Reverse code items 4, 5, 6, 7 (multiply by -1) -reverse_items <- c("aot_4", "aot_5", "aot_6", "aot_7") -for (col in reverse_items) { - if (col %in% names(df)) { - df[[col]] <- df[[col]] * -1 - } -} - -# Calculate average of all 8 AOT items -df$aot_total <- rowMeans(df[, aot_cols[aot_cols %in% names(df)]], na.rm = TRUE) - -cat(" AOT total scores calculated (items 4-7 reverse coded).\n\n") - -# ============= PROCESS CRT SCALES ============= -cat("Processing CRT scales...\n") - -# Initialize CRT columns -df$crt_correct <- NA -df$crt_int <- NA - -# Process each row -for (i in 1:nrow(df)) { - # CRT Correct - crt_correct_count <- 0 - crt_correct_n <- 0 - - for (j in 1:3) { - col <- crt_cols[j] - if (col %in% names(df)) { - response <- trimws(tolower(as.character(df[i, col]))) - correct_answer <- tolower(crt_correct_answers[j]) - - if (!is.na(response) && response != "") { - crt_correct_n <- crt_correct_n + 1 - if (response == correct_answer) { - crt_correct_count <- crt_correct_count + 1 - } - } - } - } - - # Calculate proportion correct - if (crt_correct_n > 0) { - df$crt_correct[i] <- crt_correct_count / crt_correct_n - } - - # CRT Intuitive - crt_int_count <- 0 - crt_int_n <- 0 - - for (j in 1:3) { - col <- crt_cols[j] - if (col %in% names(df)) { - response <- trimws(tolower(as.character(df[i, col]))) - intuitive_answer <- tolower(crt_intuitive_answers[j]) - - if (!is.na(response) && response != "") { - crt_int_n <- crt_int_n + 1 - if (response == intuitive_answer) { - crt_int_count <- crt_int_count + 1 - } - } - } - } - - # Calculate proportion intuitive - if (crt_int_n > 0) { - df$crt_int[i] <- crt_int_count / crt_int_n - } -} - -cat(" CRT correct and intuitive scores calculated.\n\n") - -cat("=== PROCESSING COMPLETE ===\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # AOT Check - cat("--- AOT SCALE ---\n") - cat("Source values:\n") - aot_values <- numeric(8) - for (i in 1:8) { - col <- aot_cols[i] - val <- if (col %in% names(df)) df[random_row, col] else NA - aot_values[i] <- val - reverse_note <- if (i %in% 4:7) " (REVERSED)" else "" - cat(sprintf(" %s: %s%s\n", col, ifelse(is.na(val), "NA", as.character(val)), reverse_note)) - } - - # Manual calculation check - valid_aot <- aot_values[!is.na(aot_values)] - if (length(valid_aot) > 0) { - expected_mean <- mean(valid_aot) - actual_value <- df$aot_total[random_row] - cat(sprintf("\nCalculation check:\n")) - cat(sprintf(" Average of %d valid items: %.5f\n", length(valid_aot), expected_mean)) - cat(sprintf(" Target value (aot_total): %.5f\n", actual_value)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_mean - actual_value) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat("\n No valid AOT values to calculate.\n") - } - - # CRT Check - cat("\n--- CRT SCALE ---\n") - cat("Source values:\n") - crt_correct_count <- 0 - crt_int_count <- 0 - crt_n <- 
0 - - for (i in 1:3) { - col <- crt_cols[i] - val <- if (col %in% names(df)) as.character(df[random_row, col]) else "" - val_trimmed <- trimws(tolower(val)) - - correct_ans <- crt_correct_answers[i] - intuitive_ans <- crt_intuitive_answers[i] - - is_correct <- val_trimmed == tolower(correct_ans) - is_intuitive <- val_trimmed == tolower(intuitive_ans) - - if (val_trimmed != "" && !is.na(val_trimmed)) { - crt_n <- crt_n + 1 - if (is_correct) crt_correct_count <- crt_correct_count + 1 - if (is_intuitive) crt_int_count <- crt_int_count + 1 - } - - cat(sprintf(" %s: '%s'\n", col, val)) - cat(sprintf(" Correct answer: '%s' -> %s\n", correct_ans, ifelse(is_correct, "CORRECT ✓", "Not correct"))) - cat(sprintf(" Intuitive answer: '%s' -> %s\n", intuitive_ans, ifelse(is_intuitive, "INTUITIVE ✓", "Not intuitive"))) - } - - cat("\nCalculation check:\n") - if (crt_n > 0) { - expected_correct <- crt_correct_count / crt_n - expected_int <- crt_int_count / crt_n - actual_correct <- df$crt_correct[random_row] - actual_int <- df$crt_int[random_row] - - cat(sprintf(" Correct: %d out of %d = %.5f\n", crt_correct_count, crt_n, expected_correct)) - cat(sprintf(" Target value (crt_correct): %.5f\n", actual_correct)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_correct - actual_correct) < 0.0001, "YES ✓", "NO ✗"))) - - cat(sprintf("\n Intuitive: %d out of %d = %.5f\n", crt_int_count, crt_n, expected_int)) - cat(sprintf(" Target value (crt_int): %.5f\n", actual_int)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_int - actual_int) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" No valid CRT responses to calculate.\n") - } - - cat("\n========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -# write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 253 to save changes.\n") -cat("\nProcessing complete! 
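The per-row double loop in these snapshots is correct but costs O(rows x items) in interpreted R; since the answer keys are fixed, the same proportions can be computed column-wise. A vectorized sketch using the objects already defined in the script (assumes more than one data row, so the comparisons simplify to matrices):

    # Normalize responses once, then compare each column to its answer key.
    resp     <- lapply(df[crt_cols], function(x) trimws(tolower(as.character(x))))
    answered <- sapply(resp, function(r) !is.na(r) & r != "")                 # n x 3
    correct  <- mapply(`==`, resp, tolower(crt_correct_answers))   & answered
    intuit   <- mapply(`==`, resp, tolower(crt_intuitive_answers)) & answered
    n_ans <- rowSums(answered)
    df$crt_correct <- ifelse(n_ans > 0, rowSums(correct, na.rm = TRUE) / n_ans, NA)
    df$crt_int     <- ifelse(n_ans > 0, rowSums(intuit,  na.rm = TRUE) / n_ans, NA)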
AOT and CRT scales calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP - recode scales VARS_20251001115233.r b/.history/eohi2/dataP - recode scales VARS_20251001115233.r deleted file mode 100644 index d2611e3..0000000 --- a/.history/eohi2/dataP - recode scales VARS_20251001115233.r +++ /dev/null @@ -1,281 +0,0 @@ -# Script to compute AOT and CRT scales in eohi2.csv -# AOT: Reverse codes items 4-7, then averages all 8 items -# CRT: Calculates proportion of correct and intuitive responses - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define source columns -aot_cols <- c("aot_1", "aot_2", "aot_3", "aot_4", "aot_5", "aot_6", "aot_7", "aot_8") -crt_cols <- c("crt_1", "crt_2", "crt_3") - -# Define target columns -target_cols <- c("aot_total", "crt_correct", "crt_int") - -# Define correct and intuitive CRT answers -crt_correct_answers <- c("5 cents", "5 minutes", "47 days") -crt_intuitive_answers <- c("10 cents", "100 minutes", "24 days") - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check AOT columns -missing_aot <- aot_cols[!aot_cols %in% df_cols] -existing_aot <- aot_cols[aot_cols %in% df_cols] - -cat("AOT Source Columns:\n") -cat(" Expected: 8 columns\n") -cat(" Found:", length(existing_aot), "columns\n") -cat(" Missing:", length(missing_aot), "columns\n") - -if (length(missing_aot) > 0) { - cat("\n Missing AOT columns:\n") - for (col in missing_aot) { - cat(" -", col, "\n") - } -} - -# Check CRT columns -missing_crt <- crt_cols[!crt_cols %in% df_cols] -existing_crt <- crt_cols[crt_cols %in% df_cols] - -cat("\nCRT Source Columns:\n") -cat(" Expected: 3 columns\n") -cat(" Found:", length(existing_crt), "columns\n") -cat(" Missing:", length(missing_crt), "columns\n") - -if (length(missing_crt) > 0) { - cat("\n Missing CRT columns:\n") - for (col in missing_crt) { - cat(" -", col, "\n") - } -} - -# Check target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 3 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Missing target columns:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_aot) > 4 || length(missing_crt) > 1 || length(missing_targets) > 1) { - stop("ERROR: Too many columns missing! 
Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= PROCESS AOT SCALE ============= -cat("Processing AOT scale...\n") - -# Convert AOT columns to numeric (handling any non-numeric values) -for (col in aot_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Reverse code items 4, 5, 6, 7 (multiply by -1) -reverse_items <- c("aot_4", "aot_5", "aot_6", "aot_7") -for (col in reverse_items) { - if (col %in% names(df)) { - df[[col]] <- df[[col]] * -1 - } -} - -# Calculate average of all 8 AOT items -df$aot_total <- rowMeans(df[, aot_cols[aot_cols %in% names(df)]], na.rm = TRUE) - -cat(" AOT total scores calculated (items 4-7 reverse coded).\n\n") - -# ============= PROCESS CRT SCALES ============= -cat("Processing CRT scales...\n") - -# Initialize CRT columns -df$crt_correct <- NA -df$crt_int <- NA - -# Process each row -for (i in 1:nrow(df)) { - # CRT Correct - crt_correct_count <- 0 - crt_correct_n <- 0 - - for (j in 1:3) { - col <- crt_cols[j] - if (col %in% names(df)) { - response <- trimws(tolower(as.character(df[i, col]))) - correct_answer <- tolower(crt_correct_answers[j]) - - if (!is.na(response) && response != "") { - crt_correct_n <- crt_correct_n + 1 - if (response == correct_answer) { - crt_correct_count <- crt_correct_count + 1 - } - } - } - } - - # Calculate proportion correct - if (crt_correct_n > 0) { - df$crt_correct[i] <- crt_correct_count / crt_correct_n - } - - # CRT Intuitive - crt_int_count <- 0 - crt_int_n <- 0 - - for (j in 1:3) { - col <- crt_cols[j] - if (col %in% names(df)) { - response <- trimws(tolower(as.character(df[i, col]))) - intuitive_answer <- tolower(crt_intuitive_answers[j]) - - if (!is.na(response) && response != "") { - crt_int_n <- crt_int_n + 1 - if (response == intuitive_answer) { - crt_int_count <- crt_int_count + 1 - } - } - } - } - - # Calculate proportion intuitive - if (crt_int_n > 0) { - df$crt_int[i] <- crt_int_count / crt_int_n - } -} - -cat(" CRT correct and intuitive scores calculated.\n\n") - -cat("=== PROCESSING COMPLETE ===\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # AOT Check - cat("--- AOT SCALE ---\n") - cat("Source values:\n") - aot_values <- numeric(8) - for (i in 1:8) { - col <- aot_cols[i] - val <- if (col %in% names(df)) df[random_row, col] else NA - aot_values[i] <- val - reverse_note <- if (i %in% 4:7) " (REVERSED)" else "" - cat(sprintf(" %s: %s%s\n", col, ifelse(is.na(val), "NA", as.character(val)), reverse_note)) - } - - # Manual calculation check - valid_aot <- aot_values[!is.na(aot_values)] - if (length(valid_aot) > 0) { - expected_mean <- mean(valid_aot) - actual_value <- df$aot_total[random_row] - cat(sprintf("\nCalculation check:\n")) - cat(sprintf(" Average of %d valid items: %.5f\n", length(valid_aot), expected_mean)) - cat(sprintf(" Target value (aot_total): %.5f\n", actual_value)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_mean - actual_value) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat("\n No valid AOT values to calculate.\n") - } - - # CRT Check - cat("\n--- CRT SCALE ---\n") - cat("Source values:\n") - crt_correct_count <- 0 - crt_int_count <- 0 - crt_n <- 
0 - - for (i in 1:3) { - col <- crt_cols[i] - val <- if (col %in% names(df)) as.character(df[random_row, col]) else "" - val_trimmed <- trimws(tolower(val)) - - correct_ans <- crt_correct_answers[i] - intuitive_ans <- crt_intuitive_answers[i] - - is_correct <- val_trimmed == tolower(correct_ans) - is_intuitive <- val_trimmed == tolower(intuitive_ans) - - if (val_trimmed != "" && !is.na(val_trimmed)) { - crt_n <- crt_n + 1 - if (is_correct) crt_correct_count <- crt_correct_count + 1 - if (is_intuitive) crt_int_count <- crt_int_count + 1 - } - - cat(sprintf(" %s: '%s'\n", col, val)) - cat(sprintf(" Correct answer: '%s' -> %s\n", correct_ans, ifelse(is_correct, "CORRECT ✓", "Not correct"))) - cat(sprintf(" Intuitive answer: '%s' -> %s\n", intuitive_ans, ifelse(is_intuitive, "INTUITIVE ✓", "Not intuitive"))) - } - - cat("\nCalculation check:\n") - if (crt_n > 0) { - expected_correct <- crt_correct_count / crt_n - expected_int <- crt_int_count / crt_n - actual_correct <- df$crt_correct[random_row] - actual_int <- df$crt_int[random_row] - - cat(sprintf(" Correct: %d out of %d = %.5f\n", crt_correct_count, crt_n, expected_correct)) - cat(sprintf(" Target value (crt_correct): %.5f\n", actual_correct)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_correct - actual_correct) < 0.0001, "YES ✓", "NO ✗"))) - - cat(sprintf("\n Intuitive: %d out of %d = %.5f\n", crt_int_count, crt_n, expected_int)) - cat(sprintf(" Target value (crt_int): %.5f\n", actual_int)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_int - actual_int) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" No valid CRT responses to calculate.\n") - } - - cat("\n========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -# write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 253 to save changes.\n") -cat("\nProcessing complete! 
AOT and CRT scales calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP - recode scales VARS_20251001120138.r b/.history/eohi2/dataP - recode scales VARS_20251001120138.r deleted file mode 100644 index b2bc32f..0000000 --- a/.history/eohi2/dataP - recode scales VARS_20251001120138.r +++ /dev/null @@ -1,285 +0,0 @@ -# Script to compute AOT and CRT scales in eohi2.csv -# AOT: Reverse codes items 4-7, then averages all 8 items -# CRT: Calculates proportion of correct and intuitive responses - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define source columns -aot_cols <- c("aot_1", "aot_2", "aot_3", "aot_4", "aot_5", "aot_6", "aot_7", "aot_8") -crt_cols <- c("crt_1", "crt_2", "crt_3") - -# Define target columns -target_cols <- c("aot_total", "crt_correct", "crt_int") - -# Define correct and intuitive CRT answers -crt_correct_answers <- c("5 cents", "5 minutes", "47 days") -crt_intuitive_answers <- c("10 cents", "100 minutes", "24 days") - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check AOT columns -missing_aot <- aot_cols[!aot_cols %in% df_cols] -existing_aot <- aot_cols[aot_cols %in% df_cols] - -cat("AOT Source Columns:\n") -cat(" Expected: 8 columns\n") -cat(" Found:", length(existing_aot), "columns\n") -cat(" Missing:", length(missing_aot), "columns\n") - -if (length(missing_aot) > 0) { - cat("\n Missing AOT columns:\n") - for (col in missing_aot) { - cat(" -", col, "\n") - } -} - -# Check CRT columns -missing_crt <- crt_cols[!crt_cols %in% df_cols] -existing_crt <- crt_cols[crt_cols %in% df_cols] - -cat("\nCRT Source Columns:\n") -cat(" Expected: 3 columns\n") -cat(" Found:", length(existing_crt), "columns\n") -cat(" Missing:", length(missing_crt), "columns\n") - -if (length(missing_crt) > 0) { - cat("\n Missing CRT columns:\n") - for (col in missing_crt) { - cat(" -", col, "\n") - } -} - -# Check target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 3 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Missing target columns:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_aot) > 4 || length(missing_crt) > 1 || length(missing_targets) > 1) { - stop("ERROR: Too many columns missing! 
Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= PROCESS AOT SCALE ============= -cat("Processing AOT scale...\n") - -# Convert AOT columns to numeric (handling any non-numeric values) -for (col in aot_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Calculate average with reverse coding (WITHOUT modifying original values) -# Items 4, 5, 6, 7 are reverse coded for calculation only -df$aot_total <- apply(df[, aot_cols[aot_cols %in% names(df)], drop = FALSE], 1, function(row) { - # Create a copy for calculation - values <- as.numeric(row) - - # Reverse items 4, 5, 6, 7 (positions in aot_cols vector) - reverse_positions <- c(4, 5, 6, 7) - values[reverse_positions] <- values[reverse_positions] * -1 - - # Return mean (na.rm = TRUE handles missing values) - mean(values, na.rm = TRUE) -}) - -cat(" AOT total scores calculated (items 4-7 reverse coded for calculation only).\n") -cat(" Original AOT item values preserved in dataframe.\n\n") - -# ============= PROCESS CRT SCALES ============= -cat("Processing CRT scales...\n") - -# Initialize CRT columns -df$crt_correct <- NA -df$crt_int <- NA - -# Process each row -for (i in 1:nrow(df)) { - # CRT Correct - crt_correct_count <- 0 - crt_correct_n <- 0 - - for (j in 1:3) { - col <- crt_cols[j] - if (col %in% names(df)) { - response <- trimws(tolower(as.character(df[i, col]))) - correct_answer <- tolower(crt_correct_answers[j]) - - if (!is.na(response) && response != "") { - crt_correct_n <- crt_correct_n + 1 - if (response == correct_answer) { - crt_correct_count <- crt_correct_count + 1 - } - } - } - } - - # Calculate proportion correct - if (crt_correct_n > 0) { - df$crt_correct[i] <- crt_correct_count / crt_correct_n - } - - # CRT Intuitive - crt_int_count <- 0 - crt_int_n <- 0 - - for (j in 1:3) { - col <- crt_cols[j] - if (col %in% names(df)) { - response <- trimws(tolower(as.character(df[i, col]))) - intuitive_answer <- tolower(crt_intuitive_answers[j]) - - if (!is.na(response) && response != "") { - crt_int_n <- crt_int_n + 1 - if (response == intuitive_answer) { - crt_int_count <- crt_int_count + 1 - } - } - } - } - - # Calculate proportion intuitive - if (crt_int_n > 0) { - df$crt_int[i] <- crt_int_count / crt_int_n - } -} - -cat(" CRT correct and intuitive scores calculated.\n\n") - -cat("=== PROCESSING COMPLETE ===\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # AOT Check - cat("--- AOT SCALE ---\n") - cat("Source values:\n") - aot_values <- numeric(8) - for (i in 1:8) { - col <- aot_cols[i] - val <- if (col %in% names(df)) df[random_row, col] else NA - aot_values[i] <- val - reverse_note <- if (i %in% 4:7) " (REVERSED)" else "" - cat(sprintf(" %s: %s%s\n", col, ifelse(is.na(val), "NA", as.character(val)), reverse_note)) - } - - # Manual calculation check - valid_aot <- aot_values[!is.na(aot_values)] - if (length(valid_aot) > 0) { - expected_mean <- mean(valid_aot) - actual_value <- df$aot_total[random_row] - cat(sprintf("\nCalculation check:\n")) - cat(sprintf(" Average of %d valid items: %.5f\n", length(valid_aot), expected_mean)) - cat(sprintf(" Target value (aot_total): %.5f\n", 
actual_value)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_mean - actual_value) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat("\n No valid AOT values to calculate.\n") - } - - # CRT Check - cat("\n--- CRT SCALE ---\n") - cat("Source values:\n") - crt_correct_count <- 0 - crt_int_count <- 0 - crt_n <- 0 - - for (i in 1:3) { - col <- crt_cols[i] - val <- if (col %in% names(df)) as.character(df[random_row, col]) else "" - val_trimmed <- trimws(tolower(val)) - - correct_ans <- crt_correct_answers[i] - intuitive_ans <- crt_intuitive_answers[i] - - is_correct <- val_trimmed == tolower(correct_ans) - is_intuitive <- val_trimmed == tolower(intuitive_ans) - - if (val_trimmed != "" && !is.na(val_trimmed)) { - crt_n <- crt_n + 1 - if (is_correct) crt_correct_count <- crt_correct_count + 1 - if (is_intuitive) crt_int_count <- crt_int_count + 1 - } - - cat(sprintf(" %s: '%s'\n", col, val)) - cat(sprintf(" Correct answer: '%s' -> %s\n", correct_ans, ifelse(is_correct, "CORRECT ✓", "Not correct"))) - cat(sprintf(" Intuitive answer: '%s' -> %s\n", intuitive_ans, ifelse(is_intuitive, "INTUITIVE ✓", "Not intuitive"))) - } - - cat("\nCalculation check:\n") - if (crt_n > 0) { - expected_correct <- crt_correct_count / crt_n - expected_int <- crt_int_count / crt_n - actual_correct <- df$crt_correct[random_row] - actual_int <- df$crt_int[random_row] - - cat(sprintf(" Correct: %d out of %d = %.5f\n", crt_correct_count, crt_n, expected_correct)) - cat(sprintf(" Target value (crt_correct): %.5f\n", actual_correct)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_correct - actual_correct) < 0.0001, "YES ✓", "NO ✗"))) - - cat(sprintf("\n Intuitive: %d out of %d = %.5f\n", crt_int_count, crt_n, expected_int)) - cat(sprintf(" Target value (crt_int): %.5f\n", actual_int)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_int - actual_int) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" No valid CRT responses to calculate.\n") - } - - cat("\n========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 253 to save changes.\n") -cat("\nProcessing complete! 
AOT and CRT scales calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP - recode scales VARS_20251001120154.r b/.history/eohi2/dataP - recode scales VARS_20251001120154.r deleted file mode 100644 index 2861737..0000000 --- a/.history/eohi2/dataP - recode scales VARS_20251001120154.r +++ /dev/null @@ -1,298 +0,0 @@ -# Script to compute AOT and CRT scales in eohi2.csv -# AOT: Reverse codes items 4-7, then averages all 8 items -# CRT: Calculates proportion of correct and intuitive responses - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define source columns -aot_cols <- c("aot_1", "aot_2", "aot_3", "aot_4", "aot_5", "aot_6", "aot_7", "aot_8") -crt_cols <- c("crt_1", "crt_2", "crt_3") - -# Define target columns -target_cols <- c("aot_total", "crt_correct", "crt_int") - -# Define correct and intuitive CRT answers -crt_correct_answers <- c("5 cents", "5 minutes", "47 days") -crt_intuitive_answers <- c("10 cents", "100 minutes", "24 days") - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check AOT columns -missing_aot <- aot_cols[!aot_cols %in% df_cols] -existing_aot <- aot_cols[aot_cols %in% df_cols] - -cat("AOT Source Columns:\n") -cat(" Expected: 8 columns\n") -cat(" Found:", length(existing_aot), "columns\n") -cat(" Missing:", length(missing_aot), "columns\n") - -if (length(missing_aot) > 0) { - cat("\n Missing AOT columns:\n") - for (col in missing_aot) { - cat(" -", col, "\n") - } -} - -# Check CRT columns -missing_crt <- crt_cols[!crt_cols %in% df_cols] -existing_crt <- crt_cols[crt_cols %in% df_cols] - -cat("\nCRT Source Columns:\n") -cat(" Expected: 3 columns\n") -cat(" Found:", length(existing_crt), "columns\n") -cat(" Missing:", length(missing_crt), "columns\n") - -if (length(missing_crt) > 0) { - cat("\n Missing CRT columns:\n") - for (col in missing_crt) { - cat(" -", col, "\n") - } -} - -# Check target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 3 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Missing target columns:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_aot) > 4 || length(missing_crt) > 1 || length(missing_targets) > 1) { - stop("ERROR: Too many columns missing! 
Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= PROCESS AOT SCALE ============= -cat("Processing AOT scale...\n") - -# Convert AOT columns to numeric (handling any non-numeric values) -for (col in aot_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Calculate average with reverse coding (WITHOUT modifying original values) -# Items 4, 5, 6, 7 are reverse coded for calculation only -df$aot_total <- apply(df[, aot_cols[aot_cols %in% names(df)], drop = FALSE], 1, function(row) { - # Create a copy for calculation - values <- as.numeric(row) - - # Reverse items 4, 5, 6, 7 (positions in aot_cols vector) - reverse_positions <- c(4, 5, 6, 7) - values[reverse_positions] <- values[reverse_positions] * -1 - - # Return mean (na.rm = TRUE handles missing values) - mean(values, na.rm = TRUE) -}) - -cat(" AOT total scores calculated (items 4-7 reverse coded for calculation only).\n") -cat(" Original AOT item values preserved in dataframe.\n\n") - -# ============= PROCESS CRT SCALES ============= -cat("Processing CRT scales...\n") - -# Initialize CRT columns -df$crt_correct <- NA -df$crt_int <- NA - -# Process each row -for (i in 1:nrow(df)) { - # CRT Correct - crt_correct_count <- 0 - crt_correct_n <- 0 - - for (j in 1:3) { - col <- crt_cols[j] - if (col %in% names(df)) { - response <- trimws(tolower(as.character(df[i, col]))) - correct_answer <- tolower(crt_correct_answers[j]) - - if (!is.na(response) && response != "") { - crt_correct_n <- crt_correct_n + 1 - if (response == correct_answer) { - crt_correct_count <- crt_correct_count + 1 - } - } - } - } - - # Calculate proportion correct - if (crt_correct_n > 0) { - df$crt_correct[i] <- crt_correct_count / crt_correct_n - } - - # CRT Intuitive - crt_int_count <- 0 - crt_int_n <- 0 - - for (j in 1:3) { - col <- crt_cols[j] - if (col %in% names(df)) { - response <- trimws(tolower(as.character(df[i, col]))) - intuitive_answer <- tolower(crt_intuitive_answers[j]) - - if (!is.na(response) && response != "") { - crt_int_n <- crt_int_n + 1 - if (response == intuitive_answer) { - crt_int_count <- crt_int_count + 1 - } - } - } - } - - # Calculate proportion intuitive - if (crt_int_n > 0) { - df$crt_int[i] <- crt_int_count / crt_int_n - } -} - -cat(" CRT correct and intuitive scores calculated.\n\n") - -cat("=== PROCESSING COMPLETE ===\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # AOT Check - cat("--- AOT SCALE ---\n") - cat("Source values (original in CSV):\n") - aot_original <- numeric(8) - aot_for_calc <- numeric(8) - - for (i in 1:8) { - col <- aot_cols[i] - val <- if (col %in% names(df)) df[random_row, col] else NA - aot_original[i] <- val - - # Apply reversal for items 4-7 - if (i %in% 4:7) { - aot_for_calc[i] <- val * -1 - cat(sprintf(" %s: %s (reversed to %s for calculation)\n", - col, - ifelse(is.na(val), "NA", as.character(val)), - ifelse(is.na(val), "NA", as.character(val * -1)))) - } else { - aot_for_calc[i] <- val - cat(sprintf(" %s: %s\n", col, ifelse(is.na(val), "NA", as.character(val)))) - } - } - - # Manual calculation check - valid_aot <- aot_for_calc[!is.na(aot_for_calc)] - if 
(length(valid_aot) > 0) { - expected_mean <- mean(valid_aot) - actual_value <- df$aot_total[random_row] - cat(sprintf("\nCalculation check:\n")) - cat(sprintf(" Sum of reversed values: %s\n", paste(valid_aot, collapse = " + "))) - cat(sprintf(" Average of %d valid items: %.5f\n", length(valid_aot), expected_mean)) - cat(sprintf(" Target value (aot_total): %.5f\n", actual_value)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_mean - actual_value) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat("\n No valid AOT values to calculate.\n") - } - - # CRT Check - cat("\n--- CRT SCALE ---\n") - cat("Source values:\n") - crt_correct_count <- 0 - crt_int_count <- 0 - crt_n <- 0 - - for (i in 1:3) { - col <- crt_cols[i] - val <- if (col %in% names(df)) as.character(df[random_row, col]) else "" - val_trimmed <- trimws(tolower(val)) - - correct_ans <- crt_correct_answers[i] - intuitive_ans <- crt_intuitive_answers[i] - - is_correct <- val_trimmed == tolower(correct_ans) - is_intuitive <- val_trimmed == tolower(intuitive_ans) - - if (val_trimmed != "" && !is.na(val_trimmed)) { - crt_n <- crt_n + 1 - if (is_correct) crt_correct_count <- crt_correct_count + 1 - if (is_intuitive) crt_int_count <- crt_int_count + 1 - } - - cat(sprintf(" %s: '%s'\n", col, val)) - cat(sprintf(" Correct answer: '%s' -> %s\n", correct_ans, ifelse(is_correct, "CORRECT ✓", "Not correct"))) - cat(sprintf(" Intuitive answer: '%s' -> %s\n", intuitive_ans, ifelse(is_intuitive, "INTUITIVE ✓", "Not intuitive"))) - } - - cat("\nCalculation check:\n") - if (crt_n > 0) { - expected_correct <- crt_correct_count / crt_n - expected_int <- crt_int_count / crt_n - actual_correct <- df$crt_correct[random_row] - actual_int <- df$crt_int[random_row] - - cat(sprintf(" Correct: %d out of %d = %.5f\n", crt_correct_count, crt_n, expected_correct)) - cat(sprintf(" Target value (crt_correct): %.5f\n", actual_correct)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_correct - actual_correct) < 0.0001, "YES ✓", "NO ✗"))) - - cat(sprintf("\n Intuitive: %d out of %d = %.5f\n", crt_int_count, crt_n, expected_int)) - cat(sprintf(" Target value (crt_int): %.5f\n", actual_int)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_int - actual_int) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" No valid CRT responses to calculate.\n") - } - - cat("\n========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 253 to save changes.\n") -cat("\nProcessing complete! 
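Because qa_check_random_row() draws its row with sample(), two runs rarely inspect the same record. When a spot-check needs to be reproducible — for instance, to show a reviewer the same row — fixing the RNG seed first is enough; the seed value below is arbitrary:

    set.seed(42)            # arbitrary constant, chosen only for reproducibility
    qa_check_random_row()   # now reports the same row on every run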
AOT and CRT scales calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP - recode scales VARS_20251001120204.r b/.history/eohi2/dataP - recode scales VARS_20251001120204.r deleted file mode 100644 index 2861737..0000000 --- a/.history/eohi2/dataP - recode scales VARS_20251001120204.r +++ /dev/null @@ -1,298 +0,0 @@ -# Script to compute AOT and CRT scales in eohi2.csv -# AOT: Reverse codes items 4-7, then averages all 8 items -# CRT: Calculates proportion of correct and intuitive responses - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define source columns -aot_cols <- c("aot_1", "aot_2", "aot_3", "aot_4", "aot_5", "aot_6", "aot_7", "aot_8") -crt_cols <- c("crt_1", "crt_2", "crt_3") - -# Define target columns -target_cols <- c("aot_total", "crt_correct", "crt_int") - -# Define correct and intuitive CRT answers -crt_correct_answers <- c("5 cents", "5 minutes", "47 days") -crt_intuitive_answers <- c("10 cents", "100 minutes", "24 days") - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check AOT columns -missing_aot <- aot_cols[!aot_cols %in% df_cols] -existing_aot <- aot_cols[aot_cols %in% df_cols] - -cat("AOT Source Columns:\n") -cat(" Expected: 8 columns\n") -cat(" Found:", length(existing_aot), "columns\n") -cat(" Missing:", length(missing_aot), "columns\n") - -if (length(missing_aot) > 0) { - cat("\n Missing AOT columns:\n") - for (col in missing_aot) { - cat(" -", col, "\n") - } -} - -# Check CRT columns -missing_crt <- crt_cols[!crt_cols %in% df_cols] -existing_crt <- crt_cols[crt_cols %in% df_cols] - -cat("\nCRT Source Columns:\n") -cat(" Expected: 3 columns\n") -cat(" Found:", length(existing_crt), "columns\n") -cat(" Missing:", length(missing_crt), "columns\n") - -if (length(missing_crt) > 0) { - cat("\n Missing CRT columns:\n") - for (col in missing_crt) { - cat(" -", col, "\n") - } -} - -# Check target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 3 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Missing target columns:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_aot) > 4 || length(missing_crt) > 1 || length(missing_targets) > 1) { - stop("ERROR: Too many columns missing! 
Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= PROCESS AOT SCALE ============= -cat("Processing AOT scale...\n") - -# Convert AOT columns to numeric (handling any non-numeric values) -for (col in aot_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Calculate average with reverse coding (WITHOUT modifying original values) -# Items 4, 5, 6, 7 are reverse coded for calculation only -df$aot_total <- apply(df[, aot_cols[aot_cols %in% names(df)], drop = FALSE], 1, function(row) { - # Create a copy for calculation - values <- as.numeric(row) - - # Reverse items 4, 5, 6, 7 (positions in aot_cols vector) - reverse_positions <- c(4, 5, 6, 7) - values[reverse_positions] <- values[reverse_positions] * -1 - - # Return mean (na.rm = TRUE handles missing values) - mean(values, na.rm = TRUE) -}) - -cat(" AOT total scores calculated (items 4-7 reverse coded for calculation only).\n") -cat(" Original AOT item values preserved in dataframe.\n\n") - -# ============= PROCESS CRT SCALES ============= -cat("Processing CRT scales...\n") - -# Initialize CRT columns -df$crt_correct <- NA -df$crt_int <- NA - -# Process each row -for (i in 1:nrow(df)) { - # CRT Correct - crt_correct_count <- 0 - crt_correct_n <- 0 - - for (j in 1:3) { - col <- crt_cols[j] - if (col %in% names(df)) { - response <- trimws(tolower(as.character(df[i, col]))) - correct_answer <- tolower(crt_correct_answers[j]) - - if (!is.na(response) && response != "") { - crt_correct_n <- crt_correct_n + 1 - if (response == correct_answer) { - crt_correct_count <- crt_correct_count + 1 - } - } - } - } - - # Calculate proportion correct - if (crt_correct_n > 0) { - df$crt_correct[i] <- crt_correct_count / crt_correct_n - } - - # CRT Intuitive - crt_int_count <- 0 - crt_int_n <- 0 - - for (j in 1:3) { - col <- crt_cols[j] - if (col %in% names(df)) { - response <- trimws(tolower(as.character(df[i, col]))) - intuitive_answer <- tolower(crt_intuitive_answers[j]) - - if (!is.na(response) && response != "") { - crt_int_n <- crt_int_n + 1 - if (response == intuitive_answer) { - crt_int_count <- crt_int_count + 1 - } - } - } - } - - # Calculate proportion intuitive - if (crt_int_n > 0) { - df$crt_int[i] <- crt_int_count / crt_int_n - } -} - -cat(" CRT correct and intuitive scores calculated.\n\n") - -cat("=== PROCESSING COMPLETE ===\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # AOT Check - cat("--- AOT SCALE ---\n") - cat("Source values (original in CSV):\n") - aot_original <- numeric(8) - aot_for_calc <- numeric(8) - - for (i in 1:8) { - col <- aot_cols[i] - val <- if (col %in% names(df)) df[random_row, col] else NA - aot_original[i] <- val - - # Apply reversal for items 4-7 - if (i %in% 4:7) { - aot_for_calc[i] <- val * -1 - cat(sprintf(" %s: %s (reversed to %s for calculation)\n", - col, - ifelse(is.na(val), "NA", as.character(val)), - ifelse(is.na(val), "NA", as.character(val * -1)))) - } else { - aot_for_calc[i] <- val - cat(sprintf(" %s: %s\n", col, ifelse(is.na(val), "NA", as.character(val)))) - } - } - - # Manual calculation check - valid_aot <- aot_for_calc[!is.na(aot_for_calc)] - if 
(length(valid_aot) > 0) { - expected_mean <- mean(valid_aot) - actual_value <- df$aot_total[random_row] - cat(sprintf("\nCalculation check:\n")) - cat(sprintf(" Sum of reversed values: %s\n", paste(valid_aot, collapse = " + "))) - cat(sprintf(" Average of %d valid items: %.5f\n", length(valid_aot), expected_mean)) - cat(sprintf(" Target value (aot_total): %.5f\n", actual_value)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_mean - actual_value) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat("\n No valid AOT values to calculate.\n") - } - - # CRT Check - cat("\n--- CRT SCALE ---\n") - cat("Source values:\n") - crt_correct_count <- 0 - crt_int_count <- 0 - crt_n <- 0 - - for (i in 1:3) { - col <- crt_cols[i] - val <- if (col %in% names(df)) as.character(df[random_row, col]) else "" - val_trimmed <- trimws(tolower(val)) - - correct_ans <- crt_correct_answers[i] - intuitive_ans <- crt_intuitive_answers[i] - - is_correct <- val_trimmed == tolower(correct_ans) - is_intuitive <- val_trimmed == tolower(intuitive_ans) - - if (val_trimmed != "" && !is.na(val_trimmed)) { - crt_n <- crt_n + 1 - if (is_correct) crt_correct_count <- crt_correct_count + 1 - if (is_intuitive) crt_int_count <- crt_int_count + 1 - } - - cat(sprintf(" %s: '%s'\n", col, val)) - cat(sprintf(" Correct answer: '%s' -> %s\n", correct_ans, ifelse(is_correct, "CORRECT ✓", "Not correct"))) - cat(sprintf(" Intuitive answer: '%s' -> %s\n", intuitive_ans, ifelse(is_intuitive, "INTUITIVE ✓", "Not intuitive"))) - } - - cat("\nCalculation check:\n") - if (crt_n > 0) { - expected_correct <- crt_correct_count / crt_n - expected_int <- crt_int_count / crt_n - actual_correct <- df$crt_correct[random_row] - actual_int <- df$crt_int[random_row] - - cat(sprintf(" Correct: %d out of %d = %.5f\n", crt_correct_count, crt_n, expected_correct)) - cat(sprintf(" Target value (crt_correct): %.5f\n", actual_correct)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_correct - actual_correct) < 0.0001, "YES ✓", "NO ✗"))) - - cat(sprintf("\n Intuitive: %d out of %d = %.5f\n", crt_int_count, crt_n, expected_int)) - cat(sprintf(" Target value (crt_int): %.5f\n", actual_int)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_int - actual_int) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" No valid CRT responses to calculate.\n") - } - - cat("\n========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 253 to save changes.\n") -cat("\nProcessing complete! 
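Note that in this snapshot (and the ones that follow) the write.csv() call is live even though the banner still prints that writing is commented out, so sourcing the script silently overwrites eohi2.csv in place. A safer review workflow writes the scored data to a sibling file first; the output file name below is illustrative only:

    # Write to a new file, inspect/diff it, and only then replace eohi2.csv.
    write.csv(df, "eohi2_scored.csv", row.names = FALSE, na = "")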
AOT and CRT scales calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP - recode scales VARS_20251001120217.r b/.history/eohi2/dataP - recode scales VARS_20251001120217.r deleted file mode 100644 index 2861737..0000000 --- a/.history/eohi2/dataP - recode scales VARS_20251001120217.r +++ /dev/null @@ -1,298 +0,0 @@ -# Script to compute AOT and CRT scales in eohi2.csv -# AOT: Reverse codes items 4-7, then averages all 8 items -# CRT: Calculates proportion of correct and intuitive responses - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define source columns -aot_cols <- c("aot_1", "aot_2", "aot_3", "aot_4", "aot_5", "aot_6", "aot_7", "aot_8") -crt_cols <- c("crt_1", "crt_2", "crt_3") - -# Define target columns -target_cols <- c("aot_total", "crt_correct", "crt_int") - -# Define correct and intuitive CRT answers -crt_correct_answers <- c("5 cents", "5 minutes", "47 days") -crt_intuitive_answers <- c("10 cents", "100 minutes", "24 days") - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check AOT columns -missing_aot <- aot_cols[!aot_cols %in% df_cols] -existing_aot <- aot_cols[aot_cols %in% df_cols] - -cat("AOT Source Columns:\n") -cat(" Expected: 8 columns\n") -cat(" Found:", length(existing_aot), "columns\n") -cat(" Missing:", length(missing_aot), "columns\n") - -if (length(missing_aot) > 0) { - cat("\n Missing AOT columns:\n") - for (col in missing_aot) { - cat(" -", col, "\n") - } -} - -# Check CRT columns -missing_crt <- crt_cols[!crt_cols %in% df_cols] -existing_crt <- crt_cols[crt_cols %in% df_cols] - -cat("\nCRT Source Columns:\n") -cat(" Expected: 3 columns\n") -cat(" Found:", length(existing_crt), "columns\n") -cat(" Missing:", length(missing_crt), "columns\n") - -if (length(missing_crt) > 0) { - cat("\n Missing CRT columns:\n") - for (col in missing_crt) { - cat(" -", col, "\n") - } -} - -# Check target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 3 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Missing target columns:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_aot) > 4 || length(missing_crt) > 1 || length(missing_targets) > 1) { - stop("ERROR: Too many columns missing! 
Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= PROCESS AOT SCALE ============= -cat("Processing AOT scale...\n") - -# Convert AOT columns to numeric (handling any non-numeric values) -for (col in aot_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Calculate average with reverse coding (WITHOUT modifying original values) -# Items 4, 5, 6, 7 are reverse coded for calculation only -df$aot_total <- apply(df[, aot_cols[aot_cols %in% names(df)], drop = FALSE], 1, function(row) { - # Create a copy for calculation - values <- as.numeric(row) - - # Reverse items 4, 5, 6, 7 (positions in aot_cols vector) - reverse_positions <- c(4, 5, 6, 7) - values[reverse_positions] <- values[reverse_positions] * -1 - - # Return mean (na.rm = TRUE handles missing values) - mean(values, na.rm = TRUE) -}) - -cat(" AOT total scores calculated (items 4-7 reverse coded for calculation only).\n") -cat(" Original AOT item values preserved in dataframe.\n\n") - -# ============= PROCESS CRT SCALES ============= -cat("Processing CRT scales...\n") - -# Initialize CRT columns -df$crt_correct <- NA -df$crt_int <- NA - -# Process each row -for (i in 1:nrow(df)) { - # CRT Correct - crt_correct_count <- 0 - crt_correct_n <- 0 - - for (j in 1:3) { - col <- crt_cols[j] - if (col %in% names(df)) { - response <- trimws(tolower(as.character(df[i, col]))) - correct_answer <- tolower(crt_correct_answers[j]) - - if (!is.na(response) && response != "") { - crt_correct_n <- crt_correct_n + 1 - if (response == correct_answer) { - crt_correct_count <- crt_correct_count + 1 - } - } - } - } - - # Calculate proportion correct - if (crt_correct_n > 0) { - df$crt_correct[i] <- crt_correct_count / crt_correct_n - } - - # CRT Intuitive - crt_int_count <- 0 - crt_int_n <- 0 - - for (j in 1:3) { - col <- crt_cols[j] - if (col %in% names(df)) { - response <- trimws(tolower(as.character(df[i, col]))) - intuitive_answer <- tolower(crt_intuitive_answers[j]) - - if (!is.na(response) && response != "") { - crt_int_n <- crt_int_n + 1 - if (response == intuitive_answer) { - crt_int_count <- crt_int_count + 1 - } - } - } - } - - # Calculate proportion intuitive - if (crt_int_n > 0) { - df$crt_int[i] <- crt_int_count / crt_int_n - } -} - -cat(" CRT correct and intuitive scores calculated.\n\n") - -cat("=== PROCESSING COMPLETE ===\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # AOT Check - cat("--- AOT SCALE ---\n") - cat("Source values (original in CSV):\n") - aot_original <- numeric(8) - aot_for_calc <- numeric(8) - - for (i in 1:8) { - col <- aot_cols[i] - val <- if (col %in% names(df)) df[random_row, col] else NA - aot_original[i] <- val - - # Apply reversal for items 4-7 - if (i %in% 4:7) { - aot_for_calc[i] <- val * -1 - cat(sprintf(" %s: %s (reversed to %s for calculation)\n", - col, - ifelse(is.na(val), "NA", as.character(val)), - ifelse(is.na(val), "NA", as.character(val * -1)))) - } else { - aot_for_calc[i] <- val - cat(sprintf(" %s: %s\n", col, ifelse(is.na(val), "NA", as.character(val)))) - } - } - - # Manual calculation check - valid_aot <- aot_for_calc[!is.na(aot_for_calc)] - if 
(length(valid_aot) > 0) {
-    expected_mean <- mean(valid_aot)
-    actual_value <- df$aot_total[random_row]
-    cat(sprintf("\nCalculation check:\n"))
-    cat(sprintf("  Sum of reversed values: %s\n", paste(valid_aot, collapse = " + ")))
-    cat(sprintf("  Average of %d valid items: %.5f\n", length(valid_aot), expected_mean))
-    cat(sprintf("  Target value (aot_total): %.5f\n", actual_value))
-    cat(sprintf("  Match: %s\n", ifelse(abs(expected_mean - actual_value) < 0.0001, "YES ✓", "NO ✗")))
-  } else {
-    cat("\n  No valid AOT values to calculate.\n")
-  }
-
-  # CRT Check
-  cat("\n--- CRT SCALE ---\n")
-  cat("Source values:\n")
-  crt_correct_count <- 0
-  crt_int_count <- 0
-  crt_n <- 0
-
-  for (i in 1:3) {
-    col <- crt_cols[i]
-    val <- if (col %in% names(df)) as.character(df[random_row, col]) else ""
-    val_trimmed <- trimws(tolower(val))
-
-    correct_ans <- crt_correct_answers[i]
-    intuitive_ans <- crt_intuitive_answers[i]
-
-    is_correct <- val_trimmed == tolower(correct_ans)
-    is_intuitive <- val_trimmed == tolower(intuitive_ans)
-
-    if (val_trimmed != "" && !is.na(val_trimmed)) {
-      crt_n <- crt_n + 1
-      if (is_correct) crt_correct_count <- crt_correct_count + 1
-      if (is_intuitive) crt_int_count <- crt_int_count + 1
-    }
-
-    cat(sprintf("  %s: '%s'\n", col, val))
-    cat(sprintf("    Correct answer: '%s' -> %s\n", correct_ans, ifelse(is_correct, "CORRECT ✓", "Not correct")))
-    cat(sprintf("    Intuitive answer: '%s' -> %s\n", intuitive_ans, ifelse(is_intuitive, "INTUITIVE ✓", "Not intuitive")))
-  }
-
-  cat("\nCalculation check:\n")
-  if (crt_n > 0) {
-    expected_correct <- crt_correct_count / crt_n
-    expected_int <- crt_int_count / crt_n
-    actual_correct <- df$crt_correct[random_row]
-    actual_int <- df$crt_int[random_row]
-
-    cat(sprintf("  Correct: %d out of %d = %.5f\n", crt_correct_count, crt_n, expected_correct))
-    cat(sprintf("  Target value (crt_correct): %.5f\n", actual_correct))
-    cat(sprintf("  Match: %s\n", ifelse(abs(expected_correct - actual_correct) < 0.0001, "YES ✓", "NO ✗")))
-
-    cat(sprintf("\n  Intuitive: %d out of %d = %.5f\n", crt_int_count, crt_n, expected_int))
-    cat(sprintf("  Target value (crt_int): %.5f\n", actual_int))
-    cat(sprintf("  Match: %s\n", ifelse(abs(expected_int - actual_int) < 0.0001, "YES ✓", "NO ✗")))
-  } else {
-    cat("  No valid CRT responses to calculate.\n")
-  }
-
-  cat("\n========================================\n")
-  cat("END QA CHECK\n")
-  cat("========================================\n\n")
-}
-
-# Run QA check on first random row
-cat("\n\n")
-qa_check_random_row()
-
-# Instructions for running additional checks
-cat("\n")
-cat("*** TO CHECK ANOTHER RANDOM ROW ***\n")
-cat("Run this command in R console:\n")
-cat("  qa_check_random_row()\n")
-cat("\n")
-
-
-# Save the modified dataframe back to CSV
-# na="" writes NA values as empty cells instead of "NA" text
-# COMMENTED OUT FOR REVIEW - Uncomment when ready to save
-# write.csv(df, "eohi2.csv", row.names = FALSE, na = "")
-
-cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n")
-cat("Review the output above, then uncomment line 253 to save changes.\n")
-cat("\nProcessing complete! 
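The Match lines compare recomputed and stored values with abs(diff) < 0.0001 rather than ==, since doubles produced by different code paths rarely compare exactly equal; base R's all.equal() encodes the same idea:

x <- 1/3
y <- 0.333333
x == y                                      # FALSE: exact equality is too strict for doubles
abs(x - y) < 1e-4                           # TRUE: the tolerance test used in the QA checks
isTRUE(all.equal(x, y, tolerance = 1e-4))   # TRUE: equivalent base-R idiom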
AOT and CRT scales calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP - recode scales VARS_20251001120228.r b/.history/eohi2/dataP - recode scales VARS_20251001120228.r deleted file mode 100644 index 3eb817b..0000000 --- a/.history/eohi2/dataP - recode scales VARS_20251001120228.r +++ /dev/null @@ -1,298 +0,0 @@ -# Script to compute AOT and CRT scales in eohi2.csv -# AOT: Reverse codes items 4-7, then averages all 8 items -# CRT: Calculates proportion of correct and intuitive responses - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define source columns -aot_cols <- c("aot_1", "aot_2", "aot_3", "aot_4", "aot_5", "aot_6", "aot_7", "aot_8") -crt_cols <- c("crt_1", "crt_2", "crt_3") - -# Define target columns -target_cols <- c("aot_total", "crt_correct", "crt_int") - -# Define correct and intuitive CRT answers -crt_correct_answers <- c("5 cents", "5 minutes", "47 days") -crt_intuitive_answers <- c("10 cents", "100 minutes", "24 days") - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check AOT columns -missing_aot <- aot_cols[!aot_cols %in% df_cols] -existing_aot <- aot_cols[aot_cols %in% df_cols] - -cat("AOT Source Columns:\n") -cat(" Expected: 8 columns\n") -cat(" Found:", length(existing_aot), "columns\n") -cat(" Missing:", length(missing_aot), "columns\n") - -if (length(missing_aot) > 0) { - cat("\n Missing AOT columns:\n") - for (col in missing_aot) { - cat(" -", col, "\n") - } -} - -# Check CRT columns -missing_crt <- crt_cols[!crt_cols %in% df_cols] -existing_crt <- crt_cols[crt_cols %in% df_cols] - -cat("\nCRT Source Columns:\n") -cat(" Expected: 3 columns\n") -cat(" Found:", length(existing_crt), "columns\n") -cat(" Missing:", length(missing_crt), "columns\n") - -if (length(missing_crt) > 0) { - cat("\n Missing CRT columns:\n") - for (col in missing_crt) { - cat(" -", col, "\n") - } -} - -# Check target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 3 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Missing target columns:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_aot) > 4 || length(missing_crt) > 1 || length(missing_targets) > 1) { - stop("ERROR: Too many columns missing! 
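check.names = FALSE matters here because some column names in this project (e.g. the 5.10past_* targets used by the companion scripts) are not syntactically valid R names, and the default reader would rewrite them:

make.names("5.10past_pref_read")   # "X5.10past_pref_read": what check.names = TRUE would produce
# with check.names = FALSE the original header is kept, at the cost of needing
# df[["5.10past_pref_read"]] or backticks instead of df$... access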
Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= PROCESS AOT SCALE ============= -cat("Processing AOT scale...\n") - -# Convert AOT columns to numeric (handling any non-numeric values) -for (col in aot_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Calculate average with reverse coding (WITHOUT modifying original values) -# Items 4, 5, 6, 7 are reverse coded for calculation only -df$aot_total <- apply(df[, aot_cols[aot_cols %in% names(df)], drop = FALSE], 1, function(row) { - # Create a copy for calculation - values <- as.numeric(row) - - # Reverse items 4, 5, 6, 7 (positions in aot_cols vector) - reverse_positions <- c(4, 5, 6, 7) - values[reverse_positions] <- values[reverse_positions] * -1 - - # Return mean (na.rm = TRUE handles missing values) - mean(values, na.rm = TRUE) -}) - -cat(" AOT total scores calculated (items 4-7 reverse coded for calculation only).\n") -cat(" Original AOT item values preserved in dataframe.\n\n") - -# ============= PROCESS CRT SCALES ============= -cat("Processing CRT scales...\n") - -# Initialize CRT columns -df$crt_correct <- NA -df$crt_int <- NA - -# Process each row -for (i in 1:nrow(df)) { - # CRT Correct - crt_correct_count <- 0 - crt_correct_n <- 0 - - for (j in 1:3) { - col <- crt_cols[j] - if (col %in% names(df)) { - response <- trimws(tolower(as.character(df[i, col]))) - correct_answer <- tolower(crt_correct_answers[j]) - - if (!is.na(response) && response != "") { - crt_correct_n <- crt_correct_n + 1 - if (response == correct_answer) { - crt_correct_count <- crt_correct_count + 1 - } - } - } - } - - # Calculate proportion correct - if (crt_correct_n > 0) { - df$crt_correct[i] <- crt_correct_count / crt_correct_n - } - - # CRT Intuitive - crt_int_count <- 0 - crt_int_n <- 0 - - for (j in 1:3) { - col <- crt_cols[j] - if (col %in% names(df)) { - response <- trimws(tolower(as.character(df[i, col]))) - intuitive_answer <- tolower(crt_intuitive_answers[j]) - - if (!is.na(response) && response != "") { - crt_int_n <- crt_int_n + 1 - if (response == intuitive_answer) { - crt_int_count <- crt_int_count + 1 - } - } - } - } - - # Calculate proportion intuitive - if (crt_int_n > 0) { - df$crt_int[i] <- crt_int_count / crt_int_n - } -} - -cat(" CRT correct and intuitive scores calculated.\n\n") - -cat("=== PROCESSING COMPLETE ===\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # AOT Check - cat("--- AOT SCALE ---\n") - cat("Source values (original in CSV):\n") - aot_original <- numeric(8) - aot_for_calc <- numeric(8) - - for (i in 1:8) { - col <- aot_cols[i] - val <- if (col %in% names(df)) df[random_row, col] else NA - aot_original[i] <- val - - # Apply reversal for items 4-7 - if (i %in% 4:7) { - aot_for_calc[i] <- val * -1 - cat(sprintf(" %s: %s (reversed to %s for calculation)\n", - col, - ifelse(is.na(val), "NA", as.character(val)), - ifelse(is.na(val), "NA", as.character(val * -1)))) - } else { - aot_for_calc[i] <- val - cat(sprintf(" %s: %s\n", col, ifelse(is.na(val), "NA", as.character(val)))) - } - } - - # Manual calculation check - valid_aot <- aot_for_calc[!is.na(aot_for_calc)] - if 
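The double row loop above can also be computed column-wise; a sketch assuming crt_1..crt_3 all exist, with the same scoring rule (blanks do not count toward the denominator):

norm <- function(x) trimws(tolower(as.character(x)))
ans  <- vapply(df[crt_cols], norm, character(nrow(df)))    # n x 3 answer matrix
answered <- !is.na(ans) & ans != ""
correct  <- t(t(ans) == tolower(crt_correct_answers)) & answered
n_ans <- rowSums(answered)
crt_correct_vec <- ifelse(n_ans > 0, rowSums(correct) / n_ans, NA)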
(length(valid_aot) > 0) { - expected_mean <- mean(valid_aot) - actual_value <- df$aot_total[random_row] - cat(sprintf("\nCalculation check:\n")) - cat(sprintf(" Sum of reversed values: %s\n", paste(valid_aot, collapse = " + "))) - cat(sprintf(" Average of %d valid items: %.5f\n", length(valid_aot), expected_mean)) - cat(sprintf(" Target value (aot_total): %.5f\n", actual_value)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_mean - actual_value) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat("\n No valid AOT values to calculate.\n") - } - - # CRT Check - cat("\n--- CRT SCALE ---\n") - cat("Source values:\n") - crt_correct_count <- 0 - crt_int_count <- 0 - crt_n <- 0 - - for (i in 1:3) { - col <- crt_cols[i] - val <- if (col %in% names(df)) as.character(df[random_row, col]) else "" - val_trimmed <- trimws(tolower(val)) - - correct_ans <- crt_correct_answers[i] - intuitive_ans <- crt_intuitive_answers[i] - - is_correct <- val_trimmed == tolower(correct_ans) - is_intuitive <- val_trimmed == tolower(intuitive_ans) - - if (val_trimmed != "" && !is.na(val_trimmed)) { - crt_n <- crt_n + 1 - if (is_correct) crt_correct_count <- crt_correct_count + 1 - if (is_intuitive) crt_int_count <- crt_int_count + 1 - } - - cat(sprintf(" %s: '%s'\n", col, val)) - cat(sprintf(" Correct answer: '%s' -> %s\n", correct_ans, ifelse(is_correct, "CORRECT ✓", "Not correct"))) - cat(sprintf(" Intuitive answer: '%s' -> %s\n", intuitive_ans, ifelse(is_intuitive, "INTUITIVE ✓", "Not intuitive"))) - } - - cat("\nCalculation check:\n") - if (crt_n > 0) { - expected_correct <- crt_correct_count / crt_n - expected_int <- crt_int_count / crt_n - actual_correct <- df$crt_correct[random_row] - actual_int <- df$crt_int[random_row] - - cat(sprintf(" Correct: %d out of %d = %.5f\n", crt_correct_count, crt_n, expected_correct)) - cat(sprintf(" Target value (crt_correct): %.5f\n", actual_correct)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_correct - actual_correct) < 0.0001, "YES ✓", "NO ✗"))) - - cat(sprintf("\n Intuitive: %d out of %d = %.5f\n", crt_int_count, crt_n, expected_int)) - cat(sprintf(" Target value (crt_int): %.5f\n", actual_int)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_int - actual_int) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" No valid CRT responses to calculate.\n") - } - - cat("\n========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -# write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 253 to save changes.\n") -cat("\nProcessing complete! 
AOT and CRT scales calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP - recode scales VARS_20251001121501.r b/.history/eohi2/dataP - recode scales VARS_20251001121501.r deleted file mode 100644 index 2861737..0000000 --- a/.history/eohi2/dataP - recode scales VARS_20251001121501.r +++ /dev/null @@ -1,298 +0,0 @@ -# Script to compute AOT and CRT scales in eohi2.csv -# AOT: Reverse codes items 4-7, then averages all 8 items -# CRT: Calculates proportion of correct and intuitive responses - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define source columns -aot_cols <- c("aot_1", "aot_2", "aot_3", "aot_4", "aot_5", "aot_6", "aot_7", "aot_8") -crt_cols <- c("crt_1", "crt_2", "crt_3") - -# Define target columns -target_cols <- c("aot_total", "crt_correct", "crt_int") - -# Define correct and intuitive CRT answers -crt_correct_answers <- c("5 cents", "5 minutes", "47 days") -crt_intuitive_answers <- c("10 cents", "100 minutes", "24 days") - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check AOT columns -missing_aot <- aot_cols[!aot_cols %in% df_cols] -existing_aot <- aot_cols[aot_cols %in% df_cols] - -cat("AOT Source Columns:\n") -cat(" Expected: 8 columns\n") -cat(" Found:", length(existing_aot), "columns\n") -cat(" Missing:", length(missing_aot), "columns\n") - -if (length(missing_aot) > 0) { - cat("\n Missing AOT columns:\n") - for (col in missing_aot) { - cat(" -", col, "\n") - } -} - -# Check CRT columns -missing_crt <- crt_cols[!crt_cols %in% df_cols] -existing_crt <- crt_cols[crt_cols %in% df_cols] - -cat("\nCRT Source Columns:\n") -cat(" Expected: 3 columns\n") -cat(" Found:", length(existing_crt), "columns\n") -cat(" Missing:", length(missing_crt), "columns\n") - -if (length(missing_crt) > 0) { - cat("\n Missing CRT columns:\n") - for (col in missing_crt) { - cat(" -", col, "\n") - } -} - -# Check target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 3 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Missing target columns:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_aot) > 4 || length(missing_crt) > 1 || length(missing_targets) > 1) { - stop("ERROR: Too many columns missing! 
Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= PROCESS AOT SCALE ============= -cat("Processing AOT scale...\n") - -# Convert AOT columns to numeric (handling any non-numeric values) -for (col in aot_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Calculate average with reverse coding (WITHOUT modifying original values) -# Items 4, 5, 6, 7 are reverse coded for calculation only -df$aot_total <- apply(df[, aot_cols[aot_cols %in% names(df)], drop = FALSE], 1, function(row) { - # Create a copy for calculation - values <- as.numeric(row) - - # Reverse items 4, 5, 6, 7 (positions in aot_cols vector) - reverse_positions <- c(4, 5, 6, 7) - values[reverse_positions] <- values[reverse_positions] * -1 - - # Return mean (na.rm = TRUE handles missing values) - mean(values, na.rm = TRUE) -}) - -cat(" AOT total scores calculated (items 4-7 reverse coded for calculation only).\n") -cat(" Original AOT item values preserved in dataframe.\n\n") - -# ============= PROCESS CRT SCALES ============= -cat("Processing CRT scales...\n") - -# Initialize CRT columns -df$crt_correct <- NA -df$crt_int <- NA - -# Process each row -for (i in 1:nrow(df)) { - # CRT Correct - crt_correct_count <- 0 - crt_correct_n <- 0 - - for (j in 1:3) { - col <- crt_cols[j] - if (col %in% names(df)) { - response <- trimws(tolower(as.character(df[i, col]))) - correct_answer <- tolower(crt_correct_answers[j]) - - if (!is.na(response) && response != "") { - crt_correct_n <- crt_correct_n + 1 - if (response == correct_answer) { - crt_correct_count <- crt_correct_count + 1 - } - } - } - } - - # Calculate proportion correct - if (crt_correct_n > 0) { - df$crt_correct[i] <- crt_correct_count / crt_correct_n - } - - # CRT Intuitive - crt_int_count <- 0 - crt_int_n <- 0 - - for (j in 1:3) { - col <- crt_cols[j] - if (col %in% names(df)) { - response <- trimws(tolower(as.character(df[i, col]))) - intuitive_answer <- tolower(crt_intuitive_answers[j]) - - if (!is.na(response) && response != "") { - crt_int_n <- crt_int_n + 1 - if (response == intuitive_answer) { - crt_int_count <- crt_int_count + 1 - } - } - } - } - - # Calculate proportion intuitive - if (crt_int_n > 0) { - df$crt_int[i] <- crt_int_count / crt_int_n - } -} - -cat(" CRT correct and intuitive scores calculated.\n\n") - -cat("=== PROCESSING COMPLETE ===\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # AOT Check - cat("--- AOT SCALE ---\n") - cat("Source values (original in CSV):\n") - aot_original <- numeric(8) - aot_for_calc <- numeric(8) - - for (i in 1:8) { - col <- aot_cols[i] - val <- if (col %in% names(df)) df[random_row, col] else NA - aot_original[i] <- val - - # Apply reversal for items 4-7 - if (i %in% 4:7) { - aot_for_calc[i] <- val * -1 - cat(sprintf(" %s: %s (reversed to %s for calculation)\n", - col, - ifelse(is.na(val), "NA", as.character(val)), - ifelse(is.na(val), "NA", as.character(val * -1)))) - } else { - aot_for_calc[i] <- val - cat(sprintf(" %s: %s\n", col, ifelse(is.na(val), "NA", as.character(val)))) - } - } - - # Manual calculation check - valid_aot <- aot_for_calc[!is.na(aot_for_calc)] - if 
(length(valid_aot) > 0) {
-    expected_mean <- mean(valid_aot)
-    actual_value <- df$aot_total[random_row]
-    cat(sprintf("\nCalculation check:\n"))
-    cat(sprintf("  Sum of reversed values: %s\n", paste(valid_aot, collapse = " + ")))
-    cat(sprintf("  Average of %d valid items: %.5f\n", length(valid_aot), expected_mean))
-    cat(sprintf("  Target value (aot_total): %.5f\n", actual_value))
-    cat(sprintf("  Match: %s\n", ifelse(abs(expected_mean - actual_value) < 0.0001, "YES ✓", "NO ✗")))
-  } else {
-    cat("\n  No valid AOT values to calculate.\n")
-  }
-
-  # CRT Check
-  cat("\n--- CRT SCALE ---\n")
-  cat("Source values:\n")
-  crt_correct_count <- 0
-  crt_int_count <- 0
-  crt_n <- 0
-
-  for (i in 1:3) {
-    col <- crt_cols[i]
-    val <- if (col %in% names(df)) as.character(df[random_row, col]) else ""
-    val_trimmed <- trimws(tolower(val))
-
-    correct_ans <- crt_correct_answers[i]
-    intuitive_ans <- crt_intuitive_answers[i]
-
-    is_correct <- val_trimmed == tolower(correct_ans)
-    is_intuitive <- val_trimmed == tolower(intuitive_ans)
-
-    if (val_trimmed != "" && !is.na(val_trimmed)) {
-      crt_n <- crt_n + 1
-      if (is_correct) crt_correct_count <- crt_correct_count + 1
-      if (is_intuitive) crt_int_count <- crt_int_count + 1
-    }
-
-    cat(sprintf("  %s: '%s'\n", col, val))
-    cat(sprintf("    Correct answer: '%s' -> %s\n", correct_ans, ifelse(is_correct, "CORRECT ✓", "Not correct")))
-    cat(sprintf("    Intuitive answer: '%s' -> %s\n", intuitive_ans, ifelse(is_intuitive, "INTUITIVE ✓", "Not intuitive")))
-  }
-
-  cat("\nCalculation check:\n")
-  if (crt_n > 0) {
-    expected_correct <- crt_correct_count / crt_n
-    expected_int <- crt_int_count / crt_n
-    actual_correct <- df$crt_correct[random_row]
-    actual_int <- df$crt_int[random_row]
-
-    cat(sprintf("  Correct: %d out of %d = %.5f\n", crt_correct_count, crt_n, expected_correct))
-    cat(sprintf("  Target value (crt_correct): %.5f\n", actual_correct))
-    cat(sprintf("  Match: %s\n", ifelse(abs(expected_correct - actual_correct) < 0.0001, "YES ✓", "NO ✗")))
-
-    cat(sprintf("\n  Intuitive: %d out of %d = %.5f\n", crt_int_count, crt_n, expected_int))
-    cat(sprintf("  Target value (crt_int): %.5f\n", actual_int))
-    cat(sprintf("  Match: %s\n", ifelse(abs(expected_int - actual_int) < 0.0001, "YES ✓", "NO ✗")))
-  } else {
-    cat("  No valid CRT responses to calculate.\n")
-  }
-
-  cat("\n========================================\n")
-  cat("END QA CHECK\n")
-  cat("========================================\n\n")
-}
-
-# Run QA check on first random row
-cat("\n\n")
-qa_check_random_row()
-
-# Instructions for running additional checks
-cat("\n")
-cat("*** TO CHECK ANOTHER RANDOM ROW ***\n")
-cat("Run this command in R console:\n")
-cat("  qa_check_random_row()\n")
-cat("\n")
-
-
-# Save the modified dataframe back to CSV
-# na="" writes NA values as empty cells instead of "NA" text
-# COMMENTED OUT FOR REVIEW - Uncomment when ready to save
-# write.csv(df, "eohi2.csv", row.names = FALSE, na = "")
-
-cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n")
-cat("Review the output above, then uncomment line 253 to save changes.\n")
-cat("\nProcessing complete! 
AOT and CRT scales calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP - time interval differences_20251001130451.r b/.history/eohi2/dataP - time interval differences_20251001130451.r deleted file mode 100644 index 0fc36d3..0000000 --- a/.history/eohi2/dataP - time interval differences_20251001130451.r +++ /dev/null @@ -1,250 +0,0 @@ -# Script to calculate absolute differences between time intervals in eohi2.csv -# Compares present vs past/future, and 5-year vs 10-year intervals - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the 15 item names (same order for all time periods) -items <- c( - "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Note: present uses lowercase "tv", others use uppercase "TV" -items_present <- c( - "pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Define all source columns (75 total) -source_cols <- c( - paste0("present_", items_present), - paste0("past_5_", items), - paste0("past_10_", items), - paste0("fut_5_", items), - paste0("fut_10_", items) -) - -# Define all target columns (90 total = 6 calculation types × 15 items) -target_NPast_5 <- paste0("NPast_5_", items) -target_NPast_10 <- paste0("NPast_10_", items) -target_NFut_5 <- paste0("NFut_5_", items) -target_NFut_10 <- paste0("NFut_10_", items) -target_5_10past <- paste0("5.10past_", items) -target_5_10fut <- paste0("5.10fut_", items) - -target_cols <- c( - target_NPast_5, - target_NPast_10, - target_NFut_5, - target_NFut_10, - target_5_10past, - target_5_10fut -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 75 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0 && length(missing_source) <= 10) { - cat("\n Missing source columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} else if (length(missing_source) > 10) { - cat("\n Too many missing to list individually (", length(missing_source), "missing)\n") -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 30) { - stop("ERROR: Too many source columns missing! 
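The six near-identical calculation blocks below follow one pattern; for reference, a table-driven sketch that produces the same 90 columns, reusing the script's calc_abs_diff() and handling the tv/TV casing via items_present:

pairs <- list(
  NPast_5    = c("present_", "past_5_"),
  NPast_10   = c("present_", "past_10_"),
  NFut_5     = c("present_", "fut_5_"),
  NFut_10    = c("present_", "fut_10_"),
  "5.10past" = c("past_5_",  "past_10_"),
  "5.10fut"  = c("fut_5_",   "fut_10_")
)
for (nm in names(pairs)) {
  p    <- pairs[[nm]]
  left <- if (p[1] == "present_") items_present else items
  for (i in seq_along(items)) {
    df[[paste0(nm, "_", items[i])]] <-
      calc_abs_diff(paste0(p[1], left[i]), paste0(p[2], items[i]))
  }
}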
Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= CALCULATE DIFFERENCES ============= -cat("Calculating time interval differences...\n") - -# Convert source columns to numeric -for (col in source_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Helper function to calculate absolute difference -calc_abs_diff <- function(col1, col2) { - val1 <- if (col1 %in% names(df)) df[[col1]] else NA - val2 <- if (col2 %in% names(df)) df[[col2]] else NA - abs(val1 - val2) -} - -# Calculate NPast_5: |present - past_5| -cat(" Calculating NPast_5 differences (present vs past 5 years)...\n") -for (i in 1:15) { - target <- target_NPast_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NPast_10: |present - past_10| -cat(" Calculating NPast_10 differences (present vs past 10 years)...\n") -for (i in 1:15) { - target <- target_NPast_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_5: |present - fut_5| -cat(" Calculating NFut_5 differences (present vs future 5 years)...\n") -for (i in 1:15) { - target <- target_NFut_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_10: |present - fut_10| -cat(" Calculating NFut_10 differences (present vs future 10 years)...\n") -for (i in 1:15) { - target <- target_NFut_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10past: |past_5 - past_10| -cat(" Calculating 5.10past differences (past 5 vs past 10 years)...\n") -for (i in 1:15) { - target <- target_5_10past[i] - source1 <- paste0("past_5_", items[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10fut: |fut_5 - fut_10| -cat(" Calculating 5.10fut differences (future 5 vs future 10 years)...\n") -for (i in 1:15) { - target <- target_5_10fut[i] - source1 <- paste0("fut_5_", items[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -cat("\n=== CALCULATION COMPLETE ===\n") -cat(" 90 difference columns created.\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # Sample one calculation from each type (item 1: pref_read) - test_item_idx <- 1 - - calculations <- list( - list(name = "NPast_5", target = target_NPast_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_5_", items[test_item_idx]), - desc = "|present - past_5|"), - list(name = "NPast_10", target = target_NPast_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|present - past_10|"), - list(name = "NFut_5", target = target_NFut_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), 
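One limitation of qa_check_random_row() as written is that a run can be neither reproduced nor targeted at a specific respondent; a sketch of a parameterized variant (the qa_check_row name and seed value are hypothetical):

qa_check_row <- function(row = sample(nrow(df), 1)) {
  # body identical to qa_check_random_row(), with `row` in place of random_row
  cat("QA check for row", row, "\n")
}
set.seed(123)    # hypothetical seed: makes the "random" audit repeatable
qa_check_row()   # reproducible random row
qa_check_row(17) # re-audit a specific respondent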
- source2 = paste0("fut_5_", items[test_item_idx]), - desc = "|present - fut_5|"), - list(name = "NFut_10", target = target_NFut_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|present - fut_10|"), - list(name = "5.10past", target = target_5_10past[test_item_idx], - source1 = paste0("past_5_", items[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|past_5 - past_10|"), - list(name = "5.10fut", target = target_5_10fut[test_item_idx], - source1 = paste0("fut_5_", items[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|fut_5 - fut_10|") - ) - - cat("Checking sample item: pref_read\n\n") - - for (calc in calculations) { - cat(sprintf("--- %s ---\n", calc$name)) - cat(sprintf("Formula: %s\n", calc$desc)) - - val1 <- if (calc$source1 %in% names(df)) df[random_row, calc$source1] else NA - val2 <- if (calc$source2 %in% names(df)) df[random_row, calc$source2] else NA - target_val <- df[random_row, calc$target] - - cat(sprintf(" %s: %s\n", calc$source1, ifelse(is.na(val1), "NA", as.character(val1)))) - cat(sprintf(" %s: %s\n", calc$source2, ifelse(is.na(val2), "NA", as.character(val2)))) - - if (!is.na(val1) && !is.na(val2)) { - expected_diff <- abs(val1 - val2) - cat(sprintf("\n Calculation: |%.5f - %.5f| = %.5f\n", val1, val2, expected_diff)) - cat(sprintf(" Target (%s): %.5f\n", calc$target, target_val)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_diff - target_val) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" Cannot calculate (missing values)\n") - } - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -# write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 243 to save changes.\n") -cat("\nProcessing complete! 
90 difference columns calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP - time interval differences_20251001130503.r b/.history/eohi2/dataP - time interval differences_20251001130503.r deleted file mode 100644 index 0fc36d3..0000000 --- a/.history/eohi2/dataP - time interval differences_20251001130503.r +++ /dev/null @@ -1,250 +0,0 @@ -# Script to calculate absolute differences between time intervals in eohi2.csv -# Compares present vs past/future, and 5-year vs 10-year intervals - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the 15 item names (same order for all time periods) -items <- c( - "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Note: present uses lowercase "tv", others use uppercase "TV" -items_present <- c( - "pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Define all source columns (75 total) -source_cols <- c( - paste0("present_", items_present), - paste0("past_5_", items), - paste0("past_10_", items), - paste0("fut_5_", items), - paste0("fut_10_", items) -) - -# Define all target columns (90 total = 6 calculation types × 15 items) -target_NPast_5 <- paste0("NPast_5_", items) -target_NPast_10 <- paste0("NPast_10_", items) -target_NFut_5 <- paste0("NFut_5_", items) -target_NFut_10 <- paste0("NFut_10_", items) -target_5_10past <- paste0("5.10past_", items) -target_5_10fut <- paste0("5.10fut_", items) - -target_cols <- c( - target_NPast_5, - target_NPast_10, - target_NFut_5, - target_NFut_10, - target_5_10past, - target_5_10fut -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 75 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0 && length(missing_source) <= 10) { - cat("\n Missing source columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} else if (length(missing_source) > 10) { - cat("\n Too many missing to list individually (", length(missing_source), "missing)\n") -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 30) { - stop("ERROR: Too many source columns missing! 
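The parallel items / items_present vectors exist only because one column was exported as present_pref_tv rather than present_pref_TV; renaming that column once is an alternative (a sketch, assuming the lowercase spelling is present in the file):

if ("present_pref_tv" %in% names(df)) {
  names(df)[names(df) == "present_pref_tv"] <- "present_pref_TV"
}
# after this, paste0("present_", items) covers all 15 items and items_present is unnecessary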
Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= CALCULATE DIFFERENCES ============= -cat("Calculating time interval differences...\n") - -# Convert source columns to numeric -for (col in source_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Helper function to calculate absolute difference -calc_abs_diff <- function(col1, col2) { - val1 <- if (col1 %in% names(df)) df[[col1]] else NA - val2 <- if (col2 %in% names(df)) df[[col2]] else NA - abs(val1 - val2) -} - -# Calculate NPast_5: |present - past_5| -cat(" Calculating NPast_5 differences (present vs past 5 years)...\n") -for (i in 1:15) { - target <- target_NPast_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NPast_10: |present - past_10| -cat(" Calculating NPast_10 differences (present vs past 10 years)...\n") -for (i in 1:15) { - target <- target_NPast_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_5: |present - fut_5| -cat(" Calculating NFut_5 differences (present vs future 5 years)...\n") -for (i in 1:15) { - target <- target_NFut_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_10: |present - fut_10| -cat(" Calculating NFut_10 differences (present vs future 10 years)...\n") -for (i in 1:15) { - target <- target_NFut_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10past: |past_5 - past_10| -cat(" Calculating 5.10past differences (past 5 vs past 10 years)...\n") -for (i in 1:15) { - target <- target_5_10past[i] - source1 <- paste0("past_5_", items[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10fut: |fut_5 - fut_10| -cat(" Calculating 5.10fut differences (future 5 vs future 10 years)...\n") -for (i in 1:15) { - target <- target_5_10fut[i] - source1 <- paste0("fut_5_", items[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -cat("\n=== CALCULATION COMPLETE ===\n") -cat(" 90 difference columns created.\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # Sample one calculation from each type (item 1: pref_read) - test_item_idx <- 1 - - calculations <- list( - list(name = "NPast_5", target = target_NPast_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_5_", items[test_item_idx]), - desc = "|present - past_5|"), - list(name = "NPast_10", target = target_NPast_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|present - past_10|"), - list(name = "NFut_5", target = target_NFut_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), 
- source2 = paste0("fut_5_", items[test_item_idx]), - desc = "|present - fut_5|"), - list(name = "NFut_10", target = target_NFut_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|present - fut_10|"), - list(name = "5.10past", target = target_5_10past[test_item_idx], - source1 = paste0("past_5_", items[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|past_5 - past_10|"), - list(name = "5.10fut", target = target_5_10fut[test_item_idx], - source1 = paste0("fut_5_", items[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|fut_5 - fut_10|") - ) - - cat("Checking sample item: pref_read\n\n") - - for (calc in calculations) { - cat(sprintf("--- %s ---\n", calc$name)) - cat(sprintf("Formula: %s\n", calc$desc)) - - val1 <- if (calc$source1 %in% names(df)) df[random_row, calc$source1] else NA - val2 <- if (calc$source2 %in% names(df)) df[random_row, calc$source2] else NA - target_val <- df[random_row, calc$target] - - cat(sprintf(" %s: %s\n", calc$source1, ifelse(is.na(val1), "NA", as.character(val1)))) - cat(sprintf(" %s: %s\n", calc$source2, ifelse(is.na(val2), "NA", as.character(val2)))) - - if (!is.na(val1) && !is.na(val2)) { - expected_diff <- abs(val1 - val2) - cat(sprintf("\n Calculation: |%.5f - %.5f| = %.5f\n", val1, val2, expected_diff)) - cat(sprintf(" Target (%s): %.5f\n", calc$target, target_val)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_diff - target_val) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" Cannot calculate (missing values)\n") - } - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -# write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 243 to save changes.\n") -cat("\nProcessing complete! 
90 difference columns calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP - time interval differences_20251001130516.r b/.history/eohi2/dataP - time interval differences_20251001130516.r deleted file mode 100644 index 0fc36d3..0000000 --- a/.history/eohi2/dataP - time interval differences_20251001130516.r +++ /dev/null @@ -1,250 +0,0 @@ -# Script to calculate absolute differences between time intervals in eohi2.csv -# Compares present vs past/future, and 5-year vs 10-year intervals - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the 15 item names (same order for all time periods) -items <- c( - "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Note: present uses lowercase "tv", others use uppercase "TV" -items_present <- c( - "pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Define all source columns (75 total) -source_cols <- c( - paste0("present_", items_present), - paste0("past_5_", items), - paste0("past_10_", items), - paste0("fut_5_", items), - paste0("fut_10_", items) -) - -# Define all target columns (90 total = 6 calculation types × 15 items) -target_NPast_5 <- paste0("NPast_5_", items) -target_NPast_10 <- paste0("NPast_10_", items) -target_NFut_5 <- paste0("NFut_5_", items) -target_NFut_10 <- paste0("NFut_10_", items) -target_5_10past <- paste0("5.10past_", items) -target_5_10fut <- paste0("5.10fut_", items) - -target_cols <- c( - target_NPast_5, - target_NPast_10, - target_NFut_5, - target_NFut_10, - target_5_10past, - target_5_10fut -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 75 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0 && length(missing_source) <= 10) { - cat("\n Missing source columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} else if (length(missing_source) > 10) { - cat("\n Too many missing to list individually (", length(missing_source), "missing)\n") -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 30) { - stop("ERROR: Too many source columns missing! 
Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= CALCULATE DIFFERENCES ============= -cat("Calculating time interval differences...\n") - -# Convert source columns to numeric -for (col in source_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Helper function to calculate absolute difference -calc_abs_diff <- function(col1, col2) { - val1 <- if (col1 %in% names(df)) df[[col1]] else NA - val2 <- if (col2 %in% names(df)) df[[col2]] else NA - abs(val1 - val2) -} - -# Calculate NPast_5: |present - past_5| -cat(" Calculating NPast_5 differences (present vs past 5 years)...\n") -for (i in 1:15) { - target <- target_NPast_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NPast_10: |present - past_10| -cat(" Calculating NPast_10 differences (present vs past 10 years)...\n") -for (i in 1:15) { - target <- target_NPast_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_5: |present - fut_5| -cat(" Calculating NFut_5 differences (present vs future 5 years)...\n") -for (i in 1:15) { - target <- target_NFut_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_10: |present - fut_10| -cat(" Calculating NFut_10 differences (present vs future 10 years)...\n") -for (i in 1:15) { - target <- target_NFut_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10past: |past_5 - past_10| -cat(" Calculating 5.10past differences (past 5 vs past 10 years)...\n") -for (i in 1:15) { - target <- target_5_10past[i] - source1 <- paste0("past_5_", items[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10fut: |fut_5 - fut_10| -cat(" Calculating 5.10fut differences (future 5 vs future 10 years)...\n") -for (i in 1:15) { - target <- target_5_10fut[i] - source1 <- paste0("fut_5_", items[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -cat("\n=== CALCULATION COMPLETE ===\n") -cat(" 90 difference columns created.\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # Sample one calculation from each type (item 1: pref_read) - test_item_idx <- 1 - - calculations <- list( - list(name = "NPast_5", target = target_NPast_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_5_", items[test_item_idx]), - desc = "|present - past_5|"), - list(name = "NPast_10", target = target_NPast_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|present - past_10|"), - list(name = "NFut_5", target = target_NFut_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), 
- source2 = paste0("fut_5_", items[test_item_idx]), - desc = "|present - fut_5|"), - list(name = "NFut_10", target = target_NFut_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|present - fut_10|"), - list(name = "5.10past", target = target_5_10past[test_item_idx], - source1 = paste0("past_5_", items[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|past_5 - past_10|"), - list(name = "5.10fut", target = target_5_10fut[test_item_idx], - source1 = paste0("fut_5_", items[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|fut_5 - fut_10|") - ) - - cat("Checking sample item: pref_read\n\n") - - for (calc in calculations) { - cat(sprintf("--- %s ---\n", calc$name)) - cat(sprintf("Formula: %s\n", calc$desc)) - - val1 <- if (calc$source1 %in% names(df)) df[random_row, calc$source1] else NA - val2 <- if (calc$source2 %in% names(df)) df[random_row, calc$source2] else NA - target_val <- df[random_row, calc$target] - - cat(sprintf(" %s: %s\n", calc$source1, ifelse(is.na(val1), "NA", as.character(val1)))) - cat(sprintf(" %s: %s\n", calc$source2, ifelse(is.na(val2), "NA", as.character(val2)))) - - if (!is.na(val1) && !is.na(val2)) { - expected_diff <- abs(val1 - val2) - cat(sprintf("\n Calculation: |%.5f - %.5f| = %.5f\n", val1, val2, expected_diff)) - cat(sprintf(" Target (%s): %.5f\n", calc$target, target_val)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_diff - target_val) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" Cannot calculate (missing values)\n") - } - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -# write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 243 to save changes.\n") -cat("\nProcessing complete! 
90 difference columns calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP - time interval differences_20251001130613.r b/.history/eohi2/dataP - time interval differences_20251001130613.r deleted file mode 100644 index b359aa9..0000000 --- a/.history/eohi2/dataP - time interval differences_20251001130613.r +++ /dev/null @@ -1,268 +0,0 @@ -# Script to calculate absolute differences between time intervals in eohi2.csv -# Compares present vs past/future, and 5-year vs 10-year intervals - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the 15 item names (same order for all time periods) -items <- c( - "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Note: present uses lowercase "tv", others use uppercase "TV" -items_present <- c( - "pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Define all source columns (75 total) -source_cols <- c( - paste0("present_", items_present), - paste0("past_5_", items), - paste0("past_10_", items), - paste0("fut_5_", items), - paste0("fut_10_", items) -) - -# Define all target columns (90 total = 6 calculation types × 15 items) -target_NPast_5 <- paste0("NPast_5_", items) -target_NPast_10 <- paste0("NPast_10_", items) -target_NFut_5 <- paste0("NFut_5_", items) -target_NFut_10 <- paste0("NFut_10_", items) -target_5_10past <- paste0("5.10past_", items) -target_5_10fut <- paste0("5.10fut_", items) - -target_cols <- c( - target_NPast_5, - target_NPast_10, - target_NFut_5, - target_NFut_10, - target_5_10past, - target_5_10fut -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 75 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0 && length(missing_source) <= 10) { - cat("\n Missing source columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} else if (length(missing_source) > 10) { - cat("\n Too many missing to list individually (", length(missing_source), "missing)\n") -} - -# Check Target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 90 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Target columns do NOT exist yet - will create them.\n") - if (length(existing_targets) > 0) { - cat(" WARNING: Some target columns already exist and will be overwritten.\n") - } -} else { - cat(" 
All target columns exist - will overwrite with calculated values.\n") -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 30) { - stop("ERROR: Too many source columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= CALCULATE DIFFERENCES ============= -cat("Calculating time interval differences...\n") - -# Convert source columns to numeric -for (col in source_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Helper function to calculate absolute difference -calc_abs_diff <- function(col1, col2) { - val1 <- if (col1 %in% names(df)) df[[col1]] else NA - val2 <- if (col2 %in% names(df)) df[[col2]] else NA - abs(val1 - val2) -} - -# Calculate NPast_5: |present - past_5| -cat(" Calculating NPast_5 differences (present vs past 5 years)...\n") -for (i in 1:15) { - target <- target_NPast_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NPast_10: |present - past_10| -cat(" Calculating NPast_10 differences (present vs past 10 years)...\n") -for (i in 1:15) { - target <- target_NPast_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_5: |present - fut_5| -cat(" Calculating NFut_5 differences (present vs future 5 years)...\n") -for (i in 1:15) { - target <- target_NFut_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_10: |present - fut_10| -cat(" Calculating NFut_10 differences (present vs future 10 years)...\n") -for (i in 1:15) { - target <- target_NFut_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10past: |past_5 - past_10| -cat(" Calculating 5.10past differences (past 5 vs past 10 years)...\n") -for (i in 1:15) { - target <- target_5_10past[i] - source1 <- paste0("past_5_", items[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10fut: |fut_5 - fut_10| -cat(" Calculating 5.10fut differences (future 5 vs future 10 years)...\n") -for (i in 1:15) { - target <- target_5_10fut[i] - source1 <- paste0("fut_5_", items[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -cat("\n=== CALCULATION COMPLETE ===\n") -cat(" 90 difference columns created.\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # Sample one calculation from each type (item 1: pref_read) - test_item_idx <- 1 - - calculations <- list( - list(name = "NPast_5", target = target_NPast_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_5_", items[test_item_idx]), - desc = "|present - past_5|"), - list(name = "NPast_10", target = target_NPast_10[test_item_idx], - source1 = paste0("present_", 
items_present[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|present - past_10|"), - list(name = "NFut_5", target = target_NFut_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_5_", items[test_item_idx]), - desc = "|present - fut_5|"), - list(name = "NFut_10", target = target_NFut_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|present - fut_10|"), - list(name = "5.10past", target = target_5_10past[test_item_idx], - source1 = paste0("past_5_", items[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|past_5 - past_10|"), - list(name = "5.10fut", target = target_5_10fut[test_item_idx], - source1 = paste0("fut_5_", items[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|fut_5 - fut_10|") - ) - - cat("Checking sample item: pref_read\n\n") - - for (calc in calculations) { - cat(sprintf("--- %s ---\n", calc$name)) - cat(sprintf("Formula: %s\n", calc$desc)) - - val1 <- if (calc$source1 %in% names(df)) df[random_row, calc$source1] else NA - val2 <- if (calc$source2 %in% names(df)) df[random_row, calc$source2] else NA - target_val <- df[random_row, calc$target] - - cat(sprintf(" %s: %s\n", calc$source1, ifelse(is.na(val1), "NA", as.character(val1)))) - cat(sprintf(" %s: %s\n", calc$source2, ifelse(is.na(val2), "NA", as.character(val2)))) - - if (!is.na(val1) && !is.na(val2)) { - expected_diff <- abs(val1 - val2) - cat(sprintf("\n Calculation: |%.5f - %.5f| = %.5f\n", val1, val2, expected_diff)) - cat(sprintf(" Target (%s): %.5f\n", calc$target, target_val)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_diff - target_val) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" Cannot calculate (missing values)\n") - } - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -# write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 243 to save changes.\n") -cat("\nProcessing complete! 
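Since the commented-out write.csv() would overwrite the raw eohi2.csv in place, one lower-risk option is writing the scored data to a separate file (the eohi2_scored.csv name is hypothetical, not used by the original scripts):

out_file <- "eohi2_scored.csv"                       # hypothetical output name
write.csv(df, out_file, row.names = FALSE, na = "")  # na = "" keeps blanks blank, as above
cat("Saved scored data to", out_file, "\n")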
90 difference columns calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP - time interval differences_20251001130619.r b/.history/eohi2/dataP - time interval differences_20251001130619.r deleted file mode 100644 index b359aa9..0000000 --- a/.history/eohi2/dataP - time interval differences_20251001130619.r +++ /dev/null @@ -1,268 +0,0 @@ -# Script to calculate absolute differences between time intervals in eohi2.csv -# Compares present vs past/future, and 5-year vs 10-year intervals - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the 15 item names (same order for all time periods) -items <- c( - "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Note: present uses lowercase "tv", others use uppercase "TV" -items_present <- c( - "pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Define all source columns (75 total) -source_cols <- c( - paste0("present_", items_present), - paste0("past_5_", items), - paste0("past_10_", items), - paste0("fut_5_", items), - paste0("fut_10_", items) -) - -# Define all target columns (90 total = 6 calculation types × 15 items) -target_NPast_5 <- paste0("NPast_5_", items) -target_NPast_10 <- paste0("NPast_10_", items) -target_NFut_5 <- paste0("NFut_5_", items) -target_NFut_10 <- paste0("NFut_10_", items) -target_5_10past <- paste0("5.10past_", items) -target_5_10fut <- paste0("5.10fut_", items) - -target_cols <- c( - target_NPast_5, - target_NPast_10, - target_NFut_5, - target_NFut_10, - target_5_10past, - target_5_10fut -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 75 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0 && length(missing_source) <= 10) { - cat("\n Missing source columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} else if (length(missing_source) > 10) { - cat("\n Too many missing to list individually (", length(missing_source), "missing)\n") -} - -# Check Target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 90 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Target columns do NOT exist yet - will create them.\n") - if (length(existing_targets) > 0) { - cat(" WARNING: Some target columns already exist and will be overwritten.\n") - } -} else { - cat(" 
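# A minimal sketch, not part of the original script: the paired %in%
# subscripting in the column-existence check above is equivalent to base
# R's set operations, which state the intent directly. source_cols and
# df_cols are assumed from the script.
missing_source  <- setdiff(source_cols, df_cols)    # expected but absent
existing_source <- intersect(source_cols, df_cols)  # expected and present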
All target columns exist - will overwrite with calculated values.\n") -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 30) { - stop("ERROR: Too many source columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= CALCULATE DIFFERENCES ============= -cat("Calculating time interval differences...\n") - -# Convert source columns to numeric -for (col in source_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Helper function to calculate absolute difference -calc_abs_diff <- function(col1, col2) { - val1 <- if (col1 %in% names(df)) df[[col1]] else NA - val2 <- if (col2 %in% names(df)) df[[col2]] else NA - abs(val1 - val2) -} - -# Calculate NPast_5: |present - past_5| -cat(" Calculating NPast_5 differences (present vs past 5 years)...\n") -for (i in 1:15) { - target <- target_NPast_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NPast_10: |present - past_10| -cat(" Calculating NPast_10 differences (present vs past 10 years)...\n") -for (i in 1:15) { - target <- target_NPast_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_5: |present - fut_5| -cat(" Calculating NFut_5 differences (present vs future 5 years)...\n") -for (i in 1:15) { - target <- target_NFut_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_10: |present - fut_10| -cat(" Calculating NFut_10 differences (present vs future 10 years)...\n") -for (i in 1:15) { - target <- target_NFut_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10past: |past_5 - past_10| -cat(" Calculating 5.10past differences (past 5 vs past 10 years)...\n") -for (i in 1:15) { - target <- target_5_10past[i] - source1 <- paste0("past_5_", items[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10fut: |fut_5 - fut_10| -cat(" Calculating 5.10fut differences (future 5 vs future 10 years)...\n") -for (i in 1:15) { - target <- target_5_10fut[i] - source1 <- paste0("fut_5_", items[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -cat("\n=== CALCULATION COMPLETE ===\n") -cat(" 90 difference columns created.\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # Sample one calculation from each type (item 1: pref_read) - test_item_idx <- 1 - - calculations <- list( - list(name = "NPast_5", target = target_NPast_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_5_", items[test_item_idx]), - desc = "|present - past_5|"), - list(name = "NPast_10", target = target_NPast_10[test_item_idx], - source1 = paste0("present_", 
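# A minimal sketch, not part of the original script: when a column is
# missing, calc_abs_diff() above yields a single NA and relies on R's
# recycling when the result is assigned into df; returning a full-length
# NA vector makes that behaviour explicit.
calc_abs_diff_safe <- function(col1, col2) {
  val1 <- if (col1 %in% names(df)) df[[col1]] else rep(NA_real_, nrow(df))
  val2 <- if (col2 %in% names(df)) df[[col2]] else rep(NA_real_, nrow(df))
  abs(val1 - val2)
}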
items_present[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|present - past_10|"), - list(name = "NFut_5", target = target_NFut_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_5_", items[test_item_idx]), - desc = "|present - fut_5|"), - list(name = "NFut_10", target = target_NFut_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|present - fut_10|"), - list(name = "5.10past", target = target_5_10past[test_item_idx], - source1 = paste0("past_5_", items[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|past_5 - past_10|"), - list(name = "5.10fut", target = target_5_10fut[test_item_idx], - source1 = paste0("fut_5_", items[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|fut_5 - fut_10|") - ) - - cat("Checking sample item: pref_read\n\n") - - for (calc in calculations) { - cat(sprintf("--- %s ---\n", calc$name)) - cat(sprintf("Formula: %s\n", calc$desc)) - - val1 <- if (calc$source1 %in% names(df)) df[random_row, calc$source1] else NA - val2 <- if (calc$source2 %in% names(df)) df[random_row, calc$source2] else NA - target_val <- df[random_row, calc$target] - - cat(sprintf(" %s: %s\n", calc$source1, ifelse(is.na(val1), "NA", as.character(val1)))) - cat(sprintf(" %s: %s\n", calc$source2, ifelse(is.na(val2), "NA", as.character(val2)))) - - if (!is.na(val1) && !is.na(val2)) { - expected_diff <- abs(val1 - val2) - cat(sprintf("\n Calculation: |%.5f - %.5f| = %.5f\n", val1, val2, expected_diff)) - cat(sprintf(" Target (%s): %.5f\n", calc$target, target_val)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_diff - target_val) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" Cannot calculate (missing values)\n") - } - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -# write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 243 to save changes.\n") -cat("\nProcessing complete! 
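# A minimal sketch, not part of the original script: na = "" writes NA as
# blank cells, as the save comment above notes, but on a later read those
# blanks only come back as NA if na.strings includes "". The output file
# name is illustrative.
write.csv(df, "eohi2_out.csv", row.names = FALSE, na = "")
df2 <- read.csv("eohi2_out.csv", stringsAsFactors = FALSE,
                na.strings = c("", "NA"))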
90 difference columns calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP - time interval differences_20251001130649.r b/.history/eohi2/dataP - time interval differences_20251001130649.r deleted file mode 100644 index b359aa9..0000000 --- a/.history/eohi2/dataP - time interval differences_20251001130649.r +++ /dev/null @@ -1,268 +0,0 @@ -# Script to calculate absolute differences between time intervals in eohi2.csv -# Compares present vs past/future, and 5-year vs 10-year intervals - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the 15 item names (same order for all time periods) -items <- c( - "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Note: present uses lowercase "tv", others use uppercase "TV" -items_present <- c( - "pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Define all source columns (75 total) -source_cols <- c( - paste0("present_", items_present), - paste0("past_5_", items), - paste0("past_10_", items), - paste0("fut_5_", items), - paste0("fut_10_", items) -) - -# Define all target columns (90 total = 6 calculation types × 15 items) -target_NPast_5 <- paste0("NPast_5_", items) -target_NPast_10 <- paste0("NPast_10_", items) -target_NFut_5 <- paste0("NFut_5_", items) -target_NFut_10 <- paste0("NFut_10_", items) -target_5_10past <- paste0("5.10past_", items) -target_5_10fut <- paste0("5.10fut_", items) - -target_cols <- c( - target_NPast_5, - target_NPast_10, - target_NFut_5, - target_NFut_10, - target_5_10past, - target_5_10fut -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 75 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0 && length(missing_source) <= 10) { - cat("\n Missing source columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} else if (length(missing_source) > 10) { - cat("\n Too many missing to list individually (", length(missing_source), "missing)\n") -} - -# Check Target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 90 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Target columns do NOT exist yet - will create them.\n") - if (length(existing_targets) > 0) { - cat(" WARNING: Some target columns already exist and will be overwritten.\n") - } -} else { - cat(" 
All target columns exist - will overwrite with calculated values.\n") -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 30) { - stop("ERROR: Too many source columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= CALCULATE DIFFERENCES ============= -cat("Calculating time interval differences...\n") - -# Convert source columns to numeric -for (col in source_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Helper function to calculate absolute difference -calc_abs_diff <- function(col1, col2) { - val1 <- if (col1 %in% names(df)) df[[col1]] else NA - val2 <- if (col2 %in% names(df)) df[[col2]] else NA - abs(val1 - val2) -} - -# Calculate NPast_5: |present - past_5| -cat(" Calculating NPast_5 differences (present vs past 5 years)...\n") -for (i in 1:15) { - target <- target_NPast_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NPast_10: |present - past_10| -cat(" Calculating NPast_10 differences (present vs past 10 years)...\n") -for (i in 1:15) { - target <- target_NPast_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_5: |present - fut_5| -cat(" Calculating NFut_5 differences (present vs future 5 years)...\n") -for (i in 1:15) { - target <- target_NFut_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_10: |present - fut_10| -cat(" Calculating NFut_10 differences (present vs future 10 years)...\n") -for (i in 1:15) { - target <- target_NFut_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10past: |past_5 - past_10| -cat(" Calculating 5.10past differences (past 5 vs past 10 years)...\n") -for (i in 1:15) { - target <- target_5_10past[i] - source1 <- paste0("past_5_", items[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10fut: |fut_5 - fut_10| -cat(" Calculating 5.10fut differences (future 5 vs future 10 years)...\n") -for (i in 1:15) { - target <- target_5_10fut[i] - source1 <- paste0("fut_5_", items[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -cat("\n=== CALCULATION COMPLETE ===\n") -cat(" 90 difference columns created.\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # Sample one calculation from each type (item 1: pref_read) - test_item_idx <- 1 - - calculations <- list( - list(name = "NPast_5", target = target_NPast_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_5_", items[test_item_idx]), - desc = "|present - past_5|"), - list(name = "NPast_10", target = target_NPast_10[test_item_idx], - source1 = paste0("present_", 
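# A minimal sketch, not part of the original script: because the file is
# read with na.strings = NULL, empty cells arrive as "" and only become NA
# during the as.numeric() conversion above. Counting NAs before and after
# the coercion reveals how many cells were non-numeric.
for (col in intersect(source_cols, names(df))) {
  before <- sum(is.na(df[[col]]))
  df[[col]] <- suppressWarnings(as.numeric(df[[col]]))
  introduced <- sum(is.na(df[[col]])) - before
  if (introduced > 0) cat(" ", col, ":", introduced, "cells coerced to NA\n")
}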
items_present[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|present - past_10|"), - list(name = "NFut_5", target = target_NFut_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_5_", items[test_item_idx]), - desc = "|present - fut_5|"), - list(name = "NFut_10", target = target_NFut_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|present - fut_10|"), - list(name = "5.10past", target = target_5_10past[test_item_idx], - source1 = paste0("past_5_", items[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|past_5 - past_10|"), - list(name = "5.10fut", target = target_5_10fut[test_item_idx], - source1 = paste0("fut_5_", items[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|fut_5 - fut_10|") - ) - - cat("Checking sample item: pref_read\n\n") - - for (calc in calculations) { - cat(sprintf("--- %s ---\n", calc$name)) - cat(sprintf("Formula: %s\n", calc$desc)) - - val1 <- if (calc$source1 %in% names(df)) df[random_row, calc$source1] else NA - val2 <- if (calc$source2 %in% names(df)) df[random_row, calc$source2] else NA - target_val <- df[random_row, calc$target] - - cat(sprintf(" %s: %s\n", calc$source1, ifelse(is.na(val1), "NA", as.character(val1)))) - cat(sprintf(" %s: %s\n", calc$source2, ifelse(is.na(val2), "NA", as.character(val2)))) - - if (!is.na(val1) && !is.na(val2)) { - expected_diff <- abs(val1 - val2) - cat(sprintf("\n Calculation: |%.5f - %.5f| = %.5f\n", val1, val2, expected_diff)) - cat(sprintf(" Target (%s): %.5f\n", calc$target, target_val)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_diff - target_val) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" Cannot calculate (missing values)\n") - } - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -# write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 243 to save changes.\n") -cat("\nProcessing complete! 
90 difference columns calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP - time interval differences_20251001131102.r b/.history/eohi2/dataP - time interval differences_20251001131102.r deleted file mode 100644 index 6ed17f4..0000000 --- a/.history/eohi2/dataP - time interval differences_20251001131102.r +++ /dev/null @@ -1,278 +0,0 @@ -# Script to calculate absolute differences between time intervals in eohi2.csv -# Compares present vs past/future, and 5-year vs 10-year intervals - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the 15 item names (same order for all time periods) -items <- c( - "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Note: present uses lowercase "tv", others use uppercase "TV" -items_present <- c( - "pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Define all source columns (75 total) -source_cols <- c( - paste0("present_", items_present), - paste0("past_5_", items), - paste0("past_10_", items), - paste0("fut_5_", items), - paste0("fut_10_", items) -) - -# Define all target columns (90 total = 6 calculation types × 15 items) -target_NPast_5 <- paste0("NPast_5_", items) -target_NPast_10 <- paste0("NPast_10_", items) -target_NFut_5 <- paste0("NFut_5_", items) -target_NFut_10 <- paste0("NFut_10_", items) -target_5_10past <- paste0("5.10past_", items) -target_5_10fut <- paste0("5.10fut_", items) - -target_cols <- c( - target_NPast_5, - target_NPast_10, - target_NFut_5, - target_NFut_10, - target_5_10past, - target_5_10fut -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 75 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0 && length(missing_source) <= 10) { - cat("\n Missing source columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} else if (length(missing_source) > 10) { - cat("\n Too many missing to list individually (", length(missing_source), "missing)\n") -} - -# Check Target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 90 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Target columns do NOT exist yet - will create them.\n") - if (length(existing_targets) > 0) { - cat(" WARNING: Some target columns already exist and will be overwritten.\n") - } -} else { - cat(" 
All target columns exist - will overwrite with calculated values.\n") -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 30) { - stop("ERROR: Too many source columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= CALCULATE DIFFERENCES ============= -cat("Calculating time interval differences...\n") - -# Convert source columns to numeric -for (col in source_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Helper function to calculate absolute difference -calc_abs_diff <- function(col1, col2) { - val1 <- if (col1 %in% names(df)) df[[col1]] else NA - val2 <- if (col2 %in% names(df)) df[[col2]] else NA - abs(val1 - val2) -} - -# Calculate NPast_5: |present - past_5| -cat(" Calculating NPast_5 differences (present vs past 5 years)...\n") -for (i in 1:15) { - target <- target_NPast_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NPast_10: |present - past_10| -cat(" Calculating NPast_10 differences (present vs past 10 years)...\n") -for (i in 1:15) { - target <- target_NPast_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_5: |present - fut_5| -cat(" Calculating NFut_5 differences (present vs future 5 years)...\n") -for (i in 1:15) { - target <- target_NFut_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_10: |present - fut_10| -cat(" Calculating NFut_10 differences (present vs future 10 years)...\n") -for (i in 1:15) { - target <- target_NFut_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10past: |past_5 - past_10| -cat(" Calculating 5.10past differences (past 5 vs past 10 years)...\n") -for (i in 1:15) { - target <- target_5_10past[i] - source1 <- paste0("past_5_", items[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10fut: |fut_5 - fut_10| -cat(" Calculating 5.10fut differences (future 5 vs future 10 years)...\n") -for (i in 1:15) { - target <- target_5_10fut[i] - source1 <- paste0("fut_5_", items[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -cat("\n=== CALCULATION COMPLETE ===\n") -cat(" 90 difference columns created.\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function(row_num = NULL) { - # Pick a random row or use specified row - if (is.null(row_num)) { - random_row <- sample(1:nrow(df), 1) - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - } else { - if (row_num < 1 || row_num > nrow(df)) { - cat("ERROR: Row number must be between 1 and", nrow(df), "\n") - return() - } - random_row <- row_num - cat("\n========================================\n") - cat("QA CHECK: Specified Row #", random_row, "\n") - cat("========================================\n\n") - } - - # Sample one calculation from each type (item 
1: pref_read) - test_item_idx <- 1 - - calculations <- list( - list(name = "NPast_5", target = target_NPast_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_5_", items[test_item_idx]), - desc = "|present - past_5|"), - list(name = "NPast_10", target = target_NPast_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|present - past_10|"), - list(name = "NFut_5", target = target_NFut_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_5_", items[test_item_idx]), - desc = "|present - fut_5|"), - list(name = "NFut_10", target = target_NFut_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|present - fut_10|"), - list(name = "5.10past", target = target_5_10past[test_item_idx], - source1 = paste0("past_5_", items[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|past_5 - past_10|"), - list(name = "5.10fut", target = target_5_10fut[test_item_idx], - source1 = paste0("fut_5_", items[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|fut_5 - fut_10|") - ) - - cat("Checking sample item: pref_read\n\n") - - for (calc in calculations) { - cat(sprintf("--- %s ---\n", calc$name)) - cat(sprintf("Formula: %s\n", calc$desc)) - - val1 <- if (calc$source1 %in% names(df)) df[random_row, calc$source1] else NA - val2 <- if (calc$source2 %in% names(df)) df[random_row, calc$source2] else NA - target_val <- df[random_row, calc$target] - - cat(sprintf(" %s: %s\n", calc$source1, ifelse(is.na(val1), "NA", as.character(val1)))) - cat(sprintf(" %s: %s\n", calc$source2, ifelse(is.na(val2), "NA", as.character(val2)))) - - if (!is.na(val1) && !is.na(val2)) { - expected_diff <- abs(val1 - val2) - cat(sprintf("\n Calculation: |%.5f - %.5f| = %.5f\n", val1, val2, expected_diff)) - cat(sprintf(" Target (%s): %.5f\n", calc$target, target_val)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_diff - target_val) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" Cannot calculate (missing values)\n") - } - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -# write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 243 to save changes.\n") -cat("\nProcessing complete! 
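# A minimal sketch, not part of the original script: sample() draws from
# the session RNG, so each QA run inspects a different row. Fixing the
# seed (any value; 42 is illustrative) makes a spot-check repeatable, and
# this revision of the function also accepts an explicit row number.
set.seed(42)
qa_check_random_row()      # same "random" row on every scripted run
qa_check_random_row(118)   # or pin a specific row directly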
90 difference columns calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP - time interval differences_20251001131108.r b/.history/eohi2/dataP - time interval differences_20251001131108.r deleted file mode 100644 index 9a3c70d..0000000 --- a/.history/eohi2/dataP - time interval differences_20251001131108.r +++ /dev/null @@ -1,280 +0,0 @@ -# Script to calculate absolute differences between time intervals in eohi2.csv -# Compares present vs past/future, and 5-year vs 10-year intervals - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the 15 item names (same order for all time periods) -items <- c( - "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Note: present uses lowercase "tv", others use uppercase "TV" -items_present <- c( - "pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Define all source columns (75 total) -source_cols <- c( - paste0("present_", items_present), - paste0("past_5_", items), - paste0("past_10_", items), - paste0("fut_5_", items), - paste0("fut_10_", items) -) - -# Define all target columns (90 total = 6 calculation types × 15 items) -target_NPast_5 <- paste0("NPast_5_", items) -target_NPast_10 <- paste0("NPast_10_", items) -target_NFut_5 <- paste0("NFut_5_", items) -target_NFut_10 <- paste0("NFut_10_", items) -target_5_10past <- paste0("5.10past_", items) -target_5_10fut <- paste0("5.10fut_", items) - -target_cols <- c( - target_NPast_5, - target_NPast_10, - target_NFut_5, - target_NFut_10, - target_5_10past, - target_5_10fut -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 75 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0 && length(missing_source) <= 10) { - cat("\n Missing source columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} else if (length(missing_source) > 10) { - cat("\n Too many missing to list individually (", length(missing_source), "missing)\n") -} - -# Check Target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 90 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Target columns do NOT exist yet - will create them.\n") - if (length(existing_targets) > 0) { - cat(" WARNING: Some target columns already exist and will be overwritten.\n") - } -} else { - cat(" 
All target columns exist - will overwrite with calculated values.\n") -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 30) { - stop("ERROR: Too many source columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= CALCULATE DIFFERENCES ============= -cat("Calculating time interval differences...\n") - -# Convert source columns to numeric -for (col in source_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Helper function to calculate absolute difference -calc_abs_diff <- function(col1, col2) { - val1 <- if (col1 %in% names(df)) df[[col1]] else NA - val2 <- if (col2 %in% names(df)) df[[col2]] else NA - abs(val1 - val2) -} - -# Calculate NPast_5: |present - past_5| -cat(" Calculating NPast_5 differences (present vs past 5 years)...\n") -for (i in 1:15) { - target <- target_NPast_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NPast_10: |present - past_10| -cat(" Calculating NPast_10 differences (present vs past 10 years)...\n") -for (i in 1:15) { - target <- target_NPast_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_5: |present - fut_5| -cat(" Calculating NFut_5 differences (present vs future 5 years)...\n") -for (i in 1:15) { - target <- target_NFut_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_10: |present - fut_10| -cat(" Calculating NFut_10 differences (present vs future 10 years)...\n") -for (i in 1:15) { - target <- target_NFut_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10past: |past_5 - past_10| -cat(" Calculating 5.10past differences (past 5 vs past 10 years)...\n") -for (i in 1:15) { - target <- target_5_10past[i] - source1 <- paste0("past_5_", items[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10fut: |fut_5 - fut_10| -cat(" Calculating 5.10fut differences (future 5 vs future 10 years)...\n") -for (i in 1:15) { - target <- target_5_10fut[i] - source1 <- paste0("fut_5_", items[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -cat("\n=== CALCULATION COMPLETE ===\n") -cat(" 90 difference columns created.\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function(row_num = NULL) { - # Pick a random row or use specified row - if (is.null(row_num)) { - random_row <- sample(1:nrow(df), 1) - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - } else { - if (row_num < 1 || row_num > nrow(df)) { - cat("ERROR: Row number must be between 1 and", nrow(df), "\n") - return() - } - random_row <- row_num - cat("\n========================================\n") - cat("QA CHECK: Specified Row #", random_row, "\n") - cat("========================================\n\n") - } - - # Sample one calculation from each type (item 
1: pref_read) - test_item_idx <- 1 - - calculations <- list( - list(name = "NPast_5", target = target_NPast_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_5_", items[test_item_idx]), - desc = "|present - past_5|"), - list(name = "NPast_10", target = target_NPast_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|present - past_10|"), - list(name = "NFut_5", target = target_NFut_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_5_", items[test_item_idx]), - desc = "|present - fut_5|"), - list(name = "NFut_10", target = target_NFut_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|present - fut_10|"), - list(name = "5.10past", target = target_5_10past[test_item_idx], - source1 = paste0("past_5_", items[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|past_5 - past_10|"), - list(name = "5.10fut", target = target_5_10fut[test_item_idx], - source1 = paste0("fut_5_", items[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|fut_5 - fut_10|") - ) - - cat("Checking sample item: pref_read\n\n") - - for (calc in calculations) { - cat(sprintf("--- %s ---\n", calc$name)) - cat(sprintf("Formula: %s\n", calc$desc)) - - val1 <- if (calc$source1 %in% names(df)) df[random_row, calc$source1] else NA - val2 <- if (calc$source2 %in% names(df)) df[random_row, calc$source2] else NA - target_val <- df[random_row, calc$target] - - cat(sprintf(" %s: %s\n", calc$source1, ifelse(is.na(val1), "NA", as.character(val1)))) - cat(sprintf(" %s: %s\n", calc$source2, ifelse(is.na(val2), "NA", as.character(val2)))) - - if (!is.na(val1) && !is.na(val2)) { - expected_diff <- abs(val1 - val2) - cat(sprintf("\n Calculation: |%.5f - %.5f| = %.5f\n", val1, val2, expected_diff)) - cat(sprintf(" Target (%s): %.5f\n", calc$target, target_val)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_diff - target_val) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" Cannot calculate (missing values)\n") - } - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER ROW ***\n") -cat("For random row, run:\n") -cat(" qa_check_random_row()\n") -cat("\nFor specific row (e.g., row 118), run:\n") -cat(" qa_check_random_row(118)\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -# write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 243 to save changes.\n") -cat("\nProcessing complete! 
90 difference columns calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP - time interval differences_20251001131115.r b/.history/eohi2/dataP - time interval differences_20251001131115.r deleted file mode 100644 index 9a3c70d..0000000 --- a/.history/eohi2/dataP - time interval differences_20251001131115.r +++ /dev/null @@ -1,280 +0,0 @@ -# Script to calculate absolute differences between time intervals in eohi2.csv -# Compares present vs past/future, and 5-year vs 10-year intervals - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the 15 item names (same order for all time periods) -items <- c( - "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Note: present uses lowercase "tv", others use uppercase "TV" -items_present <- c( - "pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Define all source columns (75 total) -source_cols <- c( - paste0("present_", items_present), - paste0("past_5_", items), - paste0("past_10_", items), - paste0("fut_5_", items), - paste0("fut_10_", items) -) - -# Define all target columns (90 total = 6 calculation types × 15 items) -target_NPast_5 <- paste0("NPast_5_", items) -target_NPast_10 <- paste0("NPast_10_", items) -target_NFut_5 <- paste0("NFut_5_", items) -target_NFut_10 <- paste0("NFut_10_", items) -target_5_10past <- paste0("5.10past_", items) -target_5_10fut <- paste0("5.10fut_", items) - -target_cols <- c( - target_NPast_5, - target_NPast_10, - target_NFut_5, - target_NFut_10, - target_5_10past, - target_5_10fut -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 75 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0 && length(missing_source) <= 10) { - cat("\n Missing source columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} else if (length(missing_source) > 10) { - cat("\n Too many missing to list individually (", length(missing_source), "missing)\n") -} - -# Check Target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 90 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Target columns do NOT exist yet - will create them.\n") - if (length(existing_targets) > 0) { - cat(" WARNING: Some target columns already exist and will be overwritten.\n") - } -} else { - cat(" 
All target columns exist - will overwrite with calculated values.\n") -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 30) { - stop("ERROR: Too many source columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= CALCULATE DIFFERENCES ============= -cat("Calculating time interval differences...\n") - -# Convert source columns to numeric -for (col in source_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Helper function to calculate absolute difference -calc_abs_diff <- function(col1, col2) { - val1 <- if (col1 %in% names(df)) df[[col1]] else NA - val2 <- if (col2 %in% names(df)) df[[col2]] else NA - abs(val1 - val2) -} - -# Calculate NPast_5: |present - past_5| -cat(" Calculating NPast_5 differences (present vs past 5 years)...\n") -for (i in 1:15) { - target <- target_NPast_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NPast_10: |present - past_10| -cat(" Calculating NPast_10 differences (present vs past 10 years)...\n") -for (i in 1:15) { - target <- target_NPast_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_5: |present - fut_5| -cat(" Calculating NFut_5 differences (present vs future 5 years)...\n") -for (i in 1:15) { - target <- target_NFut_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_10: |present - fut_10| -cat(" Calculating NFut_10 differences (present vs future 10 years)...\n") -for (i in 1:15) { - target <- target_NFut_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10past: |past_5 - past_10| -cat(" Calculating 5.10past differences (past 5 vs past 10 years)...\n") -for (i in 1:15) { - target <- target_5_10past[i] - source1 <- paste0("past_5_", items[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10fut: |fut_5 - fut_10| -cat(" Calculating 5.10fut differences (future 5 vs future 10 years)...\n") -for (i in 1:15) { - target <- target_5_10fut[i] - source1 <- paste0("fut_5_", items[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -cat("\n=== CALCULATION COMPLETE ===\n") -cat(" 90 difference columns created.\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function(row_num = NULL) { - # Pick a random row or use specified row - if (is.null(row_num)) { - random_row <- sample(1:nrow(df), 1) - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - } else { - if (row_num < 1 || row_num > nrow(df)) { - cat("ERROR: Row number must be between 1 and", nrow(df), "\n") - return() - } - random_row <- row_num - cat("\n========================================\n") - cat("QA CHECK: Specified Row #", random_row, "\n") - cat("========================================\n\n") - } - - # Sample one calculation from each type (item 
1: pref_read) - test_item_idx <- 1 - - calculations <- list( - list(name = "NPast_5", target = target_NPast_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_5_", items[test_item_idx]), - desc = "|present - past_5|"), - list(name = "NPast_10", target = target_NPast_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|present - past_10|"), - list(name = "NFut_5", target = target_NFut_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_5_", items[test_item_idx]), - desc = "|present - fut_5|"), - list(name = "NFut_10", target = target_NFut_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|present - fut_10|"), - list(name = "5.10past", target = target_5_10past[test_item_idx], - source1 = paste0("past_5_", items[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|past_5 - past_10|"), - list(name = "5.10fut", target = target_5_10fut[test_item_idx], - source1 = paste0("fut_5_", items[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|fut_5 - fut_10|") - ) - - cat("Checking sample item: pref_read\n\n") - - for (calc in calculations) { - cat(sprintf("--- %s ---\n", calc$name)) - cat(sprintf("Formula: %s\n", calc$desc)) - - val1 <- if (calc$source1 %in% names(df)) df[random_row, calc$source1] else NA - val2 <- if (calc$source2 %in% names(df)) df[random_row, calc$source2] else NA - target_val <- df[random_row, calc$target] - - cat(sprintf(" %s: %s\n", calc$source1, ifelse(is.na(val1), "NA", as.character(val1)))) - cat(sprintf(" %s: %s\n", calc$source2, ifelse(is.na(val2), "NA", as.character(val2)))) - - if (!is.na(val1) && !is.na(val2)) { - expected_diff <- abs(val1 - val2) - cat(sprintf("\n Calculation: |%.5f - %.5f| = %.5f\n", val1, val2, expected_diff)) - cat(sprintf(" Target (%s): %.5f\n", calc$target, target_val)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_diff - target_val) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" Cannot calculate (missing values)\n") - } - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER ROW ***\n") -cat("For random row, run:\n") -cat(" qa_check_random_row()\n") -cat("\nFor specific row (e.g., row 118), run:\n") -cat(" qa_check_random_row(118)\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -# write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 243 to save changes.\n") -cat("\nProcessing complete! 
90 difference columns calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP - time interval differences_20251001131121.r b/.history/eohi2/dataP - time interval differences_20251001131121.r deleted file mode 100644 index 9a3c70d..0000000 --- a/.history/eohi2/dataP - time interval differences_20251001131121.r +++ /dev/null @@ -1,280 +0,0 @@ -# Script to calculate absolute differences between time intervals in eohi2.csv -# Compares present vs past/future, and 5-year vs 10-year intervals - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the 15 item names (same order for all time periods) -items <- c( - "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Note: present uses lowercase "tv", others use uppercase "TV" -items_present <- c( - "pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Define all source columns (75 total) -source_cols <- c( - paste0("present_", items_present), - paste0("past_5_", items), - paste0("past_10_", items), - paste0("fut_5_", items), - paste0("fut_10_", items) -) - -# Define all target columns (90 total = 6 calculation types × 15 items) -target_NPast_5 <- paste0("NPast_5_", items) -target_NPast_10 <- paste0("NPast_10_", items) -target_NFut_5 <- paste0("NFut_5_", items) -target_NFut_10 <- paste0("NFut_10_", items) -target_5_10past <- paste0("5.10past_", items) -target_5_10fut <- paste0("5.10fut_", items) - -target_cols <- c( - target_NPast_5, - target_NPast_10, - target_NFut_5, - target_NFut_10, - target_5_10past, - target_5_10fut -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 75 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0 && length(missing_source) <= 10) { - cat("\n Missing source columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} else if (length(missing_source) > 10) { - cat("\n Too many missing to list individually (", length(missing_source), "missing)\n") -} - -# Check Target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 90 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Target columns do NOT exist yet - will create them.\n") - if (length(existing_targets) > 0) { - cat(" WARNING: Some target columns already exist and will be overwritten.\n") - } -} else { - cat(" 
All target columns exist - will overwrite with calculated values.\n") -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 30) { - stop("ERROR: Too many source columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= CALCULATE DIFFERENCES ============= -cat("Calculating time interval differences...\n") - -# Convert source columns to numeric -for (col in source_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Helper function to calculate absolute difference -calc_abs_diff <- function(col1, col2) { - val1 <- if (col1 %in% names(df)) df[[col1]] else NA - val2 <- if (col2 %in% names(df)) df[[col2]] else NA - abs(val1 - val2) -} - -# Calculate NPast_5: |present - past_5| -cat(" Calculating NPast_5 differences (present vs past 5 years)...\n") -for (i in 1:15) { - target <- target_NPast_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NPast_10: |present - past_10| -cat(" Calculating NPast_10 differences (present vs past 10 years)...\n") -for (i in 1:15) { - target <- target_NPast_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_5: |present - fut_5| -cat(" Calculating NFut_5 differences (present vs future 5 years)...\n") -for (i in 1:15) { - target <- target_NFut_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_10: |present - fut_10| -cat(" Calculating NFut_10 differences (present vs future 10 years)...\n") -for (i in 1:15) { - target <- target_NFut_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10past: |past_5 - past_10| -cat(" Calculating 5.10past differences (past 5 vs past 10 years)...\n") -for (i in 1:15) { - target <- target_5_10past[i] - source1 <- paste0("past_5_", items[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10fut: |fut_5 - fut_10| -cat(" Calculating 5.10fut differences (future 5 vs future 10 years)...\n") -for (i in 1:15) { - target <- target_5_10fut[i] - source1 <- paste0("fut_5_", items[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -cat("\n=== CALCULATION COMPLETE ===\n") -cat(" 90 difference columns created.\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function(row_num = NULL) { - # Pick a random row or use specified row - if (is.null(row_num)) { - random_row <- sample(1:nrow(df), 1) - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - } else { - if (row_num < 1 || row_num > nrow(df)) { - cat("ERROR: Row number must be between 1 and", nrow(df), "\n") - return() - } - random_row <- row_num - cat("\n========================================\n") - cat("QA CHECK: Specified Row #", random_row, "\n") - cat("========================================\n\n") - } - - # Sample one calculation from each type (item 
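# A minimal sketch, not part of the original script: library(dplyr) is
# attached above but never actually used. The same difference columns
# could be built with mutate() and the .data pronoun; only the NPast_5
# family is shown here, and missing source columns would need guarding as
# in calc_abs_diff().
library(dplyr)
for (i in seq_along(items)) {
  df <- df %>%
    mutate(!!paste0("NPast_5_", items[i]) :=
             abs(.data[[paste0("present_", items_present[i])]] -
                 .data[[paste0("past_5_", items[i])]]))
}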
1: pref_read) - test_item_idx <- 1 - - calculations <- list( - list(name = "NPast_5", target = target_NPast_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_5_", items[test_item_idx]), - desc = "|present - past_5|"), - list(name = "NPast_10", target = target_NPast_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|present - past_10|"), - list(name = "NFut_5", target = target_NFut_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_5_", items[test_item_idx]), - desc = "|present - fut_5|"), - list(name = "NFut_10", target = target_NFut_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|present - fut_10|"), - list(name = "5.10past", target = target_5_10past[test_item_idx], - source1 = paste0("past_5_", items[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|past_5 - past_10|"), - list(name = "5.10fut", target = target_5_10fut[test_item_idx], - source1 = paste0("fut_5_", items[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|fut_5 - fut_10|") - ) - - cat("Checking sample item: pref_read\n\n") - - for (calc in calculations) { - cat(sprintf("--- %s ---\n", calc$name)) - cat(sprintf("Formula: %s\n", calc$desc)) - - val1 <- if (calc$source1 %in% names(df)) df[random_row, calc$source1] else NA - val2 <- if (calc$source2 %in% names(df)) df[random_row, calc$source2] else NA - target_val <- df[random_row, calc$target] - - cat(sprintf(" %s: %s\n", calc$source1, ifelse(is.na(val1), "NA", as.character(val1)))) - cat(sprintf(" %s: %s\n", calc$source2, ifelse(is.na(val2), "NA", as.character(val2)))) - - if (!is.na(val1) && !is.na(val2)) { - expected_diff <- abs(val1 - val2) - cat(sprintf("\n Calculation: |%.5f - %.5f| = %.5f\n", val1, val2, expected_diff)) - cat(sprintf(" Target (%s): %.5f\n", calc$target, target_val)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_diff - target_val) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" Cannot calculate (missing values)\n") - } - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER ROW ***\n") -cat("For random row, run:\n") -cat(" qa_check_random_row()\n") -cat("\nFor specific row (e.g., row 118), run:\n") -cat(" qa_check_random_row(118)\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -# write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 243 to save changes.\n") -cat("\nProcessing complete! 
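
A note on the missing-column fallback in calc_abs_diff() above: when a source column is absent, the function substitutes a scalar NA, and R's recycling rules then turn the entire difference column into NA rather than raising an error. A minimal standalone sketch of that behavior (the vectors below are stand-ins, not columns from eohi2.csv):

# Stand-in for df[["present_pref_read"]]
present <- c(4, 7, NA, 2)
# Stand-in for a past_5 column that is absent from the CSV
past_5 <- NA
abs(present - past_5)          # -> NA NA NA NA (scalar NA recycled to length 4)
abs(present - c(1, 9, 3, 2))   # -> 3 2 NA 0 when both columns exist
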
diff --git a/.history/eohi2/dataP - time interval differences_20251001131239.r b/.history/eohi2/dataP - time interval differences_20251001131239.r
deleted file mode 100644
index cc0e2e4..0000000
--- a/.history/eohi2/dataP - time interval differences_20251001131239.r
+++ /dev/null
@@ -1,280 +0,0 @@
-# Script to calculate absolute differences between time intervals in eohi2.csv
-# Compares present vs past/future, and 5-year vs 10-year intervals
-
-# Load necessary library
-library(dplyr)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-# Read the data (with check.names=FALSE to preserve original column names)
-# na.strings=NULL keeps empty cells as empty strings instead of converting to NA
-df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL)
-
-# Define the 15 item names (same order for all time periods)
-items <- c(
-  "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel",
-  "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex",
-  "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice"
-)
-
-# Note: present uses lowercase "tv", others use uppercase "TV"
-items_present <- c(
-  "pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel",
-  "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex",
-  "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice"
-)
-
-# Define all source columns (75 total)
-source_cols <- c(
-  paste0("present_", items_present),
-  paste0("past_5_", items),
-  paste0("past_10_", items),
-  paste0("fut_5_", items),
-  paste0("fut_10_", items)
-)
-
-# Define all target columns (90 total = 6 calculation types × 15 items)
-target_NPast_5 <- paste0("NPast_5_", items)
-target_NPast_10 <- paste0("NPast_10_", items)
-target_NFut_5 <- paste0("NFut_5_", items)
-target_NFut_10 <- paste0("NFut_10_", items)
-target_5_10past <- paste0("5.10past_", items)
-target_5_10fut <- paste0("5.10fut_", items)
-
-target_cols <- c(
-  target_NPast_5,
-  target_NPast_10,
-  target_NFut_5,
-  target_NFut_10,
-  target_5_10past,
-  target_5_10fut
-)
-
-# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE =============
-cat("\n=== COLUMN EXISTENCE CHECK ===\n\n")
-
-# Get actual column names from dataframe (trimmed)
-df_cols <- trimws(names(df))
-
-# Check Source columns
-missing_source <- source_cols[!source_cols %in% df_cols]
-existing_source <- source_cols[source_cols %in% df_cols]
-
-cat("Source Columns:\n")
-cat(" Expected: 75 columns\n")
-cat(" Found:", length(existing_source), "columns\n")
-cat(" Missing:", length(missing_source), "columns\n")
-
-if (length(missing_source) > 0 && length(missing_source) <= 10) {
-  cat("\n Missing source columns:\n")
-  for (col in missing_source) {
-    cat("   -", col, "\n")
-  }
-} else if (length(missing_source) > 10) {
-  cat("\n Too many missing to list individually (", length(missing_source), "missing)\n")
-}
-
-# Check Target columns
-missing_targets <- target_cols[!target_cols %in% df_cols]
-existing_targets <- target_cols[target_cols %in% df_cols]
-
-cat("\nTarget Columns:\n")
-cat(" Expected: 90 columns\n")
-cat(" Found:", length(existing_targets), "columns\n")
-cat(" Missing:", length(missing_targets), "columns\n")
-
-if (length(missing_targets) > 0) {
-  cat("\n Target columns do NOT exist yet - will create them.\n")
-  if (length(existing_targets) > 0) {
-    cat(" WARNING: Some target columns already exist and will be overwritten.\n")
-  }
-} else {
-  cat(" All target columns exist - will overwrite with calculated values.\n")
-}
-
-cat("\n=== END CHECK ===\n\n")
-
-# Stop if critical columns are missing
-if (length(missing_source) > 30) {
-  stop("ERROR: Too many source columns missing! Please check column names in CSV file.")
-}
-
-cat("Proceeding with processing...\n\n")
-
-# ============= CALCULATE DIFFERENCES =============
-cat("Calculating time interval differences...\n")
-
-# Convert source columns to numeric
-for (col in source_cols) {
-  if (col %in% names(df)) {
-    df[[col]] <- as.numeric(df[[col]])
-  }
-}
-
-# Helper function to calculate absolute difference
-calc_abs_diff <- function(col1, col2) {
-  val1 <- if (col1 %in% names(df)) df[[col1]] else NA
-  val2 <- if (col2 %in% names(df)) df[[col2]] else NA
-  abs(val1 - val2)
-}
-
-# Calculate NPast_5: |present - past_5|
-cat(" Calculating NPast_5 differences (present vs past 5 years)...\n")
-for (i in 1:15) {
-  target <- target_NPast_5[i]
-  source1 <- paste0("present_", items_present[i])
-  source2 <- paste0("past_5_", items[i])
-  df[[target]] <- calc_abs_diff(source1, source2)
-}
-
-# Calculate NPast_10: |present - past_10|
-cat(" Calculating NPast_10 differences (present vs past 10 years)...\n")
-for (i in 1:15) {
-  target <- target_NPast_10[i]
-  source1 <- paste0("present_", items_present[i])
-  source2 <- paste0("past_10_", items[i])
-  df[[target]] <- calc_abs_diff(source1, source2)
-}
-
-# Calculate NFut_5: |present - fut_5|
-cat(" Calculating NFut_5 differences (present vs future 5 years)...\n")
-for (i in 1:15) {
-  target <- target_NFut_5[i]
-  source1 <- paste0("present_", items_present[i])
-  source2 <- paste0("fut_5_", items[i])
-  df[[target]] <- calc_abs_diff(source1, source2)
-}
-
-# Calculate NFut_10: |present - fut_10|
-cat(" Calculating NFut_10 differences (present vs future 10 years)...\n")
-for (i in 1:15) {
-  target <- target_NFut_10[i]
-  source1 <- paste0("present_", items_present[i])
-  source2 <- paste0("fut_10_", items[i])
-  df[[target]] <- calc_abs_diff(source1, source2)
-}
-
-# Calculate 5.10past: |past_5 - past_10|
-cat(" Calculating 5.10past differences (past 5 vs past 10 years)...\n")
-for (i in 1:15) {
-  target <- target_5_10past[i]
-  source1 <- paste0("past_5_", items[i])
-  source2 <- paste0("past_10_", items[i])
-  df[[target]] <- calc_abs_diff(source1, source2)
-}
-
-# Calculate 5.10fut: |fut_5 - fut_10|
-cat(" Calculating 5.10fut differences (future 5 vs future 10 years)...\n")
-for (i in 1:15) {
-  target <- target_5_10fut[i]
-  source1 <- paste0("fut_5_", items[i])
-  source2 <- paste0("fut_10_", items[i])
-  df[[target]] <- calc_abs_diff(source1, source2)
-}
-
-cat("\n=== CALCULATION COMPLETE ===\n")
-cat(" 90 difference columns created.\n\n")
-
-
-# ============= QUALITY ASSURANCE: RANDOM ROW CHECK =============
-# This function can be run multiple times to check different random rows
-
-qa_check_random_row <- function(row_num = NULL) {
-  # Pick a random row or use specified row
-  if (is.null(row_num)) {
-    random_row <- sample(1:nrow(df), 1)
-    cat("\n========================================\n")
-    cat("QA CHECK: Random Row #", random_row, "\n")
-    cat("========================================\n\n")
-  } else {
-    if (row_num < 1 || row_num > nrow(df)) {
-      cat("ERROR: Row number must be between 1 and", nrow(df), "\n")
-      return()
-    }
-    random_row <- row_num
-    cat("\n========================================\n")
-    cat("QA CHECK: Specified Row #", random_row, "\n")
-    cat("========================================\n\n")
-  }
-
-  # Sample one calculation from each type (item 1: pref_read)
-  test_item_idx <- 1
-
-  calculations <- list(
-    list(name = "NPast_5", target = target_NPast_5[test_item_idx],
-         source1 = paste0("present_", items_present[test_item_idx]),
-         source2 = paste0("past_5_", items[test_item_idx]),
-         desc = "|present - past_5|"),
-    list(name = "NPast_10", target = target_NPast_10[test_item_idx],
-         source1 = paste0("present_", items_present[test_item_idx]),
-         source2 = paste0("past_10_", items[test_item_idx]),
-         desc = "|present - past_10|"),
-    list(name = "NFut_5", target = target_NFut_5[test_item_idx],
-         source1 = paste0("present_", items_present[test_item_idx]),
-         source2 = paste0("fut_5_", items[test_item_idx]),
-         desc = "|present - fut_5|"),
-    list(name = "NFut_10", target = target_NFut_10[test_item_idx],
-         source1 = paste0("present_", items_present[test_item_idx]),
-         source2 = paste0("fut_10_", items[test_item_idx]),
-         desc = "|present - fut_10|"),
-    list(name = "5.10past", target = target_5_10past[test_item_idx],
-         source1 = paste0("past_5_", items[test_item_idx]),
-         source2 = paste0("past_10_", items[test_item_idx]),
-         desc = "|past_5 - past_10|"),
-    list(name = "5.10fut", target = target_5_10fut[test_item_idx],
-         source1 = paste0("fut_5_", items[test_item_idx]),
-         source2 = paste0("fut_10_", items[test_item_idx]),
-         desc = "|fut_5 - fut_10|")
-  )
-
-  cat("Checking sample item: pref_read\n\n")
-
-  for (calc in calculations) {
-    cat(sprintf("--- %s ---\n", calc$name))
-    cat(sprintf("Formula: %s\n", calc$desc))
-
-    val1 <- if (calc$source1 %in% names(df)) df[random_row, calc$source1] else NA
-    val2 <- if (calc$source2 %in% names(df)) df[random_row, calc$source2] else NA
-    target_val <- df[random_row, calc$target]
-
-    cat(sprintf(" %s: %s\n", calc$source1, ifelse(is.na(val1), "NA", as.character(val1))))
-    cat(sprintf(" %s: %s\n", calc$source2, ifelse(is.na(val2), "NA", as.character(val2))))
-
-    if (!is.na(val1) && !is.na(val2)) {
-      expected_diff <- abs(val1 - val2)
-      cat(sprintf("\n Calculation: |%.5f - %.5f| = %.5f\n", val1, val2, expected_diff))
-      cat(sprintf(" Target (%s): %.5f\n", calc$target, target_val))
-      cat(sprintf(" Match: %s\n", ifelse(abs(expected_diff - target_val) < 0.0001, "YES ✓", "NO ✗")))
-    } else {
-      cat(" Cannot calculate (missing values)\n")
-    }
-    cat("\n")
-  }
-
-  cat("========================================\n")
-  cat("END QA CHECK\n")
-  cat("========================================\n\n")
-}
-
-# Run an initial QA check on row 8 (use no argument for a random row)
-cat("\n\n")
-qa_check_random_row(8)
-
-# Instructions for running additional checks
-cat("\n")
-cat("*** TO CHECK ANOTHER ROW ***\n")
-cat("For random row, run:\n")
-cat(" qa_check_random_row()\n")
-cat("\nFor specific row (e.g., row 118), run:\n")
-cat(" qa_check_random_row(118)\n")
-cat("\n")
-
-
-# Save the modified dataframe back to CSV
-# na="" writes NA values as empty cells instead of "NA" text
-# COMMENTED OUT FOR REVIEW - Uncomment when ready to save
-# write.csv(df, "eohi2.csv", row.names = FALSE, na = "")
-
-cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n")
-cat("Review the output above, then uncomment the write.csv() line to save changes.\n")
-cat("\nProcessing complete! 90 difference columns calculated (not yet saved to file).\n")
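
The QA check above declares a match with abs(expected_diff - target_val) < 0.0001 rather than ==. That tolerance matters because doubles that print identically can still differ in their last binary digits; a quick illustration in plain R:

# Why the QA check uses a tolerance instead of exact equality
(0.1 + 0.2) == 0.3                  # FALSE under IEEE-754 arithmetic
abs((0.1 + 0.2) - 0.3) < 0.0001     # TRUE - the same test the script uses
isTRUE(all.equal(0.1 + 0.2, 0.3))   # base R's idiomatic near-equality check
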
diff --git a/.history/eohi2/dataP - time interval differences_20251001131304.r b/.history/eohi2/dataP - time interval differences_20251001131304.r
deleted file mode 100644
index cc0e2e4..0000000
--- a/.history/eohi2/dataP - time interval differences_20251001131304.r
+++ /dev/null
@@ -1,280 +0,0 @@
diff --git a/.history/eohi2/dataP - time interval differences_20251001131331.r b/.history/eohi2/dataP - time interval differences_20251001131331.r
deleted file mode 100644
index 303801e..0000000
--- a/.history/eohi2/dataP - time interval differences_20251001131331.r
+++ /dev/null
@@ -1,280 +0,0 @@
diff --git a/.history/eohi2/dataP - time interval differences_20251001131423.r b/.history/eohi2/dataP - time interval differences_20251001131423.r
deleted file mode 100644
index 05dd5a6..0000000
--- a/.history/eohi2/dataP - time interval differences_20251001131423.r
+++ /dev/null
@@ -1,280 +0,0 @@
-# Script to calculate absolute differences between time intervals in eohi2.csv
-# Compares present vs past/future, and 5-year vs 10-year intervals
-
-# Load necessary library
-library(dplyr)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-# Read the data (with check.names=FALSE to preserve original column names)
-# na.strings=NULL keeps empty cells as empty strings instead of converting to NA
-df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL)
-
-# Define the 15 item names (same order for all time periods)
-items <- c(
-  "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel",
-  "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex",
-  "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice"
-)
-
-# Note: present uses lowercase "tv", others use uppercase "TV"
-items_present <- c(
-  "pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel",
-  "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex",
-  "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice"
-)
-
-# Define all source columns (75 total)
-source_cols <- c(
-  paste0("present_", items_present),
-  paste0("past_5_", items),
-  paste0("past_10_", items),
-  paste0("fut_5_", items),
-  paste0("fut_10_", items)
-)
-
-# Define all target columns (90 total = 6 calculation types × 15 items)
-target_NPast_5 <- paste0("NPast_5_", items)
-target_NPast_10 <- paste0("NPast_10_", items)
-target_NFut_5 <- paste0("NFut_5_", items)
-target_NFut_10 <- paste0("NFut_10_", items)
-target_5_10past <- paste0("5.10past_", items)
-target_5_10fut <- paste0("5.10fut_", items)
-
-target_cols <- c(
-  target_NPast_5,
-  target_NPast_10,
-  target_NFut_5,
-  target_NFut_10,
-  target_5_10past,
-  target_5_10fut
-)
-
-# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE =============
-cat("\n=== COLUMN EXISTENCE CHECK ===\n\n")
-
-# Get actual column names from dataframe (trimmed)
-df_cols <- trimws(names(df))
-
-# Check Source columns
-missing_source <- source_cols[!source_cols %in% df_cols]
-existing_source <- source_cols[source_cols %in% df_cols]
-
-cat("Source Columns:\n")
-cat(" Expected: 75 columns\n")
-cat(" Found:", length(existing_source), "columns\n")
-cat(" Missing:", length(missing_source), "columns\n")
-
-if (length(missing_source) > 0 && length(missing_source) <= 10) {
-  cat("\n Missing source columns:\n")
-  for (col in missing_source) {
-    cat("   -", col, "\n")
-  }
-} else if (length(missing_source) > 10) {
-  cat("\n Too many missing to list individually (", length(missing_source), "missing)\n")
-}
-
-# Check Target columns
-missing_targets <- target_cols[!target_cols %in% df_cols]
-existing_targets <- target_cols[target_cols %in% df_cols]
-
-cat("\nTarget Columns:\n")
-cat(" Expected: 90 columns\n")
-cat(" Found:", length(existing_targets), "columns\n")
-cat(" Missing:", length(missing_targets), "columns\n")
-
-if (length(missing_targets) > 0) {
-  cat("\n Target columns do NOT exist yet - will create them.\n")
-  if (length(existing_targets) > 0) {
-    cat(" WARNING: Some target columns already exist and will be overwritten.\n")
-  }
-} else {
-  cat(" All target columns exist - will overwrite with calculated values.\n")
-}
-
-cat("\n=== END CHECK ===\n\n")
-
-# Stop if critical columns are missing
-if (length(missing_source) > 30) {
-  stop("ERROR: Too many source columns missing! Please check column names in CSV file.")
-}
-
-cat("Proceeding with processing...\n\n")
-
-# ============= CALCULATE DIFFERENCES =============
-cat("Calculating time interval differences...\n")
-
-# Convert source columns to numeric
-for (col in source_cols) {
-  if (col %in% names(df)) {
-    df[[col]] <- as.numeric(df[[col]])
-  }
-}
-
-# Helper function to calculate absolute difference
-calc_abs_diff <- function(col1, col2) {
-  val1 <- if (col1 %in% names(df)) df[[col1]] else NA
-  val2 <- if (col2 %in% names(df)) df[[col2]] else NA
-  abs(val1 - val2)
-}
-
-# Calculate NPast_5: |present - past_5|
-cat(" Calculating NPast_5 differences (present vs past 5 years)...\n")
-for (i in 1:15) {
-  target <- target_NPast_5[i]
-  source1 <- paste0("present_", items_present[i])
-  source2 <- paste0("past_5_", items[i])
-  df[[target]] <- calc_abs_diff(source1, source2)
-}
-
-# Calculate NPast_10: |present - past_10|
-cat(" Calculating NPast_10 differences (present vs past 10 years)...\n")
-for (i in 1:15) {
-  target <- target_NPast_10[i]
-  source1 <- paste0("present_", items_present[i])
-  source2 <- paste0("past_10_", items[i])
-  df[[target]] <- calc_abs_diff(source1, source2)
-}
-
-# Calculate NFut_5: |present - fut_5|
-cat(" Calculating NFut_5 differences (present vs future 5 years)...\n")
-for (i in 1:15) {
-  target <- target_NFut_5[i]
-  source1 <- paste0("present_", items_present[i])
-  source2 <- paste0("fut_5_", items[i])
-  df[[target]] <- calc_abs_diff(source1, source2)
-}
-
-# Calculate NFut_10: |present - fut_10|
-cat(" Calculating NFut_10 differences (present vs future 10 years)...\n")
-for (i in 1:15) {
-  target <- target_NFut_10[i]
-  source1 <- paste0("present_", items_present[i])
-  source2 <- paste0("fut_10_", items[i])
-  df[[target]] <- calc_abs_diff(source1, source2)
-}
-
-# Calculate 5.10past: |past_5 - past_10|
-cat(" Calculating 5.10past differences (past 5 vs past 10 years)...\n")
-for (i in 1:15) {
-  target <- target_5_10past[i]
-  source1 <- paste0("past_5_", items[i])
-  source2 <- paste0("past_10_", items[i])
-  df[[target]] <- calc_abs_diff(source1, source2)
-}
-
-# Calculate 5.10fut: |fut_5 - fut_10|
-cat(" Calculating 5.10fut differences (future 5 vs future 10 years)...\n")
-for (i in 1:15) {
-  target <- target_5_10fut[i]
-  source1 <- paste0("fut_5_", items[i])
-  source2 <- paste0("fut_10_", items[i])
-  df[[target]] <- calc_abs_diff(source1, source2)
-}
-
-cat("\n=== CALCULATION COMPLETE ===\n")
-cat(" 90 difference columns created.\n\n")
-
-
-# ============= QUALITY ASSURANCE: RANDOM ROW CHECK =============
-# This function can be run multiple times to check different random rows
-
-qa_check_random_row <- function(row_num = NULL) {
-  # Pick a random row or use specified row
-  if (is.null(row_num)) {
-    random_row <- sample(1:nrow(df), 1)
-    cat("\n========================================\n")
-    cat("QA CHECK: Random Row #", random_row, "\n")
-    cat("========================================\n\n")
-  } else {
-    if (row_num < 1 || row_num > nrow(df)) {
-      cat("ERROR: Row number must be between 1 and", nrow(df), "\n")
-      return()
-    }
-    random_row <- row_num
-    cat("\n========================================\n")
-    cat("QA CHECK: Specified Row #", random_row, "\n")
-    cat("========================================\n\n")
-  }
-
-  # Sample one calculation from each type (item 1: pref_read)
-  test_item_idx <- 1
-
-  calculations <- list(
-    list(name = "NPast_5", target = target_NPast_5[test_item_idx],
-         source1 = paste0("present_", items_present[test_item_idx]),
-         source2 = paste0("past_5_", items[test_item_idx]),
-         desc = "|present - past_5|"),
-    list(name = "NPast_10", target = target_NPast_10[test_item_idx],
-         source1 = paste0("present_", items_present[test_item_idx]),
-         source2 = paste0("past_10_", items[test_item_idx]),
-         desc = "|present - past_10|"),
-    list(name = "NFut_5", target = target_NFut_5[test_item_idx],
-         source1 = paste0("present_", items_present[test_item_idx]),
-         source2 = paste0("fut_5_", items[test_item_idx]),
-         desc = "|present - fut_5|"),
-    list(name = "NFut_10", target = target_NFut_10[test_item_idx],
-         source1 = paste0("present_", items_present[test_item_idx]),
-         source2 = paste0("fut_10_", items[test_item_idx]),
-         desc = "|present - fut_10|"),
-    list(name = "5.10past", target = target_5_10past[test_item_idx],
-         source1 = paste0("past_5_", items[test_item_idx]),
-         source2 = paste0("past_10_", items[test_item_idx]),
-         desc = "|past_5 - past_10|"),
-    list(name = "5.10fut", target = target_5_10fut[test_item_idx],
-         source1 = paste0("fut_5_", items[test_item_idx]),
-         source2 = paste0("fut_10_", items[test_item_idx]),
-         desc = "|fut_5 - fut_10|")
-  )
-
-  cat("Checking sample item: pref_read\n\n")
-
-  for (calc in calculations) {
-    cat(sprintf("--- %s ---\n", calc$name))
-    cat(sprintf("Formula: %s\n", calc$desc))
-
-    val1 <- if (calc$source1 %in% names(df)) df[random_row, calc$source1] else NA
-    val2 <- if (calc$source2 %in% names(df)) df[random_row, calc$source2] else NA
-    target_val <- df[random_row, calc$target]
-
-    cat(sprintf(" %s: %s\n", calc$source1, ifelse(is.na(val1), "NA", as.character(val1))))
-    cat(sprintf(" %s: %s\n", calc$source2, ifelse(is.na(val2), "NA", as.character(val2))))
-
-    if (!is.na(val1) && !is.na(val2)) {
-      expected_diff <- abs(val1 - val2)
-      cat(sprintf("\n Calculation: |%.5f - %.5f| = %.5f\n", val1, val2, expected_diff))
-      cat(sprintf(" Target (%s): %.5f\n", calc$target, target_val))
-      cat(sprintf(" Match: %s\n", ifelse(abs(expected_diff - target_val) < 0.0001, "YES ✓", "NO ✗")))
-    } else {
-      cat(" Cannot calculate (missing values)\n")
-    }
-    cat("\n")
-  }
-
-  cat("========================================\n")
-  cat("END QA CHECK\n")
-  cat("========================================\n\n")
-}
-
-# Run a QA check on one random row
-cat("\n\n")
-qa_check_random_row()  # leave blank for a random row, or pass a row number, e.g. qa_check_random_row(118)
-
-# Instructions for running additional checks
-cat("\n")
-cat("*** TO CHECK ANOTHER ROW ***\n")
-cat("For random row, run:\n")
-cat(" qa_check_random_row()\n")
-cat("\nFor specific row (e.g., row 118), run:\n")
-cat(" qa_check_random_row(118)\n")
-cat("\n")
-
-
-# Save the modified dataframe back to CSV
-# na="" writes NA values as empty cells instead of "NA" text
-write.csv(df, "eohi2.csv", row.names = FALSE, na = "")
-
-cat("\n*** DATA WRITTEN TO eohi2.csv ***\n")
-cat("Comment the write.csv() line back out to review without overwriting the file.\n")
-cat("\nProcessing complete! 90 difference columns calculated and saved to eohi2.csv.\n")
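
Across these snapshots the save step is toggled by commenting write.csv() in and out, which is how the status messages drifted out of sync with the code (the snapshot above writes the file while still announcing that the write is commented out; fixed in the reconstruction). One possible alternative, sketched here with a hypothetical SAVE_OUTPUT flag that appears in none of the original scripts, keeps the write and its message on a single switch:

SAVE_OUTPUT <- FALSE  # hypothetical flag: flip to TRUE once the QA output looks right

if (SAVE_OUTPUT) {
  # na = "" writes NA values as empty cells instead of the literal text "NA"
  write.csv(df, "eohi2.csv", row.names = FALSE, na = "")
  cat("\nSaved 90 difference columns to eohi2.csv\n")
} else {
  cat("\nDry run only - set SAVE_OUTPUT <- TRUE to write eohi2.csv\n")
}
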
diff --git a/.history/eohi2/dataP 06 - time interval differences_20251001131422.r b/.history/eohi2/dataP 06 - time interval differences_20251001131422.r
deleted file mode 100644
index 05dd5a6..0000000
--- a/.history/eohi2/dataP 06 - time interval differences_20251001131422.r
+++ /dev/null
@@ -1,280 +0,0 @@
diff --git a/.history/eohi2/dataP 06 - time interval differences_20251001132410.r b/.history/eohi2/dataP 06 - time interval differences_20251001132410.r
deleted file mode 100644
index 05dd5a6..0000000
--- a/.history/eohi2/dataP 06 - time interval differences_20251001132410.r
+++ /dev/null
@@ -1,280 +0,0 @@
-# Script to calculate absolute differences between time intervals in eohi2.csv
-# Compares present vs past/future, and 5-year vs 10-year intervals
-
-# Load necessary library
-library(dplyr)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-# Read the data (with check.names=FALSE to preserve original column names)
-# na.strings=NULL keeps empty cells as empty strings instead of converting to NA
-df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL)
-
-# Define the 15 item names (same order for all time periods)
-items <- c(
-  "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel",
-  "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex",
-  "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice"
-)
-
-# Note: present uses lowercase "tv", others use uppercase "TV"
-items_present <- c(
-  "pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel",
-  "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex",
-  "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice"
-)
-
-# Define all source columns (75 total)
-source_cols <- c(
-  paste0("present_", items_present),
-  paste0("past_5_", items),
-  paste0("past_10_", items),
-  paste0("fut_5_", items),
-  paste0("fut_10_", items)
-)
-
-# Define all target columns (90 total = 6 calculation types × 15 items)
-target_NPast_5 <- paste0("NPast_5_", items)
-target_NPast_10 <- paste0("NPast_10_", items)
-target_NFut_5 <- paste0("NFut_5_", items)
-target_NFut_10 <- paste0("NFut_10_", items)
-target_5_10past <- paste0("5.10past_", items)
-target_5_10fut <- paste0("5.10fut_", items)
-
-target_cols <- c(
-  target_NPast_5,
-  target_NPast_10,
-  target_NFut_5,
-  target_NFut_10,
-  target_5_10past,
-  target_5_10fut
-)
-
-# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE =============
-cat("\n=== COLUMN EXISTENCE CHECK ===\n\n")
-
-# Get actual column names from dataframe (trimmed)
-df_cols <- trimws(names(df))
-
-# Check Source columns
-missing_source <- source_cols[!source_cols %in% df_cols]
-existing_source <- source_cols[source_cols %in% df_cols]
-
-cat("Source Columns:\n")
-cat(" Expected: 75 columns\n")
-cat(" Found:", length(existing_source), "columns\n")
-cat(" Missing:", length(missing_source), "columns\n")
-
-if (length(missing_source) > 0 && length(missing_source) <= 10) {
-  cat("\n Missing source columns:\n")
-  for (col in missing_source) {
-    cat("   -", col, "\n")
-  }
-} else if (length(missing_source) > 10) {
-  cat("\n Too many missing to list individually (", length(missing_source), "missing)\n")
-}
-
-# Check Target columns
-missing_targets <- target_cols[!target_cols %in% df_cols]
-existing_targets <- target_cols[target_cols %in% df_cols]
-
-cat("\nTarget Columns:\n")
-cat(" Expected: 90 columns\n")
-cat(" Found:", length(existing_targets), "columns\n")
-cat(" Missing:", length(missing_targets), "columns\n")
-
-if (length(missing_targets) > 0) {
-  cat("\n Target columns do NOT exist yet - will create them.\n")
-  if (length(existing_targets) > 0) {
-    cat(" WARNING: Some target columns already exist and will be overwritten.\n")
-  }
-} else {
-  cat(" All target columns exist - will overwrite with calculated values.\n")
-}
-
-cat("\n=== END CHECK ===\n\n")
-
-# Stop if critical columns are missing
-if (length(missing_source) > 30) {
-  stop("ERROR: Too many source columns missing! Please check column names in CSV file.")
-}
-
-cat("Proceeding with processing...\n\n")
-
-# ============= CALCULATE DIFFERENCES =============
-cat("Calculating time interval differences...\n")
-
-# Convert source columns to numeric
-for (col in source_cols) {
-  if (col %in% names(df)) {
-    df[[col]] <- as.numeric(df[[col]])
-  }
-}
-
-# Helper function to calculate absolute difference
-calc_abs_diff <- function(col1, col2) {
-  val1 <- if (col1 %in% names(df)) df[[col1]] else NA
-  val2 <- if (col2 %in% names(df)) df[[col2]] else NA
-  abs(val1 - val2)
-}
-
-# Calculate NPast_5: |present - past_5|
-cat(" Calculating NPast_5 differences (present vs past 5 years)...\n")
-for (i in 1:15) {
-  target <- target_NPast_5[i]
-  source1 <- paste0("present_", items_present[i])
-  source2 <- paste0("past_5_", items[i])
-  df[[target]] <- calc_abs_diff(source1, source2)
-}
-
-# Calculate NPast_10: |present - past_10|
-cat(" Calculating NPast_10 differences (present vs past 10 years)...\n")
-for (i in 1:15) {
-  target <- target_NPast_10[i]
-  source1 <- paste0("present_", items_present[i])
-  source2 <- paste0("past_10_", items[i])
-  df[[target]] <- calc_abs_diff(source1, source2)
-}
-
-# Calculate NFut_5: |present - fut_5|
-cat(" Calculating NFut_5 differences (present vs future 5 years)...\n")
-for (i in 1:15) {
-  target <- target_NFut_5[i]
-  source1 <- paste0("present_", items_present[i])
-  source2 <- paste0("fut_5_", items[i])
-  df[[target]] <- calc_abs_diff(source1, source2)
-}
-
-# Calculate NFut_10: |present - fut_10|
-cat(" Calculating NFut_10 differences (present vs future 10 years)...\n")
-for (i in 1:15) {
-  target <- target_NFut_10[i]
-  source1 <- paste0("present_", items_present[i])
-  source2 <- paste0("fut_10_", items[i])
-  df[[target]] <- calc_abs_diff(source1, source2)
-}
-
-# Calculate 5.10past: |past_5 - past_10|
-cat(" Calculating 5.10past differences (past 5 vs past 10 years)...\n")
-for (i in 1:15) {
-  target <- target_5_10past[i]
-  source1 <- paste0("past_5_", items[i])
-  source2 <- paste0("past_10_", items[i])
-  df[[target]] <- calc_abs_diff(source1, source2)
-}
-
-# Calculate 5.10fut: |fut_5 - fut_10|
-cat(" Calculating 5.10fut differences (future 5 vs future 10 years)...\n")
-for (i in 1:15) {
-  target <- target_5_10fut[i]
-  source1 <- paste0("fut_5_", items[i])
-  source2 <- paste0("fut_10_", items[i])
-  df[[target]] <- calc_abs_diff(source1, source2)
-}
-
-cat("\n=== CALCULATION COMPLETE ===\n")
-cat(" 90 difference columns created.\n\n")
-
-
-# ============= QUALITY ASSURANCE: RANDOM ROW CHECK =============
-# This function can be run multiple times to check different random rows
-
-qa_check_random_row <- function(row_num = NULL) {
-  # Pick a random row or use specified row
-  if (is.null(row_num)) {
-    random_row <- sample(1:nrow(df), 1)
-    cat("\n========================================\n")
-    cat("QA CHECK: Random Row #", random_row, "\n")
-    cat("========================================\n\n")
-  } else {
-    if (row_num < 1 || row_num > nrow(df)) {
-      cat("ERROR: Row number must be between 1 and", nrow(df), "\n")
-      return()
-    }
-    random_row <- row_num
-    cat("\n========================================\n")
-    cat("QA CHECK: Specified Row #", random_row, "\n")
-    cat("========================================\n\n")
-  }
-
-  # Sample one calculation from each
type (item 1: pref_read) - test_item_idx <- 1 - - calculations <- list( - list(name = "NPast_5", target = target_NPast_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_5_", items[test_item_idx]), - desc = "|present - past_5|"), - list(name = "NPast_10", target = target_NPast_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|present - past_10|"), - list(name = "NFut_5", target = target_NFut_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_5_", items[test_item_idx]), - desc = "|present - fut_5|"), - list(name = "NFut_10", target = target_NFut_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|present - fut_10|"), - list(name = "5.10past", target = target_5_10past[test_item_idx], - source1 = paste0("past_5_", items[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|past_5 - past_10|"), - list(name = "5.10fut", target = target_5_10fut[test_item_idx], - source1 = paste0("fut_5_", items[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|fut_5 - fut_10|") - ) - - cat("Checking sample item: pref_read\n\n") - - for (calc in calculations) { - cat(sprintf("--- %s ---\n", calc$name)) - cat(sprintf("Formula: %s\n", calc$desc)) - - val1 <- if (calc$source1 %in% names(df)) df[random_row, calc$source1] else NA - val2 <- if (calc$source2 %in% names(df)) df[random_row, calc$source2] else NA - target_val <- df[random_row, calc$target] - - cat(sprintf(" %s: %s\n", calc$source1, ifelse(is.na(val1), "NA", as.character(val1)))) - cat(sprintf(" %s: %s\n", calc$source2, ifelse(is.na(val2), "NA", as.character(val2)))) - - if (!is.na(val1) && !is.na(val2)) { - expected_diff <- abs(val1 - val2) - cat(sprintf("\n Calculation: |%.5f - %.5f| = %.5f\n", val1, val2, expected_diff)) - cat(sprintf(" Target (%s): %.5f\n", calc$target, target_val)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_diff - target_val) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" Cannot calculate (missing values)\n") - } - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on a random row -cat("\n\n") -qa_check_random_row() # leave blank for a random row; pass a row number to check a specific row, e.g. qa_check_random_row(118) - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER ROW ***\n") -cat("For random row, run:\n") -cat(" qa_check_random_row()\n") -cat("\nFor specific row (e.g., row 118), run:\n") -cat(" qa_check_random_row(118)\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -# write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment the write.csv() line to save changes.\n") -cat("\nProcessing complete! 
90 difference columns calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP 06 - time interval differences_20251001132459.r b/.history/eohi2/dataP 06 - time interval differences_20251001132459.r deleted file mode 100644 index 7c6be10..0000000 --- a/.history/eohi2/dataP 06 - time interval differences_20251001132459.r +++ /dev/null @@ -1,290 +0,0 @@ -# Script to calculate absolute differences between time intervals in eohi2.csv -# Compares present vs past/future, and 5-year vs 10-year intervals - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the 15 item names (same order for all time periods) -items <- c( - "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Note: present uses lowercase "tv", others use uppercase "TV" -items_present <- c( - "pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Define all source columns (75 total) -source_cols <- c( - paste0("present_", items_present), - paste0("past_5_", items), - paste0("past_10_", items), - paste0("fut_5_", items), - paste0("fut_10_", items) -) - -# Define all target columns (90 total = 6 calculation types × 15 items) -target_NPast_5 <- paste0("NPast_5_", items) -target_NPast_10 <- paste0("NPast_10_", items) -target_NFut_5 <- paste0("NFut_5_", items) -target_NFut_10 <- paste0("NFut_10_", items) -target_5_10past <- paste0("5.10past_", items) -target_5_10fut <- paste0("5.10fut_", items) - -target_cols <- c( - target_NPast_5, - target_NPast_10, - target_NFut_5, - target_NFut_10, - target_5_10past, - target_5_10fut -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 75 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0 && length(missing_source) <= 10) { - cat("\n Missing source columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} else if (length(missing_source) > 10) { - cat("\n Too many missing to list individually (", length(missing_source), "missing)\n") -} - -# Check Target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 90 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Target columns do NOT exist yet - will create them.\n") - if (length(existing_targets) > 0) { - cat(" WARNING: Some target columns already exist and will be overwritten.\n") - } -} else 
{ - cat(" All target columns exist - will overwrite with calculated values.\n") -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 30) { - stop("ERROR: Too many source columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= CALCULATE DIFFERENCES ============= -cat("Calculating time interval differences...\n") - -# Convert source columns to numeric -for (col in source_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Helper function to calculate absolute difference -calc_abs_diff <- function(col1, col2) { - val1 <- if (col1 %in% names(df)) df[[col1]] else NA - val2 <- if (col2 %in% names(df)) df[[col2]] else NA - abs(val1 - val2) -} - -# Calculate NPast_5: |present - past_5| -cat(" Calculating NPast_5 differences (present vs past 5 years)...\n") -for (i in 1:15) { - target <- target_NPast_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NPast_10: |present - past_10| -cat(" Calculating NPast_10 differences (present vs past 10 years)...\n") -for (i in 1:15) { - target <- target_NPast_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_5: |present - fut_5| -cat(" Calculating NFut_5 differences (present vs future 5 years)...\n") -for (i in 1:15) { - target <- target_NFut_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_10: |present - fut_10| -cat(" Calculating NFut_10 differences (present vs future 10 years)...\n") -for (i in 1:15) { - target <- target_NFut_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10past: |past_5 - past_10| -cat(" Calculating 5.10past differences (past 5 vs past 10 years)...\n") -for (i in 1:15) { - target <- target_5_10past[i] - source1 <- paste0("past_5_", items[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10fut: |fut_5 - fut_10| -cat(" Calculating 5.10fut differences (future 5 vs future 10 years)...\n") -for (i in 1:15) { - target <- target_5_10fut[i] - source1 <- paste0("fut_5_", items[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -cat("\n=== CALCULATION COMPLETE ===\n") -cat(" 90 difference columns created.\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function(row_num = NULL, item_num = NULL) { - # Pick a random row or use specified row - if (is.null(row_num)) { - random_row <- sample(1:nrow(df), 1) - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - } else { - if (row_num < 1 || row_num > nrow(df)) { - cat("ERROR: Row number must be between 1 and", nrow(df), "\n") - return() - } - random_row <- row_num - cat("\n========================================\n") - cat("QA CHECK: Specified Row #", random_row, "\n") - } - - # Pick a random item or use specified item - if (is.null(item_num)) { - test_item_idx <- sample(1:15, 1) - cat("Random Item 
#", test_item_idx, ": ", items[test_item_idx], "\n") - } else { - if (item_num < 1 || item_num > 15) { - cat("ERROR: Item number must be between 1 and 15\n") - return() - } - test_item_idx <- item_num - cat("Specified Item #", test_item_idx, ": ", items[test_item_idx], "\n") - } - - cat("========================================\n\n") - - calculations <- list( - list(name = "NPast_5", target = target_NPast_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_5_", items[test_item_idx]), - desc = "|present - past_5|"), - list(name = "NPast_10", target = target_NPast_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|present - past_10|"), - list(name = "NFut_5", target = target_NFut_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_5_", items[test_item_idx]), - desc = "|present - fut_5|"), - list(name = "NFut_10", target = target_NFut_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|present - fut_10|"), - list(name = "5.10past", target = target_5_10past[test_item_idx], - source1 = paste0("past_5_", items[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|past_5 - past_10|"), - list(name = "5.10fut", target = target_5_10fut[test_item_idx], - source1 = paste0("fut_5_", items[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|fut_5 - fut_10|") - ) - - cat("Checking sample item: pref_read\n\n") - - for (calc in calculations) { - cat(sprintf("--- %s ---\n", calc$name)) - cat(sprintf("Formula: %s\n", calc$desc)) - - val1 <- if (calc$source1 %in% names(df)) df[random_row, calc$source1] else NA - val2 <- if (calc$source2 %in% names(df)) df[random_row, calc$source2] else NA - target_val <- df[random_row, calc$target] - - cat(sprintf(" %s: %s\n", calc$source1, ifelse(is.na(val1), "NA", as.character(val1)))) - cat(sprintf(" %s: %s\n", calc$source2, ifelse(is.na(val2), "NA", as.character(val2)))) - - if (!is.na(val1) && !is.na(val2)) { - expected_diff <- abs(val1 - val2) - cat(sprintf("\n Calculation: |%.5f - %.5f| = %.5f\n", val1, val2, expected_diff)) - cat(sprintf(" Target (%s): %.5f\n", calc$target, target_val)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_diff - target_val) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" Cannot calculate (missing values)\n") - } - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() #leave blank for random row; specify row number for specific row ex (qa_check_random_row(118)) - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER ROW ***\n") -cat("For random row, run:\n") -cat(" qa_check_random_row()\n") -cat("\nFor specific row (e.g., row 118), run:\n") -cat(" qa_check_random_row(118)\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 243 to save changes.\n") -cat("\nProcessing complete! 
90 difference columns calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP 06 - time interval differences_20251001132504.r b/.history/eohi2/dataP 06 - time interval differences_20251001132504.r deleted file mode 100644 index cb2b78c..0000000 --- a/.history/eohi2/dataP 06 - time interval differences_20251001132504.r +++ /dev/null @@ -1,288 +0,0 @@ -# Script to calculate absolute differences between time intervals in eohi2.csv -# Compares present vs past/future, and 5-year vs 10-year intervals - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the 15 item names (same order for all time periods) -items <- c( - "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Note: present uses lowercase "tv", others use uppercase "TV" -items_present <- c( - "pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Define all source columns (75 total) -source_cols <- c( - paste0("present_", items_present), - paste0("past_5_", items), - paste0("past_10_", items), - paste0("fut_5_", items), - paste0("fut_10_", items) -) - -# Define all target columns (90 total = 6 calculation types × 15 items) -target_NPast_5 <- paste0("NPast_5_", items) -target_NPast_10 <- paste0("NPast_10_", items) -target_NFut_5 <- paste0("NFut_5_", items) -target_NFut_10 <- paste0("NFut_10_", items) -target_5_10past <- paste0("5.10past_", items) -target_5_10fut <- paste0("5.10fut_", items) - -target_cols <- c( - target_NPast_5, - target_NPast_10, - target_NFut_5, - target_NFut_10, - target_5_10past, - target_5_10fut -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 75 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0 && length(missing_source) <= 10) { - cat("\n Missing source columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} else if (length(missing_source) > 10) { - cat("\n Too many missing to list individually (", length(missing_source), "missing)\n") -} - -# Check Target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 90 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Target columns do NOT exist yet - will create them.\n") - if (length(existing_targets) > 0) { - cat(" WARNING: Some target columns already exist and will be overwritten.\n") - } -} else 
{ - cat(" All target columns exist - will overwrite with calculated values.\n") -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 30) { - stop("ERROR: Too many source columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= CALCULATE DIFFERENCES ============= -cat("Calculating time interval differences...\n") - -# Convert source columns to numeric -for (col in source_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Helper function to calculate absolute difference -calc_abs_diff <- function(col1, col2) { - val1 <- if (col1 %in% names(df)) df[[col1]] else NA - val2 <- if (col2 %in% names(df)) df[[col2]] else NA - abs(val1 - val2) -} - -# Calculate NPast_5: |present - past_5| -cat(" Calculating NPast_5 differences (present vs past 5 years)...\n") -for (i in 1:15) { - target <- target_NPast_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NPast_10: |present - past_10| -cat(" Calculating NPast_10 differences (present vs past 10 years)...\n") -for (i in 1:15) { - target <- target_NPast_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_5: |present - fut_5| -cat(" Calculating NFut_5 differences (present vs future 5 years)...\n") -for (i in 1:15) { - target <- target_NFut_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_10: |present - fut_10| -cat(" Calculating NFut_10 differences (present vs future 10 years)...\n") -for (i in 1:15) { - target <- target_NFut_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10past: |past_5 - past_10| -cat(" Calculating 5.10past differences (past 5 vs past 10 years)...\n") -for (i in 1:15) { - target <- target_5_10past[i] - source1 <- paste0("past_5_", items[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10fut: |fut_5 - fut_10| -cat(" Calculating 5.10fut differences (future 5 vs future 10 years)...\n") -for (i in 1:15) { - target <- target_5_10fut[i] - source1 <- paste0("fut_5_", items[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -cat("\n=== CALCULATION COMPLETE ===\n") -cat(" 90 difference columns created.\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function(row_num = NULL, item_num = NULL) { - # Pick a random row or use specified row - if (is.null(row_num)) { - random_row <- sample(1:nrow(df), 1) - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - } else { - if (row_num < 1 || row_num > nrow(df)) { - cat("ERROR: Row number must be between 1 and", nrow(df), "\n") - return() - } - random_row <- row_num - cat("\n========================================\n") - cat("QA CHECK: Specified Row #", random_row, "\n") - } - - # Pick a random item or use specified item - if (is.null(item_num)) { - test_item_idx <- sample(1:15, 1) - cat("Random Item 
#", test_item_idx, ": ", items[test_item_idx], "\n") - } else { - if (item_num < 1 || item_num > 15) { - cat("ERROR: Item number must be between 1 and 15\n") - return() - } - test_item_idx <- item_num - cat("Specified Item #", test_item_idx, ": ", items[test_item_idx], "\n") - } - - cat("========================================\n\n") - - calculations <- list( - list(name = "NPast_5", target = target_NPast_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_5_", items[test_item_idx]), - desc = "|present - past_5|"), - list(name = "NPast_10", target = target_NPast_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|present - past_10|"), - list(name = "NFut_5", target = target_NFut_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_5_", items[test_item_idx]), - desc = "|present - fut_5|"), - list(name = "NFut_10", target = target_NFut_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|present - fut_10|"), - list(name = "5.10past", target = target_5_10past[test_item_idx], - source1 = paste0("past_5_", items[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|past_5 - past_10|"), - list(name = "5.10fut", target = target_5_10fut[test_item_idx], - source1 = paste0("fut_5_", items[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|fut_5 - fut_10|") - ) - - for (calc in calculations) { - cat(sprintf("--- %s ---\n", calc$name)) - cat(sprintf("Formula: %s\n", calc$desc)) - - val1 <- if (calc$source1 %in% names(df)) df[random_row, calc$source1] else NA - val2 <- if (calc$source2 %in% names(df)) df[random_row, calc$source2] else NA - target_val <- df[random_row, calc$target] - - cat(sprintf(" %s: %s\n", calc$source1, ifelse(is.na(val1), "NA", as.character(val1)))) - cat(sprintf(" %s: %s\n", calc$source2, ifelse(is.na(val2), "NA", as.character(val2)))) - - if (!is.na(val1) && !is.na(val2)) { - expected_diff <- abs(val1 - val2) - cat(sprintf("\n Calculation: |%.5f - %.5f| = %.5f\n", val1, val2, expected_diff)) - cat(sprintf(" Target (%s): %.5f\n", calc$target, target_val)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_diff - target_val) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" Cannot calculate (missing values)\n") - } - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() #leave blank for random row; specify row number for specific row ex (qa_check_random_row(118)) - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER ROW ***\n") -cat("For random row, run:\n") -cat(" qa_check_random_row()\n") -cat("\nFor specific row (e.g., row 118), run:\n") -cat(" qa_check_random_row(118)\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 243 to save changes.\n") -cat("\nProcessing complete! 
90 difference columns calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP 06 - time interval differences_20251001132511.r b/.history/eohi2/dataP 06 - time interval differences_20251001132511.r deleted file mode 100644 index 941dc4b..0000000 --- a/.history/eohi2/dataP 06 - time interval differences_20251001132511.r +++ /dev/null @@ -1,292 +0,0 @@ -# Script to calculate absolute differences between time intervals in eohi2.csv -# Compares present vs past/future, and 5-year vs 10-year intervals - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the 15 item names (same order for all time periods) -items <- c( - "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Note: present uses lowercase "tv", others use uppercase "TV" -items_present <- c( - "pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Define all source columns (75 total) -source_cols <- c( - paste0("present_", items_present), - paste0("past_5_", items), - paste0("past_10_", items), - paste0("fut_5_", items), - paste0("fut_10_", items) -) - -# Define all target columns (90 total = 6 calculation types × 15 items) -target_NPast_5 <- paste0("NPast_5_", items) -target_NPast_10 <- paste0("NPast_10_", items) -target_NFut_5 <- paste0("NFut_5_", items) -target_NFut_10 <- paste0("NFut_10_", items) -target_5_10past <- paste0("5.10past_", items) -target_5_10fut <- paste0("5.10fut_", items) - -target_cols <- c( - target_NPast_5, - target_NPast_10, - target_NFut_5, - target_NFut_10, - target_5_10past, - target_5_10fut -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 75 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0 && length(missing_source) <= 10) { - cat("\n Missing source columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} else if (length(missing_source) > 10) { - cat("\n Too many missing to list individually (", length(missing_source), "missing)\n") -} - -# Check Target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 90 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Target columns do NOT exist yet - will create them.\n") - if (length(existing_targets) > 0) { - cat(" WARNING: Some target columns already exist and will be overwritten.\n") - } -} else 
{ - cat(" All target columns exist - will overwrite with calculated values.\n") -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 30) { - stop("ERROR: Too many source columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= CALCULATE DIFFERENCES ============= -cat("Calculating time interval differences...\n") - -# Convert source columns to numeric -for (col in source_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Helper function to calculate absolute difference -calc_abs_diff <- function(col1, col2) { - val1 <- if (col1 %in% names(df)) df[[col1]] else NA - val2 <- if (col2 %in% names(df)) df[[col2]] else NA - abs(val1 - val2) -} - -# Calculate NPast_5: |present - past_5| -cat(" Calculating NPast_5 differences (present vs past 5 years)...\n") -for (i in 1:15) { - target <- target_NPast_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NPast_10: |present - past_10| -cat(" Calculating NPast_10 differences (present vs past 10 years)...\n") -for (i in 1:15) { - target <- target_NPast_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_5: |present - fut_5| -cat(" Calculating NFut_5 differences (present vs future 5 years)...\n") -for (i in 1:15) { - target <- target_NFut_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_10: |present - fut_10| -cat(" Calculating NFut_10 differences (present vs future 10 years)...\n") -for (i in 1:15) { - target <- target_NFut_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10past: |past_5 - past_10| -cat(" Calculating 5.10past differences (past 5 vs past 10 years)...\n") -for (i in 1:15) { - target <- target_5_10past[i] - source1 <- paste0("past_5_", items[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10fut: |fut_5 - fut_10| -cat(" Calculating 5.10fut differences (future 5 vs future 10 years)...\n") -for (i in 1:15) { - target <- target_5_10fut[i] - source1 <- paste0("fut_5_", items[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -cat("\n=== CALCULATION COMPLETE ===\n") -cat(" 90 difference columns created.\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function(row_num = NULL, item_num = NULL) { - # Pick a random row or use specified row - if (is.null(row_num)) { - random_row <- sample(1:nrow(df), 1) - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - } else { - if (row_num < 1 || row_num > nrow(df)) { - cat("ERROR: Row number must be between 1 and", nrow(df), "\n") - return() - } - random_row <- row_num - cat("\n========================================\n") - cat("QA CHECK: Specified Row #", random_row, "\n") - } - - # Pick a random item or use specified item - if (is.null(item_num)) { - test_item_idx <- sample(1:15, 1) - cat("Random Item 
#", test_item_idx, ": ", items[test_item_idx], "\n") - } else { - if (item_num < 1 || item_num > 15) { - cat("ERROR: Item number must be between 1 and 15\n") - return() - } - test_item_idx <- item_num - cat("Specified Item #", test_item_idx, ": ", items[test_item_idx], "\n") - } - - cat("========================================\n\n") - - calculations <- list( - list(name = "NPast_5", target = target_NPast_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_5_", items[test_item_idx]), - desc = "|present - past_5|"), - list(name = "NPast_10", target = target_NPast_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|present - past_10|"), - list(name = "NFut_5", target = target_NFut_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_5_", items[test_item_idx]), - desc = "|present - fut_5|"), - list(name = "NFut_10", target = target_NFut_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|present - fut_10|"), - list(name = "5.10past", target = target_5_10past[test_item_idx], - source1 = paste0("past_5_", items[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|past_5 - past_10|"), - list(name = "5.10fut", target = target_5_10fut[test_item_idx], - source1 = paste0("fut_5_", items[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|fut_5 - fut_10|") - ) - - for (calc in calculations) { - cat(sprintf("--- %s ---\n", calc$name)) - cat(sprintf("Formula: %s\n", calc$desc)) - - val1 <- if (calc$source1 %in% names(df)) df[random_row, calc$source1] else NA - val2 <- if (calc$source2 %in% names(df)) df[random_row, calc$source2] else NA - target_val <- df[random_row, calc$target] - - cat(sprintf(" %s: %s\n", calc$source1, ifelse(is.na(val1), "NA", as.character(val1)))) - cat(sprintf(" %s: %s\n", calc$source2, ifelse(is.na(val2), "NA", as.character(val2)))) - - if (!is.na(val1) && !is.na(val2)) { - expected_diff <- abs(val1 - val2) - cat(sprintf("\n Calculation: |%.5f - %.5f| = %.5f\n", val1, val2, expected_diff)) - cat(sprintf(" Target (%s): %.5f\n", calc$target, target_val)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_diff - target_val) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" Cannot calculate (missing values)\n") - } - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() #leave blank for random row; specify row number for specific row ex (qa_check_random_row(118)) - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER ROW/ITEM ***\n") -cat("For random row AND random item, run:\n") -cat(" qa_check_random_row()\n") -cat("\nFor specific row (e.g., row 118) with random item:\n") -cat(" qa_check_random_row(118)\n") -cat("\nFor random row with specific item (e.g., item 5 = pref_travel):\n") -cat(" qa_check_random_row(item_num = 5)\n") -cat("\nFor specific row AND specific item:\n") -cat(" qa_check_random_row(118, 5)\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - 
-cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 243 to save changes.\n") -cat("\nProcessing complete! 90 difference columns calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP 06 - time interval differences_20251001132516.r b/.history/eohi2/dataP 06 - time interval differences_20251001132516.r deleted file mode 100644 index 5954703..0000000 --- a/.history/eohi2/dataP 06 - time interval differences_20251001132516.r +++ /dev/null @@ -1,292 +0,0 @@ -# Script to calculate absolute differences between time intervals in eohi2.csv -# Compares present vs past/future, and 5-year vs 10-year intervals - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the 15 item names (same order for all time periods) -items <- c( - "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Note: present uses lowercase "tv", others use uppercase "TV" -items_present <- c( - "pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Define all source columns (75 total) -source_cols <- c( - paste0("present_", items_present), - paste0("past_5_", items), - paste0("past_10_", items), - paste0("fut_5_", items), - paste0("fut_10_", items) -) - -# Define all target columns (90 total = 6 calculation types × 15 items) -target_NPast_5 <- paste0("NPast_5_", items) -target_NPast_10 <- paste0("NPast_10_", items) -target_NFut_5 <- paste0("NFut_5_", items) -target_NFut_10 <- paste0("NFut_10_", items) -target_5_10past <- paste0("5.10past_", items) -target_5_10fut <- paste0("5.10fut_", items) - -target_cols <- c( - target_NPast_5, - target_NPast_10, - target_NFut_5, - target_NFut_10, - target_5_10past, - target_5_10fut -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 75 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0 && length(missing_source) <= 10) { - cat("\n Missing source columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} else if (length(missing_source) > 10) { - cat("\n Too many missing to list individually (", length(missing_source), "missing)\n") -} - -# Check Target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 90 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Target columns do NOT exist 
yet - will create them.\n") - if (length(existing_targets) > 0) { - cat(" WARNING: Some target columns already exist and will be overwritten.\n") - } -} else { - cat(" All target columns exist - will overwrite with calculated values.\n") -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 30) { - stop("ERROR: Too many source columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= CALCULATE DIFFERENCES ============= -cat("Calculating time interval differences...\n") - -# Convert source columns to numeric -for (col in source_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Helper function to calculate absolute difference -calc_abs_diff <- function(col1, col2) { - val1 <- if (col1 %in% names(df)) df[[col1]] else NA - val2 <- if (col2 %in% names(df)) df[[col2]] else NA - abs(val1 - val2) -} - -# Calculate NPast_5: |present - past_5| -cat(" Calculating NPast_5 differences (present vs past 5 years)...\n") -for (i in 1:15) { - target <- target_NPast_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NPast_10: |present - past_10| -cat(" Calculating NPast_10 differences (present vs past 10 years)...\n") -for (i in 1:15) { - target <- target_NPast_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_5: |present - fut_5| -cat(" Calculating NFut_5 differences (present vs future 5 years)...\n") -for (i in 1:15) { - target <- target_NFut_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_10: |present - fut_10| -cat(" Calculating NFut_10 differences (present vs future 10 years)...\n") -for (i in 1:15) { - target <- target_NFut_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10past: |past_5 - past_10| -cat(" Calculating 5.10past differences (past 5 vs past 10 years)...\n") -for (i in 1:15) { - target <- target_5_10past[i] - source1 <- paste0("past_5_", items[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10fut: |fut_5 - fut_10| -cat(" Calculating 5.10fut differences (future 5 vs future 10 years)...\n") -for (i in 1:15) { - target <- target_5_10fut[i] - source1 <- paste0("fut_5_", items[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -cat("\n=== CALCULATION COMPLETE ===\n") -cat(" 90 difference columns created.\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function(row_num = NULL, item_num = NULL) { - # Pick a random row or use specified row - if (is.null(row_num)) { - random_row <- sample(1:nrow(df), 1) - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - } else { - if (row_num < 1 || row_num > nrow(df)) { - cat("ERROR: Row number must be between 1 and", nrow(df), "\n") - return() - } - random_row <- row_num - cat("\n========================================\n") - cat("QA CHECK: Specified 
Row #", random_row, "\n") - } - - # Pick a random item or use specified item - if (is.null(item_num)) { - test_item_idx <- sample(1:15, 1) - cat("Random Item #", test_item_idx, ": ", items[test_item_idx], "\n") - } else { - if (item_num < 1 || item_num > 15) { - cat("ERROR: Item number must be between 1 and 15\n") - return() - } - test_item_idx <- item_num - cat("Specified Item #", test_item_idx, ": ", items[test_item_idx], "\n") - } - - cat("========================================\n\n") - - calculations <- list( - list(name = "NPast_5", target = target_NPast_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_5_", items[test_item_idx]), - desc = "|present - past_5|"), - list(name = "NPast_10", target = target_NPast_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|present - past_10|"), - list(name = "NFut_5", target = target_NFut_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_5_", items[test_item_idx]), - desc = "|present - fut_5|"), - list(name = "NFut_10", target = target_NFut_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|present - fut_10|"), - list(name = "5.10past", target = target_5_10past[test_item_idx], - source1 = paste0("past_5_", items[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|past_5 - past_10|"), - list(name = "5.10fut", target = target_5_10fut[test_item_idx], - source1 = paste0("fut_5_", items[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|fut_5 - fut_10|") - ) - - for (calc in calculations) { - cat(sprintf("--- %s ---\n", calc$name)) - cat(sprintf("Formula: %s\n", calc$desc)) - - val1 <- if (calc$source1 %in% names(df)) df[random_row, calc$source1] else NA - val2 <- if (calc$source2 %in% names(df)) df[random_row, calc$source2] else NA - target_val <- df[random_row, calc$target] - - cat(sprintf(" %s: %s\n", calc$source1, ifelse(is.na(val1), "NA", as.character(val1)))) - cat(sprintf(" %s: %s\n", calc$source2, ifelse(is.na(val2), "NA", as.character(val2)))) - - if (!is.na(val1) && !is.na(val2)) { - expected_diff <- abs(val1 - val2) - cat(sprintf("\n Calculation: |%.5f - %.5f| = %.5f\n", val1, val2, expected_diff)) - cat(sprintf(" Target (%s): %.5f\n", calc$target, target_val)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_diff - target_val) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" Cannot calculate (missing values)\n") - } - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on random row and random item -cat("\n\n") -qa_check_random_row() # Leave blank for random row & item; specify parameters as needed (see examples below) - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER ROW/ITEM ***\n") -cat("For random row AND random item, run:\n") -cat(" qa_check_random_row()\n") -cat("\nFor specific row (e.g., row 118) with random item:\n") -cat(" qa_check_random_row(118)\n") -cat("\nFor random row with specific item (e.g., item 5 = pref_travel):\n") -cat(" qa_check_random_row(item_num = 5)\n") -cat("\nFor specific row AND specific item:\n") -cat(" qa_check_random_row(118, 5)\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA 
values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -# write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment the write.csv() line to save changes.\n") -cat("\nProcessing complete! 90 difference columns calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP 06 - time interval differences_20251001132521.r b/.history/eohi2/dataP 06 - time interval differences_20251001132521.r deleted file mode 100644 index 4e7649e..0000000 --- a/.history/eohi2/dataP 06 - time interval differences_20251001132521.r +++ /dev/null @@ -1,292 +0,0 @@ -# Script to calculate absolute differences between time intervals in eohi2.csv -# Compares present vs past/future, and 5-year vs 10-year intervals - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the 15 item names (same order for all time periods) -items <- c( - "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Note: present uses lowercase "tv", others use uppercase "TV" -items_present <- c( - "pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Define all source columns (75 total) -source_cols <- c( - paste0("present_", items_present), - paste0("past_5_", items), - paste0("past_10_", items), - paste0("fut_5_", items), - paste0("fut_10_", items) -) - -# Define all target columns (90 total = 6 calculation types × 15 items) -target_NPast_5 <- paste0("NPast_5_", items) -target_NPast_10 <- paste0("NPast_10_", items) -target_NFut_5 <- paste0("NFut_5_", items) -target_NFut_10 <- paste0("NFut_10_", items) -target_5_10past <- paste0("5.10past_", items) -target_5_10fut <- paste0("5.10fut_", items) - -target_cols <- c( - target_NPast_5, - target_NPast_10, - target_NFut_5, - target_NFut_10, - target_5_10past, - target_5_10fut -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 75 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0 && length(missing_source) <= 10) { - cat("\n Missing source columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} else if (length(missing_source) > 10) { - cat("\n Too many missing to list individually (", length(missing_source), "missing)\n") -} - -# Check Target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 90 columns\n") -cat(" Found:", 
length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Target columns do NOT exist yet - will create them.\n") - if (length(existing_targets) > 0) { - cat(" WARNING: Some target columns already exist and will be overwritten.\n") - } -} else { - cat(" All target columns exist - will overwrite with calculated values.\n") -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 30) { - stop("ERROR: Too many source columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= CALCULATE DIFFERENCES ============= -cat("Calculating time interval differences...\n") - -# Convert source columns to numeric -for (col in source_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Helper function to calculate absolute difference -calc_abs_diff <- function(col1, col2) { - val1 <- if (col1 %in% names(df)) df[[col1]] else NA - val2 <- if (col2 %in% names(df)) df[[col2]] else NA - abs(val1 - val2) -} - -# Calculate NPast_5: |present - past_5| -cat(" Calculating NPast_5 differences (present vs past 5 years)...\n") -for (i in 1:15) { - target <- target_NPast_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NPast_10: |present - past_10| -cat(" Calculating NPast_10 differences (present vs past 10 years)...\n") -for (i in 1:15) { - target <- target_NPast_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_5: |present - fut_5| -cat(" Calculating NFut_5 differences (present vs future 5 years)...\n") -for (i in 1:15) { - target <- target_NFut_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_10: |present - fut_10| -cat(" Calculating NFut_10 differences (present vs future 10 years)...\n") -for (i in 1:15) { - target <- target_NFut_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10past: |past_5 - past_10| -cat(" Calculating 5.10past differences (past 5 vs past 10 years)...\n") -for (i in 1:15) { - target <- target_5_10past[i] - source1 <- paste0("past_5_", items[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10fut: |fut_5 - fut_10| -cat(" Calculating 5.10fut differences (future 5 vs future 10 years)...\n") -for (i in 1:15) { - target <- target_5_10fut[i] - source1 <- paste0("fut_5_", items[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -cat("\n=== CALCULATION COMPLETE ===\n") -cat(" 90 difference columns created.\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW & ITEM CHECK ============= -# This function can be run multiple times to check different random rows and items - -qa_check_random_row <- function(row_num = NULL, item_num = NULL) { - # Pick a random row or use specified row - if (is.null(row_num)) { - random_row <- sample(1:nrow(df), 1) - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - } else { - if (row_num < 1 || row_num > nrow(df)) { 
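# (Sketch, assuming the same 1..nrow(df) bounds as the check in progress here:
# base R's stopifnot() can express this validation in one line,
#   stopifnot(row_num >= 1, row_num <= nrow(df))
# at the cost of stopping with an error instead of printing and returning.)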
- cat("ERROR: Row number must be between 1 and", nrow(df), "\n") - return() - } - random_row <- row_num - cat("\n========================================\n") - cat("QA CHECK: Specified Row #", random_row, "\n") - } - - # Pick a random item or use specified item - if (is.null(item_num)) { - test_item_idx <- sample(1:15, 1) - cat("Random Item #", test_item_idx, ": ", items[test_item_idx], "\n") - } else { - if (item_num < 1 || item_num > 15) { - cat("ERROR: Item number must be between 1 and 15\n") - return() - } - test_item_idx <- item_num - cat("Specified Item #", test_item_idx, ": ", items[test_item_idx], "\n") - } - - cat("========================================\n\n") - - calculations <- list( - list(name = "NPast_5", target = target_NPast_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_5_", items[test_item_idx]), - desc = "|present - past_5|"), - list(name = "NPast_10", target = target_NPast_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|present - past_10|"), - list(name = "NFut_5", target = target_NFut_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_5_", items[test_item_idx]), - desc = "|present - fut_5|"), - list(name = "NFut_10", target = target_NFut_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|present - fut_10|"), - list(name = "5.10past", target = target_5_10past[test_item_idx], - source1 = paste0("past_5_", items[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|past_5 - past_10|"), - list(name = "5.10fut", target = target_5_10fut[test_item_idx], - source1 = paste0("fut_5_", items[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|fut_5 - fut_10|") - ) - - for (calc in calculations) { - cat(sprintf("--- %s ---\n", calc$name)) - cat(sprintf("Formula: %s\n", calc$desc)) - - val1 <- if (calc$source1 %in% names(df)) df[random_row, calc$source1] else NA - val2 <- if (calc$source2 %in% names(df)) df[random_row, calc$source2] else NA - target_val <- df[random_row, calc$target] - - cat(sprintf(" %s: %s\n", calc$source1, ifelse(is.na(val1), "NA", as.character(val1)))) - cat(sprintf(" %s: %s\n", calc$source2, ifelse(is.na(val2), "NA", as.character(val2)))) - - if (!is.na(val1) && !is.na(val2)) { - expected_diff <- abs(val1 - val2) - cat(sprintf("\n Calculation: |%.5f - %.5f| = %.5f\n", val1, val2, expected_diff)) - cat(sprintf(" Target (%s): %.5f\n", calc$target, target_val)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_diff - target_val) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" Cannot calculate (missing values)\n") - } - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on random row and random item -cat("\n\n") -qa_check_random_row() # Leave blank for random row & item; specify parameters as needed (see examples below) - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER ROW/ITEM ***\n") -cat("For random row AND random item, run:\n") -cat(" qa_check_random_row()\n") -cat("\nFor specific row (e.g., row 118) with random item:\n") -cat(" qa_check_random_row(118)\n") -cat("\nFor random row with specific item (e.g., item 5 = pref_travel):\n") -cat(" 
qa_check_random_row(item_num = 5)\n")
-cat("\nFor specific row AND specific item:\n")
-cat(" qa_check_random_row(118, 5)\n")
-cat("\n")
-
-
-# Save the modified dataframe back to CSV
-# na="" writes NA values as empty cells instead of "NA" text
-# COMMENTED OUT FOR REVIEW - Uncomment when ready to save
-# write.csv(df, "eohi2.csv", row.names = FALSE, na = "")
-
-cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n")
-cat("Review the output above, then uncomment the write.csv line above to save changes.\n")
-cat("\nProcessing complete! 90 difference columns calculated (not yet saved to file).\n")
diff --git a/.history/eohi2/dataP 06 - time interval differences_20251001132530.r b/.history/eohi2/dataP 06 - time interval differences_20251001132530.r
deleted file mode 100644
index a4565c2..0000000
--- a/.history/eohi2/dataP 06 - time interval differences_20251001132530.r
+++ /dev/null
@@ -1,292 +0,0 @@
-# Script to calculate absolute differences between time intervals in eohi2.csv
-# Compares present vs past/future, and 5-year vs 10-year intervals
-
-# Load necessary library
-library(dplyr)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-# Read the data (with check.names=FALSE to preserve original column names)
-# na.strings=NULL keeps empty cells as empty strings instead of converting to NA
-df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL)
-
-# Define the 15 item names (same order for all time periods)
-items <- c(
-  "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel",
-  "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex",
-  "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice"
-)
-
-# Note: present uses lowercase "tv", others use uppercase "TV"
-items_present <- c(
-  "pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel",
-  "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex",
-  "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice"
-)
-
-# Define all source columns (75 total)
-source_cols <- c(
-  paste0("present_", items_present),
-  paste0("past_5_", items),
-  paste0("past_10_", items),
-  paste0("fut_5_", items),
-  paste0("fut_10_", items)
-)
-
-# Define all target columns (90 total = 6 calculation types × 15 items)
-target_NPast_5 <- paste0("NPast_5_", items)
-target_NPast_10 <- paste0("NPast_10_", items)
-target_NFut_5 <- paste0("NFut_5_", items)
-target_NFut_10 <- paste0("NFut_10_", items)
-target_5_10past <- paste0("5.10past_", items)
-target_5_10fut <- paste0("5.10fut_", items)
-
-target_cols <- c(
-  target_NPast_5,
-  target_NPast_10,
-  target_NFut_5,
-  target_NFut_10,
-  target_5_10past,
-  target_5_10fut
-)
-
-# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE =============
-cat("\n=== COLUMN EXISTENCE CHECK ===\n\n")
-
-# Get actual column names from dataframe (trimmed)
-df_cols <- trimws(names(df))
-
-# Check Source columns
-missing_source <- source_cols[!source_cols %in% df_cols]
-existing_source <- source_cols[source_cols %in% df_cols]
-
-cat("Source Columns:\n")
-cat(" Expected: 75 columns\n")
-cat(" Found:", length(existing_source), "columns\n")
-cat(" Missing:", length(missing_source), "columns\n")
-
-if (length(missing_source) > 0 && length(missing_source) <= 10) {
-  cat("\n Missing source columns:\n")
-  for (col in missing_source) {
-    cat(" -", col, "\n")
-  }
-} else if (length(missing_source) > 10) {
-  cat("\n Too many missing to list individually (", length(missing_source), "missing)\n")
-}
-
-# Check Target columns
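As an aside on these repeated membership checks: base R's set operations express the same thing more compactly. A minimal sketch only, assuming the source_cols, target_cols, and df_cols vectors defined above in this snapshot:

# Equivalent, more compact column checks (sketch; assumes source_cols,
# target_cols, and df_cols as defined in the script above)
missing_source   <- setdiff(source_cols, df_cols)    # expected but absent
existing_source  <- intersect(source_cols, df_cols)  # expected and present
missing_targets  <- setdiff(target_cols, df_cols)
existing_targets <- intersect(target_cols, df_cols)

setdiff() and intersect() return unique values, which is equivalent to the bracket-subsetting used here because every expected column name is unique.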
-missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 90 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Target columns do NOT exist yet - will create them.\n") - if (length(existing_targets) > 0) { - cat(" WARNING: Some target columns already exist and will be overwritten.\n") - } -} else { - cat(" All target columns exist - will overwrite with calculated values.\n") -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 30) { - stop("ERROR: Too many source columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= CALCULATE DIFFERENCES ============= -cat("Calculating time interval differences...\n") - -# Convert source columns to numeric -for (col in source_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Helper function to calculate absolute difference -calc_abs_diff <- function(col1, col2) { - val1 <- if (col1 %in% names(df)) df[[col1]] else NA - val2 <- if (col2 %in% names(df)) df[[col2]] else NA - abs(val1 - val2) -} - -# Calculate NPast_5: |present - past_5| -cat(" Calculating NPast_5 differences (present vs past 5 years)...\n") -for (i in 1:15) { - target <- target_NPast_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NPast_10: |present - past_10| -cat(" Calculating NPast_10 differences (present vs past 10 years)...\n") -for (i in 1:15) { - target <- target_NPast_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_5: |present - fut_5| -cat(" Calculating NFut_5 differences (present vs future 5 years)...\n") -for (i in 1:15) { - target <- target_NFut_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_10: |present - fut_10| -cat(" Calculating NFut_10 differences (present vs future 10 years)...\n") -for (i in 1:15) { - target <- target_NFut_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10past: |past_5 - past_10| -cat(" Calculating 5.10past differences (past 5 vs past 10 years)...\n") -for (i in 1:15) { - target <- target_5_10past[i] - source1 <- paste0("past_5_", items[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10fut: |fut_5 - fut_10| -cat(" Calculating 5.10fut differences (future 5 vs future 10 years)...\n") -for (i in 1:15) { - target <- target_5_10fut[i] - source1 <- paste0("fut_5_", items[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -cat("\n=== CALCULATION COMPLETE ===\n") -cat(" 90 difference columns created.\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW & ITEM CHECK ============= -# This function can be run multiple times to check different random rows and items - -qa_check_random_row <- function(row_num = NULL, item_num = NULL) { - # Pick a random row or use specified row - if (is.null(row_num)) { 
- random_row <- sample(seq_len(nrow(df)), 1) - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - } else { - if (row_num < 1 || row_num > nrow(df)) { - cat("ERROR: Row number must be between 1 and", nrow(df), "\n") - return() - } - random_row <- row_num - cat("\n========================================\n") - cat("QA CHECK: Specified Row #", random_row, "\n") - } - - # Pick a random item or use specified item - if (is.null(item_num)) { - test_item_idx <- sample(1:15, 1) - cat("Random Item #", test_item_idx, ": ", items[test_item_idx], "\n") - } else { - if (item_num < 1 || item_num > 15) { - cat("ERROR: Item number must be between 1 and 15\n") - return() - } - test_item_idx <- item_num - cat("Specified Item #", test_item_idx, ": ", items[test_item_idx], "\n") - } - - cat("========================================\n\n") - - calculations <- list( - list(name = "NPast_5", target = target_NPast_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_5_", items[test_item_idx]), - desc = "|present - past_5|"), - list(name = "NPast_10", target = target_NPast_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|present - past_10|"), - list(name = "NFut_5", target = target_NFut_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_5_", items[test_item_idx]), - desc = "|present - fut_5|"), - list(name = "NFut_10", target = target_NFut_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|present - fut_10|"), - list(name = "5.10past", target = target_5_10past[test_item_idx], - source1 = paste0("past_5_", items[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|past_5 - past_10|"), - list(name = "5.10fut", target = target_5_10fut[test_item_idx], - source1 = paste0("fut_5_", items[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|fut_5 - fut_10|") - ) - - for (calc in calculations) { - cat(sprintf("--- %s ---\n", calc$name)) - cat(sprintf("Formula: %s\n", calc$desc)) - - val1 <- if (calc$source1 %in% names(df)) df[random_row, calc$source1] else NA - val2 <- if (calc$source2 %in% names(df)) df[random_row, calc$source2] else NA - target_val <- df[random_row, calc$target] - - cat(sprintf(" %s: %s\n", calc$source1, ifelse(is.na(val1), "NA", as.character(val1)))) - cat(sprintf(" %s: %s\n", calc$source2, ifelse(is.na(val2), "NA", as.character(val2)))) - - if (!is.na(val1) && !is.na(val2)) { - expected_diff <- abs(val1 - val2) - cat(sprintf("\n Calculation: |%.5f - %.5f| = %.5f\n", val1, val2, expected_diff)) - cat(sprintf(" Target (%s): %.5f\n", calc$target, target_val)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_diff - target_val) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" Cannot calculate (missing values)\n") - } - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on random row and random item -cat("\n\n") -qa_check_random_row() # Leave blank for random row & item; specify parameters as needed (see examples below) - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER ROW/ITEM ***\n") -cat("For random row AND random item, run:\n") -cat(" 
qa_check_random_row()\n")
-cat("\nFor specific row (e.g., row 118) with random item:\n")
-cat(" qa_check_random_row(118)\n")
-cat("\nFor random row with specific item (e.g., item 5 = pref_travel):\n")
-cat(" qa_check_random_row(item_num = 5)\n")
-cat("\nFor specific row AND specific item:\n")
-cat(" qa_check_random_row(118, 5)\n")
-cat("\n")
-
-
-# Save the modified dataframe back to CSV
-# na="" writes NA values as empty cells instead of "NA" text
-# COMMENTED OUT FOR REVIEW - Uncomment when ready to save
-# write.csv(df, "eohi2.csv", row.names = FALSE, na = "")
-
-cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n")
-cat("Review the output above, then uncomment the write.csv line above to save changes.\n")
-cat("\nProcessing complete! 90 difference columns calculated (not yet saved to file).\n")
diff --git a/.history/eohi2/dataP 06 - time interval differences_20251001132540.r b/.history/eohi2/dataP 06 - time interval differences_20251001132540.r
deleted file mode 100644
index a4565c2..0000000
--- a/.history/eohi2/dataP 06 - time interval differences_20251001132540.r
+++ /dev/null
@@ -1,292 +0,0 @@
-# Script to calculate absolute differences between time intervals in eohi2.csv
-# Compares present vs past/future, and 5-year vs 10-year intervals
-
-# Load necessary library
-library(dplyr)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-# Read the data (with check.names=FALSE to preserve original column names)
-# na.strings=NULL keeps empty cells as empty strings instead of converting to NA
-df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL)
-
-# Define the 15 item names (same order for all time periods)
-items <- c(
-  "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel",
-  "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex",
-  "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice"
-)
-
-# Note: present uses lowercase "tv", others use uppercase "TV"
-items_present <- c(
-  "pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel",
-  "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex",
-  "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice"
-)
-
-# Define all source columns (75 total)
-source_cols <- c(
-  paste0("present_", items_present),
-  paste0("past_5_", items),
-  paste0("past_10_", items),
-  paste0("fut_5_", items),
-  paste0("fut_10_", items)
-)
-
-# Define all target columns (90 total = 6 calculation types × 15 items)
-target_NPast_5 <- paste0("NPast_5_", items)
-target_NPast_10 <- paste0("NPast_10_", items)
-target_NFut_5 <- paste0("NFut_5_", items)
-target_NFut_10 <- paste0("NFut_10_", items)
-target_5_10past <- paste0("5.10past_", items)
-target_5_10fut <- paste0("5.10fut_", items)
-
-target_cols <- c(
-  target_NPast_5,
-  target_NPast_10,
-  target_NFut_5,
-  target_NFut_10,
-  target_5_10past,
-  target_5_10fut
-)
-
-# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE =============
-cat("\n=== COLUMN EXISTENCE CHECK ===\n\n")
-
-# Get actual column names from dataframe (trimmed)
-df_cols <- trimws(names(df))
-
-# Check Source columns
-missing_source <- source_cols[!source_cols %in% df_cols]
-existing_source <- source_cols[source_cols %in% df_cols]
-
-cat("Source Columns:\n")
-cat(" Expected: 75 columns\n")
-cat(" Found:", length(existing_source), "columns\n")
-cat(" Missing:", length(missing_source), "columns\n")
-
-if (length(missing_source) > 0 && length(missing_source) <= 10) {
-  cat("\n Missing source columns:\n")
-  for (col in 
missing_source) { - cat(" -", col, "\n") - } -} else if (length(missing_source) > 10) { - cat("\n Too many missing to list individually (", length(missing_source), "missing)\n") -} - -# Check Target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 90 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Target columns do NOT exist yet - will create them.\n") - if (length(existing_targets) > 0) { - cat(" WARNING: Some target columns already exist and will be overwritten.\n") - } -} else { - cat(" All target columns exist - will overwrite with calculated values.\n") -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 30) { - stop("ERROR: Too many source columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= CALCULATE DIFFERENCES ============= -cat("Calculating time interval differences...\n") - -# Convert source columns to numeric -for (col in source_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Helper function to calculate absolute difference -calc_abs_diff <- function(col1, col2) { - val1 <- if (col1 %in% names(df)) df[[col1]] else NA - val2 <- if (col2 %in% names(df)) df[[col2]] else NA - abs(val1 - val2) -} - -# Calculate NPast_5: |present - past_5| -cat(" Calculating NPast_5 differences (present vs past 5 years)...\n") -for (i in 1:15) { - target <- target_NPast_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NPast_10: |present - past_10| -cat(" Calculating NPast_10 differences (present vs past 10 years)...\n") -for (i in 1:15) { - target <- target_NPast_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_5: |present - fut_5| -cat(" Calculating NFut_5 differences (present vs future 5 years)...\n") -for (i in 1:15) { - target <- target_NFut_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_10: |present - fut_10| -cat(" Calculating NFut_10 differences (present vs future 10 years)...\n") -for (i in 1:15) { - target <- target_NFut_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10past: |past_5 - past_10| -cat(" Calculating 5.10past differences (past 5 vs past 10 years)...\n") -for (i in 1:15) { - target <- target_5_10past[i] - source1 <- paste0("past_5_", items[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10fut: |fut_5 - fut_10| -cat(" Calculating 5.10fut differences (future 5 vs future 10 years)...\n") -for (i in 1:15) { - target <- target_5_10fut[i] - source1 <- paste0("fut_5_", items[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -cat("\n=== CALCULATION COMPLETE ===\n") -cat(" 90 difference columns created.\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW & ITEM CHECK ============= -# This 
function can be run multiple times to check different random rows and items - -qa_check_random_row <- function(row_num = NULL, item_num = NULL) { - # Pick a random row or use specified row - if (is.null(row_num)) { - random_row <- sample(seq_len(nrow(df)), 1) - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - } else { - if (row_num < 1 || row_num > nrow(df)) { - cat("ERROR: Row number must be between 1 and", nrow(df), "\n") - return() - } - random_row <- row_num - cat("\n========================================\n") - cat("QA CHECK: Specified Row #", random_row, "\n") - } - - # Pick a random item or use specified item - if (is.null(item_num)) { - test_item_idx <- sample(1:15, 1) - cat("Random Item #", test_item_idx, ": ", items[test_item_idx], "\n") - } else { - if (item_num < 1 || item_num > 15) { - cat("ERROR: Item number must be between 1 and 15\n") - return() - } - test_item_idx <- item_num - cat("Specified Item #", test_item_idx, ": ", items[test_item_idx], "\n") - } - - cat("========================================\n\n") - - calculations <- list( - list(name = "NPast_5", target = target_NPast_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_5_", items[test_item_idx]), - desc = "|present - past_5|"), - list(name = "NPast_10", target = target_NPast_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|present - past_10|"), - list(name = "NFut_5", target = target_NFut_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_5_", items[test_item_idx]), - desc = "|present - fut_5|"), - list(name = "NFut_10", target = target_NFut_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|present - fut_10|"), - list(name = "5.10past", target = target_5_10past[test_item_idx], - source1 = paste0("past_5_", items[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|past_5 - past_10|"), - list(name = "5.10fut", target = target_5_10fut[test_item_idx], - source1 = paste0("fut_5_", items[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|fut_5 - fut_10|") - ) - - for (calc in calculations) { - cat(sprintf("--- %s ---\n", calc$name)) - cat(sprintf("Formula: %s\n", calc$desc)) - - val1 <- if (calc$source1 %in% names(df)) df[random_row, calc$source1] else NA - val2 <- if (calc$source2 %in% names(df)) df[random_row, calc$source2] else NA - target_val <- df[random_row, calc$target] - - cat(sprintf(" %s: %s\n", calc$source1, ifelse(is.na(val1), "NA", as.character(val1)))) - cat(sprintf(" %s: %s\n", calc$source2, ifelse(is.na(val2), "NA", as.character(val2)))) - - if (!is.na(val1) && !is.na(val2)) { - expected_diff <- abs(val1 - val2) - cat(sprintf("\n Calculation: |%.5f - %.5f| = %.5f\n", val1, val2, expected_diff)) - cat(sprintf(" Target (%s): %.5f\n", calc$target, target_val)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_diff - target_val) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" Cannot calculate (missing values)\n") - } - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on random row and random item -cat("\n\n") -qa_check_random_row() # Leave blank for random row & item; specify 
parameters as needed (see examples below)
-
-# Instructions for running additional checks
-cat("\n")
-cat("*** TO CHECK ANOTHER ROW/ITEM ***\n")
-cat("For random row AND random item, run:\n")
-cat(" qa_check_random_row()\n")
-cat("\nFor specific row (e.g., row 118) with random item:\n")
-cat(" qa_check_random_row(118)\n")
-cat("\nFor random row with specific item (e.g., item 5 = pref_travel):\n")
-cat(" qa_check_random_row(item_num = 5)\n")
-cat("\nFor specific row AND specific item:\n")
-cat(" qa_check_random_row(118, 5)\n")
-cat("\n")
-
-
-# Save the modified dataframe back to CSV
-# na="" writes NA values as empty cells instead of "NA" text
-# COMMENTED OUT FOR REVIEW - Uncomment when ready to save
-# write.csv(df, "eohi2.csv", row.names = FALSE, na = "")
-
-cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n")
-cat("Review the output above, then uncomment the write.csv line above to save changes.\n")
-cat("\nProcessing complete! 90 difference columns calculated (not yet saved to file).\n")
diff --git a/.history/eohi2/dataP 06 - time interval differences_20251001132559.r b/.history/eohi2/dataP 06 - time interval differences_20251001132559.r
deleted file mode 100644
index a4565c2..0000000
--- a/.history/eohi2/dataP 06 - time interval differences_20251001132559.r
+++ /dev/null
@@ -1,292 +0,0 @@
-# Script to calculate absolute differences between time intervals in eohi2.csv
-# Compares present vs past/future, and 5-year vs 10-year intervals
-
-# Load necessary library
-library(dplyr)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-# Read the data (with check.names=FALSE to preserve original column names)
-# na.strings=NULL keeps empty cells as empty strings instead of converting to NA
-df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL)
-
-# Define the 15 item names (same order for all time periods)
-items <- c(
-  "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel",
-  "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex",
-  "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice"
-)
-
-# Note: present uses lowercase "tv", others use uppercase "TV"
-items_present <- c(
-  "pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel",
-  "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex",
-  "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice"
-)
-
-# Define all source columns (75 total)
-source_cols <- c(
-  paste0("present_", items_present),
-  paste0("past_5_", items),
-  paste0("past_10_", items),
-  paste0("fut_5_", items),
-  paste0("fut_10_", items)
-)
-
-# Define all target columns (90 total = 6 calculation types × 15 items)
-target_NPast_5 <- paste0("NPast_5_", items)
-target_NPast_10 <- paste0("NPast_10_", items)
-target_NFut_5 <- paste0("NFut_5_", items)
-target_NFut_10 <- paste0("NFut_10_", items)
-target_5_10past <- paste0("5.10past_", items)
-target_5_10fut <- paste0("5.10fut_", items)
-
-target_cols <- c(
-  target_NPast_5,
-  target_NPast_10,
-  target_NFut_5,
-  target_NFut_10,
-  target_5_10past,
-  target_5_10fut
-)
-
-# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE =============
-cat("\n=== COLUMN EXISTENCE CHECK ===\n\n")
-
-# Get actual column names from dataframe (trimmed)
-df_cols <- trimws(names(df))
-
-# Check Source columns
-missing_source <- source_cols[!source_cols %in% df_cols]
-existing_source <- source_cols[source_cols %in% df_cols]
-
-cat("Source Columns:\n")
-cat(" Expected: 75 columns\n")
-cat(" Found:",
length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0 && length(missing_source) <= 10) { - cat("\n Missing source columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} else if (length(missing_source) > 10) { - cat("\n Too many missing to list individually (", length(missing_source), "missing)\n") -} - -# Check Target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 90 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Target columns do NOT exist yet - will create them.\n") - if (length(existing_targets) > 0) { - cat(" WARNING: Some target columns already exist and will be overwritten.\n") - } -} else { - cat(" All target columns exist - will overwrite with calculated values.\n") -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 30) { - stop("ERROR: Too many source columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= CALCULATE DIFFERENCES ============= -cat("Calculating time interval differences...\n") - -# Convert source columns to numeric -for (col in source_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Helper function to calculate absolute difference -calc_abs_diff <- function(col1, col2) { - val1 <- if (col1 %in% names(df)) df[[col1]] else NA - val2 <- if (col2 %in% names(df)) df[[col2]] else NA - abs(val1 - val2) -} - -# Calculate NPast_5: |present - past_5| -cat(" Calculating NPast_5 differences (present vs past 5 years)...\n") -for (i in 1:15) { - target <- target_NPast_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NPast_10: |present - past_10| -cat(" Calculating NPast_10 differences (present vs past 10 years)...\n") -for (i in 1:15) { - target <- target_NPast_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_5: |present - fut_5| -cat(" Calculating NFut_5 differences (present vs future 5 years)...\n") -for (i in 1:15) { - target <- target_NFut_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_10: |present - fut_10| -cat(" Calculating NFut_10 differences (present vs future 10 years)...\n") -for (i in 1:15) { - target <- target_NFut_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10past: |past_5 - past_10| -cat(" Calculating 5.10past differences (past 5 vs past 10 years)...\n") -for (i in 1:15) { - target <- target_5_10past[i] - source1 <- paste0("past_5_", items[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10fut: |fut_5 - fut_10| -cat(" Calculating 5.10fut differences (future 5 vs future 10 years)...\n") -for (i in 1:15) { - target <- target_5_10fut[i] - source1 <- paste0("fut_5_", items[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- 
calc_abs_diff(source1, source2) -} - -cat("\n=== CALCULATION COMPLETE ===\n") -cat(" 90 difference columns created.\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW & ITEM CHECK ============= -# This function can be run multiple times to check different random rows and items - -qa_check_random_row <- function(row_num = NULL, item_num = NULL) { - # Pick a random row or use specified row - if (is.null(row_num)) { - random_row <- sample(seq_len(nrow(df)), 1) - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - } else { - if (row_num < 1 || row_num > nrow(df)) { - cat("ERROR: Row number must be between 1 and", nrow(df), "\n") - return() - } - random_row <- row_num - cat("\n========================================\n") - cat("QA CHECK: Specified Row #", random_row, "\n") - } - - # Pick a random item or use specified item - if (is.null(item_num)) { - test_item_idx <- sample(1:15, 1) - cat("Random Item #", test_item_idx, ": ", items[test_item_idx], "\n") - } else { - if (item_num < 1 || item_num > 15) { - cat("ERROR: Item number must be between 1 and 15\n") - return() - } - test_item_idx <- item_num - cat("Specified Item #", test_item_idx, ": ", items[test_item_idx], "\n") - } - - cat("========================================\n\n") - - calculations <- list( - list(name = "NPast_5", target = target_NPast_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_5_", items[test_item_idx]), - desc = "|present - past_5|"), - list(name = "NPast_10", target = target_NPast_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|present - past_10|"), - list(name = "NFut_5", target = target_NFut_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_5_", items[test_item_idx]), - desc = "|present - fut_5|"), - list(name = "NFut_10", target = target_NFut_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|present - fut_10|"), - list(name = "5.10past", target = target_5_10past[test_item_idx], - source1 = paste0("past_5_", items[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|past_5 - past_10|"), - list(name = "5.10fut", target = target_5_10fut[test_item_idx], - source1 = paste0("fut_5_", items[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|fut_5 - fut_10|") - ) - - for (calc in calculations) { - cat(sprintf("--- %s ---\n", calc$name)) - cat(sprintf("Formula: %s\n", calc$desc)) - - val1 <- if (calc$source1 %in% names(df)) df[random_row, calc$source1] else NA - val2 <- if (calc$source2 %in% names(df)) df[random_row, calc$source2] else NA - target_val <- df[random_row, calc$target] - - cat(sprintf(" %s: %s\n", calc$source1, ifelse(is.na(val1), "NA", as.character(val1)))) - cat(sprintf(" %s: %s\n", calc$source2, ifelse(is.na(val2), "NA", as.character(val2)))) - - if (!is.na(val1) && !is.na(val2)) { - expected_diff <- abs(val1 - val2) - cat(sprintf("\n Calculation: |%.5f - %.5f| = %.5f\n", val1, val2, expected_diff)) - cat(sprintf(" Target (%s): %.5f\n", calc$target, target_val)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_diff - target_val) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" Cannot calculate (missing values)\n") - } - cat("\n") - } - - cat("========================================\n") - 
cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on random row and random item -cat("\n\n") -qa_check_random_row() # Leave blank for random row & item; specify parameters as needed (see examples below) - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER ROW/ITEM ***\n") -cat("For random row AND random item, run:\n") -cat(" qa_check_random_row()\n") -cat("\nFor specific row (e.g., row 118) with random item:\n") -cat(" qa_check_random_row(118)\n") -cat("\nFor random row with specific item (e.g., item 5 = pref_travel):\n") -cat(" qa_check_random_row(item_num = 5)\n") -cat("\nFor specific row AND specific item:\n") -cat(" qa_check_random_row(118, 5)\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 243 to save changes.\n") -cat("\nProcessing complete! 90 difference columns calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP 06 - time interval differences_20251001132623.r b/.history/eohi2/dataP 06 - time interval differences_20251001132623.r deleted file mode 100644 index 16b6670..0000000 --- a/.history/eohi2/dataP 06 - time interval differences_20251001132623.r +++ /dev/null @@ -1,292 +0,0 @@ -# Script to calculate absolute differences between time intervals in eohi2.csv -# Compares present vs past/future, and 5-year vs 10-year intervals - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the 15 item names (same order for all time periods) -items <- c( - "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Note: present uses lowercase "tv", others use uppercase "TV" -items_present <- c( - "pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Define all source columns (75 total) -source_cols <- c( - paste0("present_", items_present), - paste0("past_5_", items), - paste0("past_10_", items), - paste0("fut_5_", items), - paste0("fut_10_", items) -) - -# Define all target columns (90 total = 6 calculation types × 15 items) -target_NPast_5 <- paste0("NPast_5_", items) -target_NPast_10 <- paste0("NPast_10_", items) -target_NFut_5 <- paste0("NFut_5_", items) -target_NFut_10 <- paste0("NFut_10_", items) -target_5_10past <- paste0("5.10past_", items) -target_5_10fut <- paste0("5.10fut_", items) - -target_cols <- c( - target_NPast_5, - target_NPast_10, - target_NFut_5, - target_NFut_10, - target_5_10past, - target_5_10fut -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source columns 
-missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 75 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0 && length(missing_source) <= 10) { - cat("\n Missing source columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} else if (length(missing_source) > 10) { - cat("\n Too many missing to list individually (", length(missing_source), "missing)\n") -} - -# Check Target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 90 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Target columns do NOT exist yet - will create them.\n") - if (length(existing_targets) > 0) { - cat(" WARNING: Some target columns already exist and will be overwritten.\n") - } -} else { - cat(" All target columns exist - will overwrite with calculated values.\n") -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 30) { - stop("ERROR: Too many source columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= CALCULATE DIFFERENCES ============= -cat("Calculating time interval differences...\n") - -# Convert source columns to numeric -for (col in source_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Helper function to calculate absolute difference -calc_abs_diff <- function(col1, col2) { - val1 <- if (col1 %in% names(df)) df[[col1]] else NA - val2 <- if (col2 %in% names(df)) df[[col2]] else NA - abs(val1 - val2) -} - -# Calculate NPast_5: |present - past_5| -cat(" Calculating NPast_5 differences (present vs past 5 years)...\n") -for (i in 1:15) { - target <- target_NPast_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NPast_10: |present - past_10| -cat(" Calculating NPast_10 differences (present vs past 10 years)...\n") -for (i in 1:15) { - target <- target_NPast_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_5: |present - fut_5| -cat(" Calculating NFut_5 differences (present vs future 5 years)...\n") -for (i in 1:15) { - target <- target_NFut_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_10: |present - fut_10| -cat(" Calculating NFut_10 differences (present vs future 10 years)...\n") -for (i in 1:15) { - target <- target_NFut_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10past: |past_5 - past_10| -cat(" Calculating 5.10past differences (past 5 vs past 10 years)...\n") -for (i in 1:15) { - target <- target_5_10past[i] - source1 <- paste0("past_5_", items[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10fut: |fut_5 - fut_10| -cat(" Calculating 5.10fut 
differences (future 5 vs future 10 years)...\n") -for (i in 1:15) { - target <- target_5_10fut[i] - source1 <- paste0("fut_5_", items[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -cat("\n=== CALCULATION COMPLETE ===\n") -cat(" 90 difference columns created.\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW & ITEM CHECK ============= -# This function can be run multiple times to check different random rows and items - -qa_check_random_row <- function(row_num = NULL, item_num = NULL) { - # Pick a random row or use specified row - if (is.null(row_num)) { - random_row <- sample(seq_len(nrow(df)), 1) - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - } else { - if (row_num < 1 || row_num > nrow(df)) { - cat("ERROR: Row number must be between 1 and", nrow(df), "\n") - return() - } - random_row <- row_num - cat("\n========================================\n") - cat("QA CHECK: Specified Row #", random_row, "\n") - } - - # Pick a random item or use specified item - if (is.null(item_num)) { - test_item_idx <- sample(1:15, 1) - cat("Random Item #", test_item_idx, ": ", items[test_item_idx], "\n") - } else { - if (item_num < 1 || item_num > 15) { - cat("ERROR: Item number must be between 1 and 15\n") - return() - } - test_item_idx <- item_num - cat("Specified Item #", test_item_idx, ": ", items[test_item_idx], "\n") - } - - cat("========================================\n\n") - - calculations <- list( - list(name = "NPast_5", target = target_NPast_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_5_", items[test_item_idx]), - desc = "|present - past_5|"), - list(name = "NPast_10", target = target_NPast_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|present - past_10|"), - list(name = "NFut_5", target = target_NFut_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_5_", items[test_item_idx]), - desc = "|present - fut_5|"), - list(name = "NFut_10", target = target_NFut_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|present - fut_10|"), - list(name = "5.10past", target = target_5_10past[test_item_idx], - source1 = paste0("past_5_", items[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|past_5 - past_10|"), - list(name = "5.10fut", target = target_5_10fut[test_item_idx], - source1 = paste0("fut_5_", items[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|fut_5 - fut_10|") - ) - - for (calc in calculations) { - cat(sprintf("--- %s ---\n", calc$name)) - cat(sprintf("Formula: %s\n", calc$desc)) - - val1 <- if (calc$source1 %in% names(df)) df[random_row, calc$source1] else NA - val2 <- if (calc$source2 %in% names(df)) df[random_row, calc$source2] else NA - target_val <- df[random_row, calc$target] - - cat(sprintf(" %s: %s\n", calc$source1, ifelse(is.na(val1), "NA", as.character(val1)))) - cat(sprintf(" %s: %s\n", calc$source2, ifelse(is.na(val2), "NA", as.character(val2)))) - - if (!is.na(val1) && !is.na(val2)) { - expected_diff <- abs(val1 - val2) - cat(sprintf("\n Calculation: |%.5f - %.5f| = %.5f\n", val1, val2, expected_diff)) - cat(sprintf(" Target (%s): %.5f\n", calc$target, target_val)) - cat(sprintf(" Match: %s\n", 
ifelse(abs(expected_diff - target_val) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" Cannot calculate (missing values)\n") - } - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on random row and random item -cat("\n\n") -qa_check_random_row() # Leave blank for random row & item; specify parameters as needed (see examples below) - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER ROW/ITEM ***\n") -cat("For random row AND random item, run:\n") -cat(" qa_check_random_row()\n") -cat("\nFor specific row (e.g., row 118) with random item:\n") -cat(" qa_check_random_row(118)\n") -cat("\nFor random row with specific item (e.g., item 5 = pref_travel):\n") -cat(" qa_check_random_row(item_num = 5)\n") -cat("\nFor specific row AND specific item:\n") -cat(" qa_check_random_row(118, 5)\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -# write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 243 to save changes.\n") -cat("\nProcessing complete! 90 difference columns calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP 06 - time interval differences_20251001132907.r b/.history/eohi2/dataP 06 - time interval differences_20251001132907.r deleted file mode 100644 index 16b6670..0000000 --- a/.history/eohi2/dataP 06 - time interval differences_20251001132907.r +++ /dev/null @@ -1,292 +0,0 @@ -# Script to calculate absolute differences between time intervals in eohi2.csv -# Compares present vs past/future, and 5-year vs 10-year intervals - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the 15 item names (same order for all time periods) -items <- c( - "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Note: present uses lowercase "tv", others use uppercase "TV" -items_present <- c( - "pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Define all source columns (75 total) -source_cols <- c( - paste0("present_", items_present), - paste0("past_5_", items), - paste0("past_10_", items), - paste0("fut_5_", items), - paste0("fut_10_", items) -) - -# Define all target columns (90 total = 6 calculation types × 15 items) -target_NPast_5 <- paste0("NPast_5_", items) -target_NPast_10 <- paste0("NPast_10_", items) -target_NFut_5 <- paste0("NFut_5_", items) -target_NFut_10 <- paste0("NFut_10_", items) -target_5_10past <- paste0("5.10past_", items) -target_5_10fut <- paste0("5.10fut_", items) - -target_cols <- c( - target_NPast_5, - target_NPast_10, - target_NFut_5, - target_NFut_10, - target_5_10past, - target_5_10fut -) - -# ============= 
TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 75 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0 && length(missing_source) <= 10) { - cat("\n Missing source columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} else if (length(missing_source) > 10) { - cat("\n Too many missing to list individually (", length(missing_source), "missing)\n") -} - -# Check Target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 90 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Target columns do NOT exist yet - will create them.\n") - if (length(existing_targets) > 0) { - cat(" WARNING: Some target columns already exist and will be overwritten.\n") - } -} else { - cat(" All target columns exist - will overwrite with calculated values.\n") -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 30) { - stop("ERROR: Too many source columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= CALCULATE DIFFERENCES ============= -cat("Calculating time interval differences...\n") - -# Convert source columns to numeric -for (col in source_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Helper function to calculate absolute difference -calc_abs_diff <- function(col1, col2) { - val1 <- if (col1 %in% names(df)) df[[col1]] else NA - val2 <- if (col2 %in% names(df)) df[[col2]] else NA - abs(val1 - val2) -} - -# Calculate NPast_5: |present - past_5| -cat(" Calculating NPast_5 differences (present vs past 5 years)...\n") -for (i in 1:15) { - target <- target_NPast_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NPast_10: |present - past_10| -cat(" Calculating NPast_10 differences (present vs past 10 years)...\n") -for (i in 1:15) { - target <- target_NPast_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_5: |present - fut_5| -cat(" Calculating NFut_5 differences (present vs future 5 years)...\n") -for (i in 1:15) { - target <- target_NFut_5[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_5_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate NFut_10: |present - fut_10| -cat(" Calculating NFut_10 differences (present vs future 10 years)...\n") -for (i in 1:15) { - target <- target_NFut_10[i] - source1 <- paste0("present_", items_present[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10past: |past_5 - past_10| -cat(" Calculating 5.10past differences (past 5 vs past 10 years)...\n") -for (i in 1:15) { - target <- target_5_10past[i] - 
source1 <- paste0("past_5_", items[i]) - source2 <- paste0("past_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -# Calculate 5.10fut: |fut_5 - fut_10| -cat(" Calculating 5.10fut differences (future 5 vs future 10 years)...\n") -for (i in 1:15) { - target <- target_5_10fut[i] - source1 <- paste0("fut_5_", items[i]) - source2 <- paste0("fut_10_", items[i]) - df[[target]] <- calc_abs_diff(source1, source2) -} - -cat("\n=== CALCULATION COMPLETE ===\n") -cat(" 90 difference columns created.\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW & ITEM CHECK ============= -# This function can be run multiple times to check different random rows and items - -qa_check_random_row <- function(row_num = NULL, item_num = NULL) { - # Pick a random row or use specified row - if (is.null(row_num)) { - random_row <- sample(seq_len(nrow(df)), 1) - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - } else { - if (row_num < 1 || row_num > nrow(df)) { - cat("ERROR: Row number must be between 1 and", nrow(df), "\n") - return() - } - random_row <- row_num - cat("\n========================================\n") - cat("QA CHECK: Specified Row #", random_row, "\n") - } - - # Pick a random item or use specified item - if (is.null(item_num)) { - test_item_idx <- sample(1:15, 1) - cat("Random Item #", test_item_idx, ": ", items[test_item_idx], "\n") - } else { - if (item_num < 1 || item_num > 15) { - cat("ERROR: Item number must be between 1 and 15\n") - return() - } - test_item_idx <- item_num - cat("Specified Item #", test_item_idx, ": ", items[test_item_idx], "\n") - } - - cat("========================================\n\n") - - calculations <- list( - list(name = "NPast_5", target = target_NPast_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_5_", items[test_item_idx]), - desc = "|present - past_5|"), - list(name = "NPast_10", target = target_NPast_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|present - past_10|"), - list(name = "NFut_5", target = target_NFut_5[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_5_", items[test_item_idx]), - desc = "|present - fut_5|"), - list(name = "NFut_10", target = target_NFut_10[test_item_idx], - source1 = paste0("present_", items_present[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|present - fut_10|"), - list(name = "5.10past", target = target_5_10past[test_item_idx], - source1 = paste0("past_5_", items[test_item_idx]), - source2 = paste0("past_10_", items[test_item_idx]), - desc = "|past_5 - past_10|"), - list(name = "5.10fut", target = target_5_10fut[test_item_idx], - source1 = paste0("fut_5_", items[test_item_idx]), - source2 = paste0("fut_10_", items[test_item_idx]), - desc = "|fut_5 - fut_10|") - ) - - for (calc in calculations) { - cat(sprintf("--- %s ---\n", calc$name)) - cat(sprintf("Formula: %s\n", calc$desc)) - - val1 <- if (calc$source1 %in% names(df)) df[random_row, calc$source1] else NA - val2 <- if (calc$source2 %in% names(df)) df[random_row, calc$source2] else NA - target_val <- df[random_row, calc$target] - - cat(sprintf(" %s: %s\n", calc$source1, ifelse(is.na(val1), "NA", as.character(val1)))) - cat(sprintf(" %s: %s\n", calc$source2, ifelse(is.na(val2), "NA", as.character(val2)))) - - if (!is.na(val1) && !is.na(val2)) { - expected_diff <- 
abs(val1 - val2) - cat(sprintf("\n Calculation: |%.5f - %.5f| = %.5f\n", val1, val2, expected_diff)) - cat(sprintf(" Target (%s): %.5f\n", calc$target, target_val)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_diff - target_val) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" Cannot calculate (missing values)\n") - } - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on random row and random item -cat("\n\n") -qa_check_random_row() # Leave blank for random row & item; specify parameters as needed (see examples below) - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER ROW/ITEM ***\n") -cat("For random row AND random item, run:\n") -cat(" qa_check_random_row()\n") -cat("\nFor specific row (e.g., row 118) with random item:\n") -cat(" qa_check_random_row(118)\n") -cat("\nFor random row with specific item (e.g., item 5 = pref_travel):\n") -cat(" qa_check_random_row(item_num = 5)\n") -cat("\nFor specific row AND specific item:\n") -cat(" qa_check_random_row(118, 5)\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -# write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 243 to save changes.\n") -cat("\nProcessing complete! 90 difference columns calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP 07 - domain means_20251001152954.r b/.history/eohi2/dataP 07 - domain means_20251001152954.r deleted file mode 100644 index e69de29..0000000 diff --git a/.history/eohi2/dataP 07 - domain means_20251001153004.r b/.history/eohi2/dataP 07 - domain means_20251001153004.r deleted file mode 100644 index 979b420..0000000 --- a/.history/eohi2/dataP 07 - domain means_20251001153004.r +++ /dev/null @@ -1,4 +0,0 @@ -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") \ No newline at end of file diff --git a/.history/eohi2/dataP 07 - domain means_20251001154326.r b/.history/eohi2/dataP 07 - domain means_20251001154326.r deleted file mode 100644 index 5d887a9..0000000 --- a/.history/eohi2/dataP 07 - domain means_20251001154326.r +++ /dev/null @@ -1,265 +0,0 @@ -# Script to calculate domain means for time interval differences in eohi2.csv -# Averages the 5 items within each domain (pref, pers, val) for each time interval type - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the 15 item names (same order for all time periods) -items <- c( - "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Define domain groupings (indices in items vector) -pref_indices <- 1:5 -pers_indices <- 6:10 -val_indices <- 11:15 - -# Define time interval prefixes -time_prefixes <- c("NPast_5", "NPast_10", "NFut_5", "NFut_10", "5.10past", "5.10fut") - -# Define domain names -domain_names <- c("pref", 
"pers", "val") - -# Define all source columns (90 total) -source_cols <- c( - paste0("NPast_5_", items), - paste0("NPast_10_", items), - paste0("NFut_5_", items), - paste0("NFut_10_", items), - paste0("5.10past_", items), - paste0("5.10fut_", items) -) - -# Define all target columns (18 total = 6 time intervals × 3 domains) -target_cols <- c( - paste0("NPast_5_", domain_names, "_MEAN"), - paste0("NPast_10_", domain_names, "_MEAN"), - paste0("NFut_5_", domain_names, "_MEAN"), - paste0("NFut_10_", domain_names, "_MEAN"), - paste0("5.10past_", domain_names, "_MEAN"), - paste0("5.10fut_", domain_names, "_MEAN") -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 90 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0 && length(missing_source) <= 20) { - cat("\n Missing source columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} else if (length(missing_source) > 20) { - cat("\n Too many missing to list individually (", length(missing_source), "missing)\n") -} - -# Check Target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 18 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Target columns do NOT exist yet - will create them.\n") - if (length(existing_targets) > 0) { - cat(" WARNING: Some target columns already exist and will be overwritten.\n") - } -} else { - cat(" All target columns exist - will overwrite with calculated values.\n") -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 45) { - stop("ERROR: Too many source columns missing! 
Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= CALCULATE DOMAIN MEANS ============= -cat("Calculating domain means for time interval differences...\n") - -# Convert source columns to numeric -for (col in source_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Calculate means for each time interval × domain combination -for (time_prefix in time_prefixes) { - # Preferences mean - pref_cols <- paste0(time_prefix, "_", items[pref_indices]) - existing_pref_cols <- pref_cols[pref_cols %in% names(df)] - if (length(existing_pref_cols) > 0) { - df[[paste0(time_prefix, "_pref_MEAN")]] <- rowMeans(df[, existing_pref_cols, drop = FALSE], na.rm = TRUE) - cat(" Processed:", paste0(time_prefix, "_pref_MEAN"), "\n") - } - - # Personality mean - pers_cols <- paste0(time_prefix, "_", items[pers_indices]) - existing_pers_cols <- pers_cols[pers_cols %in% names(df)] - if (length(existing_pers_cols) > 0) { - df[[paste0(time_prefix, "_pers_MEAN")]] <- rowMeans(df[, existing_pers_cols, drop = FALSE], na.rm = TRUE) - cat(" Processed:", paste0(time_prefix, "_pers_MEAN"), "\n") - } - - # Values mean - val_cols <- paste0(time_prefix, "_", items[val_indices]) - existing_val_cols <- val_cols[val_cols %in% names(df)] - if (length(existing_val_cols) > 0) { - df[[paste0(time_prefix, "_val_MEAN")]] <- rowMeans(df[, existing_val_cols, drop = FALSE], na.rm = TRUE) - cat(" Processed:", paste0(time_prefix, "_val_MEAN"), "\n") - } -} - -cat("\n=== CALCULATION COMPLETE ===\n") -cat(" 18 domain mean columns created.\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW & TIME INTERVAL CHECK ============= -# This function can be run multiple times to check different random rows and time intervals - -qa_check_random_row <- function(row_num = NULL, time_interval_num = NULL) { - # Pick a random row or use specified row - if (is.null(row_num)) { - random_row <- sample(seq_len(nrow(df)), 1) - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - } else { - if (row_num < 1 || row_num > nrow(df)) { - cat("ERROR: Row number must be between 1 and", nrow(df), "\n") - return() - } - random_row <- row_num - cat("\n========================================\n") - cat("QA CHECK: Specified Row #", random_row, "\n") - } - - # Pick a random time interval or use specified interval - if (is.null(time_interval_num)) { - test_interval_idx <- sample(1:6, 1) - cat("Random Time Interval #", test_interval_idx, ": ", time_prefixes[test_interval_idx], "\n") - } else { - if (time_interval_num < 1 || time_interval_num > 6) { - cat("ERROR: Time interval number must be between 1 and 6\n") - cat(" 1 = NPast_5, 2 = NPast_10, 3 = NFut_5, 4 = NFut_10, 5 = 5.10past, 6 = 5.10fut\n") - return() - } - test_interval_idx <- time_interval_num - cat("Specified Time Interval #", test_interval_idx, ": ", time_prefixes[test_interval_idx], "\n") - } - - cat("========================================\n\n") - - time_prefix <- time_prefixes[test_interval_idx] - - # Check each of the 3 domains - for (domain_idx in 1:3) { - domain_name <- domain_names[domain_idx] - - # Get the appropriate item indices - if (domain_idx == 1) { - item_indices <- pref_indices - domain_label <- "Preferences" - } else if (domain_idx == 2) { - item_indices <- pers_indices - domain_label <- "Personality" - } else { - item_indices <- val_indices - domain_label <- "Values" - } - - cat(sprintf("--- %s: %s ---\n", time_prefix, domain_label)) - - # Get source column 
names - source_cols_domain <- paste0(time_prefix, "_", items[item_indices]) - target_col <- paste0(time_prefix, "_", domain_name, "_MEAN") - - # Get values - values <- numeric(5) - cat("Source values:\n") - for (i in 1:5) { - col <- source_cols_domain[i] - val <- if (col %in% names(df)) df[random_row, col] else NA - values[i] <- val - cat(sprintf(" %s: %s\n", col, ifelse(is.na(val), "NA", sprintf("%.5f", val)))) - } - - # Calculate expected mean - valid_values <- values[!is.na(values)] - if (length(valid_values) > 0) { - expected_mean <- mean(valid_values) - actual_value <- df[random_row, target_col] - - cat(sprintf("\nCalculation:\n")) - cat(sprintf(" Sum: %s = %.5f\n", - paste(sprintf("%.5f", valid_values), collapse = " + "), - sum(valid_values))) - cat(sprintf(" Average of %d values: %.5f\n", length(valid_values), expected_mean)) - cat(sprintf(" Target (%s): %.5f\n", target_col, actual_value)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_mean - actual_value) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" No valid values to calculate mean.\n") - } - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on random row and random time interval -cat("\n\n") -qa_check_random_row() # Leave blank for random row & interval; specify parameters as needed (see examples below) - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER ROW/TIME INTERVAL ***\n") -cat("For random row AND random time interval, run:\n") -cat(" qa_check_random_row()\n") -cat("\nFor specific row (e.g., row 118) with random interval:\n") -cat(" qa_check_random_row(118)\n") -cat("\nFor random row with specific interval (e.g., 3 = NFut_5):\n") -cat(" qa_check_random_row(time_interval_num = 3)\n") -cat("\nFor specific row AND specific interval:\n") -cat(" qa_check_random_row(118, 3)\n") -cat("\n") -cat("Time Interval Numbers:\n") -cat(" 1 = NPast_5, 2 = NPast_10, 3 = NFut_5\n") -cat(" 4 = NFut_10, 5 = 5.10past, 6 = 5.10fut\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -# write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 234 to save changes.\n") -cat("\nProcessing complete! 
18 domain mean columns calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP 07 - domain means_20251001154444.r b/.history/eohi2/dataP 07 - domain means_20251001154444.r deleted file mode 100644 index 5d887a9..0000000 --- a/.history/eohi2/dataP 07 - domain means_20251001154444.r +++ /dev/null @@ -1,265 +0,0 @@ -# Script to calculate domain means for time interval differences in eohi2.csv -# Averages the 5 items within each domain (pref, pers, val) for each time interval type - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the 15 item names (same order for all time periods) -items <- c( - "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Define domain groupings (indices in items vector) -pref_indices <- 1:5 -pers_indices <- 6:10 -val_indices <- 11:15 - -# Define time interval prefixes -time_prefixes <- c("NPast_5", "NPast_10", "NFut_5", "NFut_10", "5.10past", "5.10fut") - -# Define domain names -domain_names <- c("pref", "pers", "val") - -# Define all source columns (90 total) -source_cols <- c( - paste0("NPast_5_", items), - paste0("NPast_10_", items), - paste0("NFut_5_", items), - paste0("NFut_10_", items), - paste0("5.10past_", items), - paste0("5.10fut_", items) -) - -# Define all target columns (18 total = 6 time intervals × 3 domains) -target_cols <- c( - paste0("NPast_5_", domain_names, "_MEAN"), - paste0("NPast_10_", domain_names, "_MEAN"), - paste0("NFut_5_", domain_names, "_MEAN"), - paste0("NFut_10_", domain_names, "_MEAN"), - paste0("5.10past_", domain_names, "_MEAN"), - paste0("5.10fut_", domain_names, "_MEAN") -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 90 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0 && length(missing_source) <= 20) { - cat("\n Missing source columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} else if (length(missing_source) > 20) { - cat("\n Too many missing to list individually (", length(missing_source), "missing)\n") -} - -# Check Target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 18 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Target columns do NOT exist yet - will create them.\n") - if (length(existing_targets) > 0) { - cat(" WARNING: Some target columns already exist and will be overwritten.\n") - } -} else { - cat(" All target columns exist - will overwrite with calculated values.\n") -} - -cat("\n=== END CHECK ===\n\n") 
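Aside: the column-existence check in the snapshot above builds its missing/existing vectors with paired %in% subsets for each column set. The same bookkeeping can be condensed with setdiff(); a minimal sketch, assuming the df, source_cols, and target_cols objects defined in the script (not a replacement for the original):

    # Condensed existence check (sketch, assumes df, source_cols, target_cols)
    df_cols <- trimws(names(df))
    missing_source  <- setdiff(source_cols, df_cols)   # expected but absent
    missing_targets <- setdiff(target_cols, df_cols)   # will be created
    cat(sprintf("Source: %d found, %d missing\n",
                length(source_cols) - length(missing_source), length(missing_source)))
    if (length(missing_source) > 0) print(head(missing_source, 20))  # cap listing at 20, as the script does
    cat(sprintf("Target: %d of %d already exist and would be overwritten\n",
                length(target_cols) - length(missing_targets), length(target_cols)))
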
- -# Stop if critical columns are missing -if (length(missing_source) > 45) { - stop("ERROR: Too many source columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= CALCULATE DOMAIN MEANS ============= -cat("Calculating domain means for time interval differences...\n") - -# Convert source columns to numeric -for (col in source_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Calculate means for each time interval × domain combination -for (time_prefix in time_prefixes) { - # Preferences mean - pref_cols <- paste0(time_prefix, "_", items[pref_indices]) - existing_pref_cols <- pref_cols[pref_cols %in% names(df)] - if (length(existing_pref_cols) > 0) { - df[[paste0(time_prefix, "_pref_MEAN")]] <- rowMeans(df[, existing_pref_cols, drop = FALSE], na.rm = TRUE) - cat(" Processed:", paste0(time_prefix, "_pref_MEAN"), "\n") - } - - # Personality mean - pers_cols <- paste0(time_prefix, "_", items[pers_indices]) - existing_pers_cols <- pers_cols[pers_cols %in% names(df)] - if (length(existing_pers_cols) > 0) { - df[[paste0(time_prefix, "_pers_MEAN")]] <- rowMeans(df[, existing_pers_cols, drop = FALSE], na.rm = TRUE) - cat(" Processed:", paste0(time_prefix, "_pers_MEAN"), "\n") - } - - # Values mean - val_cols <- paste0(time_prefix, "_", items[val_indices]) - existing_val_cols <- val_cols[val_cols %in% names(df)] - if (length(existing_val_cols) > 0) { - df[[paste0(time_prefix, "_val_MEAN")]] <- rowMeans(df[, existing_val_cols, drop = FALSE], na.rm = TRUE) - cat(" Processed:", paste0(time_prefix, "_val_MEAN"), "\n") - } -} - -cat("\n=== CALCULATION COMPLETE ===\n") -cat(" 18 domain mean columns created.\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW & TIME INTERVAL CHECK ============= -# This function can be run multiple times to check different random rows and time intervals - -qa_check_random_row <- function(row_num = NULL, time_interval_num = NULL) { - # Pick a random row or use specified row - if (is.null(row_num)) { - random_row <- sample(seq_len(nrow(df)), 1) - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - } else { - if (row_num < 1 || row_num > nrow(df)) { - cat("ERROR: Row number must be between 1 and", nrow(df), "\n") - return() - } - random_row <- row_num - cat("\n========================================\n") - cat("QA CHECK: Specified Row #", random_row, "\n") - } - - # Pick a random time interval or use specified interval - if (is.null(time_interval_num)) { - test_interval_idx <- sample(1:6, 1) - cat("Random Time Interval #", test_interval_idx, ": ", time_prefixes[test_interval_idx], "\n") - } else { - if (time_interval_num < 1 || time_interval_num > 6) { - cat("ERROR: Time interval number must be between 1 and 6\n") - cat(" 1 = NPast_5, 2 = NPast_10, 3 = NFut_5, 4 = NFut_10, 5 = 5.10past, 6 = 5.10fut\n") - return() - } - test_interval_idx <- time_interval_num - cat("Specified Time Interval #", test_interval_idx, ": ", time_prefixes[test_interval_idx], "\n") - } - - cat("========================================\n\n") - - time_prefix <- time_prefixes[test_interval_idx] - - # Check each of the 3 domains - for (domain_idx in 1:3) { - domain_name <- domain_names[domain_idx] - - # Get the appropriate item indices - if (domain_idx == 1) { - item_indices <- pref_indices - domain_label <- "Preferences" - } else if (domain_idx == 2) { - item_indices <- pers_indices - domain_label <- "Personality" - } else { - item_indices <- 
val_indices - domain_label <- "Values" - } - - cat(sprintf("--- %s: %s ---\n", time_prefix, domain_label)) - - # Get source column names - source_cols_domain <- paste0(time_prefix, "_", items[item_indices]) - target_col <- paste0(time_prefix, "_", domain_name, "_MEAN") - - # Get values - values <- numeric(5) - cat("Source values:\n") - for (i in 1:5) { - col <- source_cols_domain[i] - val <- if (col %in% names(df)) df[random_row, col] else NA - values[i] <- val - cat(sprintf(" %s: %s\n", col, ifelse(is.na(val), "NA", sprintf("%.5f", val)))) - } - - # Calculate expected mean - valid_values <- values[!is.na(values)] - if (length(valid_values) > 0) { - expected_mean <- mean(valid_values) - actual_value <- df[random_row, target_col] - - cat(sprintf("\nCalculation:\n")) - cat(sprintf(" Sum: %s = %.5f\n", - paste(sprintf("%.5f", valid_values), collapse = " + "), - sum(valid_values))) - cat(sprintf(" Average of %d values: %.5f\n", length(valid_values), expected_mean)) - cat(sprintf(" Target (%s): %.5f\n", target_col, actual_value)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_mean - actual_value) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" No valid values to calculate mean.\n") - } - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on random row and random time interval -cat("\n\n") -qa_check_random_row() # Leave blank for random row & interval; specify parameters as needed (see examples below) - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER ROW/TIME INTERVAL ***\n") -cat("For random row AND random time interval, run:\n") -cat(" qa_check_random_row()\n") -cat("\nFor specific row (e.g., row 118) with random interval:\n") -cat(" qa_check_random_row(118)\n") -cat("\nFor random row with specific interval (e.g., 3 = NFut_5):\n") -cat(" qa_check_random_row(time_interval_num = 3)\n") -cat("\nFor specific row AND specific interval:\n") -cat(" qa_check_random_row(118, 3)\n") -cat("\n") -cat("Time Interval Numbers:\n") -cat(" 1 = NPast_5, 2 = NPast_10, 3 = NFut_5\n") -cat(" 4 = NFut_10, 5 = 5.10past, 6 = 5.10fut\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -# write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 234 to save changes.\n") -cat("\nProcessing complete! 
18 domain mean columns calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP 07 - domain means_20251001155057.r b/.history/eohi2/dataP 07 - domain means_20251001155057.r deleted file mode 100644 index 5d887a9..0000000 --- a/.history/eohi2/dataP 07 - domain means_20251001155057.r +++ /dev/null @@ -1,265 +0,0 @@ -# Script to calculate domain means for time interval differences in eohi2.csv -# Averages the 5 items within each domain (pref, pers, val) for each time interval type - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the 15 item names (same order for all time periods) -items <- c( - "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Define domain groupings (indices in items vector) -pref_indices <- 1:5 -pers_indices <- 6:10 -val_indices <- 11:15 - -# Define time interval prefixes -time_prefixes <- c("NPast_5", "NPast_10", "NFut_5", "NFut_10", "5.10past", "5.10fut") - -# Define domain names -domain_names <- c("pref", "pers", "val") - -# Define all source columns (90 total) -source_cols <- c( - paste0("NPast_5_", items), - paste0("NPast_10_", items), - paste0("NFut_5_", items), - paste0("NFut_10_", items), - paste0("5.10past_", items), - paste0("5.10fut_", items) -) - -# Define all target columns (18 total = 6 time intervals × 3 domains) -target_cols <- c( - paste0("NPast_5_", domain_names, "_MEAN"), - paste0("NPast_10_", domain_names, "_MEAN"), - paste0("NFut_5_", domain_names, "_MEAN"), - paste0("NFut_10_", domain_names, "_MEAN"), - paste0("5.10past_", domain_names, "_MEAN"), - paste0("5.10fut_", domain_names, "_MEAN") -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 90 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0 && length(missing_source) <= 20) { - cat("\n Missing source columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} else if (length(missing_source) > 20) { - cat("\n Too many missing to list individually (", length(missing_source), "missing)\n") -} - -# Check Target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 18 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Target columns do NOT exist yet - will create them.\n") - if (length(existing_targets) > 0) { - cat(" WARNING: Some target columns already exist and will be overwritten.\n") - } -} else { - cat(" All target columns exist - will overwrite with calculated values.\n") -} - -cat("\n=== END CHECK ===\n\n") 
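Aside: the calculation section above repeats three near-identical per-domain blocks for every time interval. A behaviour-preserving sketch collapses them into one double loop over intervals and domains, assuming the items, time_prefixes, and df objects defined in the script (domain_items is an illustrative name, not from the original):

    # One loop instead of three copied blocks (sketch)
    domain_items <- list(pref = items[1:5], pers = items[6:10], val = items[11:15])
    for (tp in time_prefixes) {
      for (dn in names(domain_items)) {
        cols <- intersect(paste0(tp, "_", domain_items[[dn]]), names(df))
        if (length(cols) > 0) {
          # na.rm = TRUE: a row with all five items missing yields NaN, not NA
          df[[paste0(tp, "_", dn, "_MEAN")]] <-
            rowMeans(df[, cols, drop = FALSE], na.rm = TRUE)
          cat(" Processed:", paste0(tp, "_", dn, "_MEAN"), "\n")
        }
      }
    }
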
- -# Stop if critical columns are missing -if (length(missing_source) > 45) { - stop("ERROR: Too many source columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= CALCULATE DOMAIN MEANS ============= -cat("Calculating domain means for time interval differences...\n") - -# Convert source columns to numeric -for (col in source_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Calculate means for each time interval × domain combination -for (time_prefix in time_prefixes) { - # Preferences mean - pref_cols <- paste0(time_prefix, "_", items[pref_indices]) - existing_pref_cols <- pref_cols[pref_cols %in% names(df)] - if (length(existing_pref_cols) > 0) { - df[[paste0(time_prefix, "_pref_MEAN")]] <- rowMeans(df[, existing_pref_cols, drop = FALSE], na.rm = TRUE) - cat(" Processed:", paste0(time_prefix, "_pref_MEAN"), "\n") - } - - # Personality mean - pers_cols <- paste0(time_prefix, "_", items[pers_indices]) - existing_pers_cols <- pers_cols[pers_cols %in% names(df)] - if (length(existing_pers_cols) > 0) { - df[[paste0(time_prefix, "_pers_MEAN")]] <- rowMeans(df[, existing_pers_cols, drop = FALSE], na.rm = TRUE) - cat(" Processed:", paste0(time_prefix, "_pers_MEAN"), "\n") - } - - # Values mean - val_cols <- paste0(time_prefix, "_", items[val_indices]) - existing_val_cols <- val_cols[val_cols %in% names(df)] - if (length(existing_val_cols) > 0) { - df[[paste0(time_prefix, "_val_MEAN")]] <- rowMeans(df[, existing_val_cols, drop = FALSE], na.rm = TRUE) - cat(" Processed:", paste0(time_prefix, "_val_MEAN"), "\n") - } -} - -cat("\n=== CALCULATION COMPLETE ===\n") -cat(" 18 domain mean columns created.\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW & TIME INTERVAL CHECK ============= -# This function can be run multiple times to check different random rows and time intervals - -qa_check_random_row <- function(row_num = NULL, time_interval_num = NULL) { - # Pick a random row or use specified row - if (is.null(row_num)) { - random_row <- sample(seq_len(nrow(df)), 1) - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - } else { - if (row_num < 1 || row_num > nrow(df)) { - cat("ERROR: Row number must be between 1 and", nrow(df), "\n") - return() - } - random_row <- row_num - cat("\n========================================\n") - cat("QA CHECK: Specified Row #", random_row, "\n") - } - - # Pick a random time interval or use specified interval - if (is.null(time_interval_num)) { - test_interval_idx <- sample(1:6, 1) - cat("Random Time Interval #", test_interval_idx, ": ", time_prefixes[test_interval_idx], "\n") - } else { - if (time_interval_num < 1 || time_interval_num > 6) { - cat("ERROR: Time interval number must be between 1 and 6\n") - cat(" 1 = NPast_5, 2 = NPast_10, 3 = NFut_5, 4 = NFut_10, 5 = 5.10past, 6 = 5.10fut\n") - return() - } - test_interval_idx <- time_interval_num - cat("Specified Time Interval #", test_interval_idx, ": ", time_prefixes[test_interval_idx], "\n") - } - - cat("========================================\n\n") - - time_prefix <- time_prefixes[test_interval_idx] - - # Check each of the 3 domains - for (domain_idx in 1:3) { - domain_name <- domain_names[domain_idx] - - # Get the appropriate item indices - if (domain_idx == 1) { - item_indices <- pref_indices - domain_label <- "Preferences" - } else if (domain_idx == 2) { - item_indices <- pers_indices - domain_label <- "Personality" - } else { - item_indices <- 
val_indices - domain_label <- "Values" - } - - cat(sprintf("--- %s: %s ---\n", time_prefix, domain_label)) - - # Get source column names - source_cols_domain <- paste0(time_prefix, "_", items[item_indices]) - target_col <- paste0(time_prefix, "_", domain_name, "_MEAN") - - # Get values - values <- numeric(5) - cat("Source values:\n") - for (i in 1:5) { - col <- source_cols_domain[i] - val <- if (col %in% names(df)) df[random_row, col] else NA - values[i] <- val - cat(sprintf(" %s: %s\n", col, ifelse(is.na(val), "NA", sprintf("%.5f", val)))) - } - - # Calculate expected mean - valid_values <- values[!is.na(values)] - if (length(valid_values) > 0) { - expected_mean <- mean(valid_values) - actual_value <- df[random_row, target_col] - - cat(sprintf("\nCalculation:\n")) - cat(sprintf(" Sum: %s = %.5f\n", - paste(sprintf("%.5f", valid_values), collapse = " + "), - sum(valid_values))) - cat(sprintf(" Average of %d values: %.5f\n", length(valid_values), expected_mean)) - cat(sprintf(" Target (%s): %.5f\n", target_col, actual_value)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_mean - actual_value) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" No valid values to calculate mean.\n") - } - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on random row and random time interval -cat("\n\n") -qa_check_random_row() # Leave blank for random row & interval; specify parameters as needed (see examples below) - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER ROW/TIME INTERVAL ***\n") -cat("For random row AND random time interval, run:\n") -cat(" qa_check_random_row()\n") -cat("\nFor specific row (e.g., row 118) with random interval:\n") -cat(" qa_check_random_row(118)\n") -cat("\nFor random row with specific interval (e.g., 3 = NFut_5):\n") -cat(" qa_check_random_row(time_interval_num = 3)\n") -cat("\nFor specific row AND specific interval:\n") -cat(" qa_check_random_row(118, 3)\n") -cat("\n") -cat("Time Interval Numbers:\n") -cat(" 1 = NPast_5, 2 = NPast_10, 3 = NFut_5\n") -cat(" 4 = NFut_10, 5 = 5.10past, 6 = 5.10fut\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -# write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 234 to save changes.\n") -cat("\nProcessing complete! 
18 domain mean columns calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP 07 - domain means_20251001162547.r b/.history/eohi2/dataP 07 - domain means_20251001162547.r deleted file mode 100644 index 7f3c43d..0000000 --- a/.history/eohi2/dataP 07 - domain means_20251001162547.r +++ /dev/null @@ -1,265 +0,0 @@ -# Script to calculate domain means for time interval differences in eohi2.csv -# Averages the 5 items within each domain (pref, pers, val) for each time interval type - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the 15 item names (same order for all time periods) -items <- c( - "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Define domain groupings (indices in items vector) -pref_indices <- 1:5 -pers_indices <- 6:10 -val_indices <- 11:15 - -# Define time interval prefixes -time_prefixes <- c("NPast_5", "NPast_10", "NFut_5", "NFut_10", "5.10past", "5.10fut") - -# Define domain names -domain_names <- c("pref", "pers", "val") - -# Define all source columns (90 total) -source_cols <- c( - paste0("NPast_5_", items), - paste0("NPast_10_", items), - paste0("NFut_5_", items), - paste0("NFut_10_", items), - paste0("5.10past_", items), - paste0("5.10fut_", items) -) - -# Define all target columns (18 total = 6 time intervals × 3 domains) -target_cols <- c( - paste0("NPast_5_", domain_names, "_MEAN"), - paste0("NPast_10_", domain_names, "_MEAN"), - paste0("NFut_5_", domain_names, "_MEAN"), - paste0("NFut_10_", domain_names, "_MEAN"), - paste0("5.10past_", domain_names, "_MEAN"), - paste0("5.10fut_", domain_names, "_MEAN") -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 90 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0 && length(missing_source) <= 20) { - cat("\n Missing source columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} else if (length(missing_source) > 20) { - cat("\n Too many missing to list individually (", length(missing_source), "missing)\n") -} - -# Check Target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 18 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Target columns do NOT exist yet - will create them.\n") - if (length(existing_targets) > 0) { - cat(" WARNING: Some target columns already exist and will be overwritten.\n") - } -} else { - cat(" All target columns exist - will overwrite with calculated values.\n") -} - -cat("\n=== END CHECK ===\n\n") 
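Aside: beyond the qa_check_random_row() helper, a single stored mean can be re-derived by hand in a few lines. A sketch using the script's own example row 118 and the NFut_5 preferences domain (the row number is purely illustrative):

    # Manual spot check of one cell (sketch, assumes df, items, pref_indices)
    src_cols <- paste0("NFut_5_", items[pref_indices])
    manual   <- mean(unlist(df[118, src_cols]), na.rm = TRUE)  # recompute from the five items
    stored   <- df[118, "NFut_5_pref_MEAN"]
    abs(manual - stored) < 1e-4  # TRUE when the stored mean matches, same tolerance as the QA check
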
- -# Stop if critical columns are missing -if (length(missing_source) > 45) { - stop("ERROR: Too many source columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= CALCULATE DOMAIN MEANS ============= -cat("Calculating domain means for time interval differences...\n") - -# Convert source columns to numeric -for (col in source_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Calculate means for each time interval × domain combination -for (time_prefix in time_prefixes) { - # Preferences mean - pref_cols <- paste0(time_prefix, "_", items[pref_indices]) - existing_pref_cols <- pref_cols[pref_cols %in% names(df)] - if (length(existing_pref_cols) > 0) { - df[[paste0(time_prefix, "_pref_MEAN")]] <- rowMeans(df[, existing_pref_cols, drop = FALSE], na.rm = TRUE) - cat(" Processed:", paste0(time_prefix, "_pref_MEAN"), "\n") - } - - # Personality mean - pers_cols <- paste0(time_prefix, "_", items[pers_indices]) - existing_pers_cols <- pers_cols[pers_cols %in% names(df)] - if (length(existing_pers_cols) > 0) { - df[[paste0(time_prefix, "_pers_MEAN")]] <- rowMeans(df[, existing_pers_cols, drop = FALSE], na.rm = TRUE) - cat(" Processed:", paste0(time_prefix, "_pers_MEAN"), "\n") - } - - # Values mean - val_cols <- paste0(time_prefix, "_", items[val_indices]) - existing_val_cols <- val_cols[val_cols %in% names(df)] - if (length(existing_val_cols) > 0) { - df[[paste0(time_prefix, "_val_MEAN")]] <- rowMeans(df[, existing_val_cols, drop = FALSE], na.rm = TRUE) - cat(" Processed:", paste0(time_prefix, "_val_MEAN"), "\n") - } -} - -cat("\n=== CALCULATION COMPLETE ===\n") -cat(" 18 domain mean columns created.\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW & TIME INTERVAL CHECK ============= -# This function can be run multiple times to check different random rows and time intervals - -qa_check_random_row <- function(row_num = NULL, time_interval_num = NULL) { - # Pick a random row or use specified row - if (is.null(row_num)) { - random_row <- sample(seq_len(nrow(df)), 1) - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - } else { - if (row_num < 1 || row_num > nrow(df)) { - cat("ERROR: Row number must be between 1 and", nrow(df), "\n") - return() - } - random_row <- row_num - cat("\n========================================\n") - cat("QA CHECK: Specified Row #", random_row, "\n") - } - - # Pick a random time interval or use specified interval - if (is.null(time_interval_num)) { - test_interval_idx <- sample(1:6, 1) - cat("Random Time Interval #", test_interval_idx, ": ", time_prefixes[test_interval_idx], "\n") - } else { - if (time_interval_num < 1 || time_interval_num > 6) { - cat("ERROR: Time interval number must be between 1 and 6\n") - cat(" 1 = NPast_5, 2 = NPast_10, 3 = NFut_5, 4 = NFut_10, 5 = 5.10past, 6 = 5.10fut\n") - return() - } - test_interval_idx <- time_interval_num - cat("Specified Time Interval #", test_interval_idx, ": ", time_prefixes[test_interval_idx], "\n") - } - - cat("========================================\n\n") - - time_prefix <- time_prefixes[test_interval_idx] - - # Check each of the 3 domains - for (domain_idx in 1:3) { - domain_name <- domain_names[domain_idx] - - # Get the appropriate item indices - if (domain_idx == 1) { - item_indices <- pref_indices - domain_label <- "Preferences" - } else if (domain_idx == 2) { - item_indices <- pers_indices - domain_label <- "Personality" - } else { - item_indices <- 
val_indices - domain_label <- "Values" - } - - cat(sprintf("--- %s: %s ---\n", time_prefix, domain_label)) - - # Get source column names - source_cols_domain <- paste0(time_prefix, "_", items[item_indices]) - target_col <- paste0(time_prefix, "_", domain_name, "_MEAN") - - # Get values - values <- numeric(5) - cat("Source values:\n") - for (i in 1:5) { - col <- source_cols_domain[i] - val <- if (col %in% names(df)) df[random_row, col] else NA - values[i] <- val - cat(sprintf(" %s: %s\n", col, ifelse(is.na(val), "NA", sprintf("%.5f", val)))) - } - - # Calculate expected mean - valid_values <- values[!is.na(values)] - if (length(valid_values) > 0) { - expected_mean <- mean(valid_values) - actual_value <- df[random_row, target_col] - - cat(sprintf("\nCalculation:\n")) - cat(sprintf(" Sum: %s = %.5f\n", - paste(sprintf("%.5f", valid_values), collapse = " + "), - sum(valid_values))) - cat(sprintf(" Average of %d values: %.5f\n", length(valid_values), expected_mean)) - cat(sprintf(" Target (%s): %.5f\n", target_col, actual_value)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_mean - actual_value) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" No valid values to calculate mean.\n") - } - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on random row and random time interval -cat("\n\n") -qa_check_random_row() # Leave blank for random row & interval; specify parameters as needed (see examples below) - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER ROW/TIME INTERVAL ***\n") -cat("For random row AND random time interval, run:\n") -cat(" qa_check_random_row()\n") -cat("\nFor specific row (e.g., row 118) with random interval:\n") -cat(" qa_check_random_row(118)\n") -cat("\nFor random row with specific interval (e.g., 3 = NFut_5):\n") -cat(" qa_check_random_row(time_interval_num = 3)\n") -cat("\nFor specific row AND specific interval:\n") -cat(" qa_check_random_row(118, 3)\n") -cat("\n") -cat("Time Interval Numbers:\n") -cat(" 1 = NPast_5, 2 = NPast_10, 3 = NFut_5\n") -cat(" 4 = NFut_10, 5 = 5.10past, 6 = 5.10fut\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 234 to save changes.\n") -cat("\nProcessing complete! 
18 domain mean columns calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP 07 - domain means_20251001163148.r b/.history/eohi2/dataP 07 - domain means_20251001163148.r deleted file mode 100644 index 7f3c43d..0000000 --- a/.history/eohi2/dataP 07 - domain means_20251001163148.r +++ /dev/null @@ -1,265 +0,0 @@ -# Script to calculate domain means for time interval differences in eohi2.csv -# Averages the 5 items within each domain (pref, pers, val) for each time interval type - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the 15 item names (same order for all time periods) -items <- c( - "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Define domain groupings (indices in items vector) -pref_indices <- 1:5 -pers_indices <- 6:10 -val_indices <- 11:15 - -# Define time interval prefixes -time_prefixes <- c("NPast_5", "NPast_10", "NFut_5", "NFut_10", "5.10past", "5.10fut") - -# Define domain names -domain_names <- c("pref", "pers", "val") - -# Define all source columns (90 total) -source_cols <- c( - paste0("NPast_5_", items), - paste0("NPast_10_", items), - paste0("NFut_5_", items), - paste0("NFut_10_", items), - paste0("5.10past_", items), - paste0("5.10fut_", items) -) - -# Define all target columns (18 total = 6 time intervals × 3 domains) -target_cols <- c( - paste0("NPast_5_", domain_names, "_MEAN"), - paste0("NPast_10_", domain_names, "_MEAN"), - paste0("NFut_5_", domain_names, "_MEAN"), - paste0("NFut_10_", domain_names, "_MEAN"), - paste0("5.10past_", domain_names, "_MEAN"), - paste0("5.10fut_", domain_names, "_MEAN") -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 90 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0 && length(missing_source) <= 20) { - cat("\n Missing source columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} else if (length(missing_source) > 20) { - cat("\n Too many missing to list individually (", length(missing_source), "missing)\n") -} - -# Check Target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 18 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Target columns do NOT exist yet - will create them.\n") - if (length(existing_targets) > 0) { - cat(" WARNING: Some target columns already exist and will be overwritten.\n") - } -} else { - cat(" All target columns exist - will overwrite with calculated values.\n") -} - -cat("\n=== END CHECK ===\n\n") 
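Aside: in the 20251001162547 snapshot above, the write.csv() call is active even though the comment block and the status messages still say it is commented out. Gating the save behind a single flag keeps the code and its messages from drifting apart; a minimal sketch (SAVE_OUTPUT is an illustrative name, not from the original scripts):

    SAVE_OUTPUT <- FALSE  # flip to TRUE once the QA output has been reviewed
    if (SAVE_OUTPUT) {
      # na = "" writes NA values as empty cells instead of the text "NA"
      write.csv(df, "eohi2.csv", row.names = FALSE, na = "")
      cat("\nSaved: eohi2.csv\n")
    } else {
      cat("\n*** WRITE SKIPPED (SAVE_OUTPUT = FALSE) ***\n")
    }
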
- -# Stop if critical columns are missing -if (length(missing_source) > 45) { - stop("ERROR: Too many source columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= CALCULATE DOMAIN MEANS ============= -cat("Calculating domain means for time interval differences...\n") - -# Convert source columns to numeric -for (col in source_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Calculate means for each time interval × domain combination -for (time_prefix in time_prefixes) { - # Preferences mean - pref_cols <- paste0(time_prefix, "_", items[pref_indices]) - existing_pref_cols <- pref_cols[pref_cols %in% names(df)] - if (length(existing_pref_cols) > 0) { - df[[paste0(time_prefix, "_pref_MEAN")]] <- rowMeans(df[, existing_pref_cols, drop = FALSE], na.rm = TRUE) - cat(" Processed:", paste0(time_prefix, "_pref_MEAN"), "\n") - } - - # Personality mean - pers_cols <- paste0(time_prefix, "_", items[pers_indices]) - existing_pers_cols <- pers_cols[pers_cols %in% names(df)] - if (length(existing_pers_cols) > 0) { - df[[paste0(time_prefix, "_pers_MEAN")]] <- rowMeans(df[, existing_pers_cols, drop = FALSE], na.rm = TRUE) - cat(" Processed:", paste0(time_prefix, "_pers_MEAN"), "\n") - } - - # Values mean - val_cols <- paste0(time_prefix, "_", items[val_indices]) - existing_val_cols <- val_cols[val_cols %in% names(df)] - if (length(existing_val_cols) > 0) { - df[[paste0(time_prefix, "_val_MEAN")]] <- rowMeans(df[, existing_val_cols, drop = FALSE], na.rm = TRUE) - cat(" Processed:", paste0(time_prefix, "_val_MEAN"), "\n") - } -} - -cat("\n=== CALCULATION COMPLETE ===\n") -cat(" 18 domain mean columns created.\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW & TIME INTERVAL CHECK ============= -# This function can be run multiple times to check different random rows and time intervals - -qa_check_random_row <- function(row_num = NULL, time_interval_num = NULL) { - # Pick a random row or use specified row - if (is.null(row_num)) { - random_row <- sample(seq_len(nrow(df)), 1) - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - } else { - if (row_num < 1 || row_num > nrow(df)) { - cat("ERROR: Row number must be between 1 and", nrow(df), "\n") - return() - } - random_row <- row_num - cat("\n========================================\n") - cat("QA CHECK: Specified Row #", random_row, "\n") - } - - # Pick a random time interval or use specified interval - if (is.null(time_interval_num)) { - test_interval_idx <- sample(1:6, 1) - cat("Random Time Interval #", test_interval_idx, ": ", time_prefixes[test_interval_idx], "\n") - } else { - if (time_interval_num < 1 || time_interval_num > 6) { - cat("ERROR: Time interval number must be between 1 and 6\n") - cat(" 1 = NPast_5, 2 = NPast_10, 3 = NFut_5, 4 = NFut_10, 5 = 5.10past, 6 = 5.10fut\n") - return() - } - test_interval_idx <- time_interval_num - cat("Specified Time Interval #", test_interval_idx, ": ", time_prefixes[test_interval_idx], "\n") - } - - cat("========================================\n\n") - - time_prefix <- time_prefixes[test_interval_idx] - - # Check each of the 3 domains - for (domain_idx in 1:3) { - domain_name <- domain_names[domain_idx] - - # Get the appropriate item indices - if (domain_idx == 1) { - item_indices <- pref_indices - domain_label <- "Preferences" - } else if (domain_idx == 2) { - item_indices <- pers_indices - domain_label <- "Personality" - } else { - item_indices <- 
val_indices - domain_label <- "Values" - } - - cat(sprintf("--- %s: %s ---\n", time_prefix, domain_label)) - - # Get source column names - source_cols_domain <- paste0(time_prefix, "_", items[item_indices]) - target_col <- paste0(time_prefix, "_", domain_name, "_MEAN") - - # Get values - values <- numeric(5) - cat("Source values:\n") - for (i in 1:5) { - col <- source_cols_domain[i] - val <- if (col %in% names(df)) df[random_row, col] else NA - values[i] <- val - cat(sprintf(" %s: %s\n", col, ifelse(is.na(val), "NA", sprintf("%.5f", val)))) - } - - # Calculate expected mean - valid_values <- values[!is.na(values)] - if (length(valid_values) > 0) { - expected_mean <- mean(valid_values) - actual_value <- df[random_row, target_col] - - cat(sprintf("\nCalculation:\n")) - cat(sprintf(" Sum: %s = %.5f\n", - paste(sprintf("%.5f", valid_values), collapse = " + "), - sum(valid_values))) - cat(sprintf(" Average of %d values: %.5f\n", length(valid_values), expected_mean)) - cat(sprintf(" Target (%s): %.5f\n", target_col, actual_value)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_mean - actual_value) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" No valid values to calculate mean.\n") - } - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on random row and random time interval -cat("\n\n") -qa_check_random_row() # Leave blank for random row & interval; specify parameters as needed (see examples below) - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER ROW/TIME INTERVAL ***\n") -cat("For random row AND random time interval, run:\n") -cat(" qa_check_random_row()\n") -cat("\nFor specific row (e.g., row 118) with random interval:\n") -cat(" qa_check_random_row(118)\n") -cat("\nFor random row with specific interval (e.g., 3 = NFut_5):\n") -cat(" qa_check_random_row(time_interval_num = 3)\n") -cat("\nFor specific row AND specific interval:\n") -cat(" qa_check_random_row(118, 3)\n") -cat("\n") -cat("Time Interval Numbers:\n") -cat(" 1 = NPast_5, 2 = NPast_10, 3 = NFut_5\n") -cat(" 4 = NFut_10, 5 = 5.10past, 6 = 5.10fut\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 234 to save changes.\n") -cat("\nProcessing complete! 
18 domain mean columns calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP 07 - domain means_20251008193329.r b/.history/eohi2/dataP 07 - domain means_20251008193329.r deleted file mode 100644 index 74f4b7d..0000000 --- a/.history/eohi2/dataP 07 - domain means_20251008193329.r +++ /dev/null @@ -1,265 +0,0 @@ -# Script to calculate domain means for time interval differences in eohi2.csv -# Averages the 5 items within each domain (pref, pers, val) for each time interval type - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the 15 item names (same order for all time periods) -items <- c( - "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Define domain groupings (indices in items vector) -pref_indices <- 1:5 -pers_indices <- 6:10 -val_indices <- 11:15 - -# Define time interval prefixes -time_prefixes <- c("NPast_5", "NPast_10", "NFut_5", "NFut_10", "X5.10past", "X5.10fut") - -# Define domain names -domain_names <- c("pref", "pers", "val") - -# Define all source columns (90 total) -source_cols <- c( - paste0("NPast_5_", items), - paste0("NPast_10_", items), - paste0("NFut_5_", items), - paste0("NFut_10_", items), - paste0("5.10past_", items), - paste0("5.10fut_", items) -) - -# Define all target columns (18 total = 6 time intervals × 3 domains) -target_cols <- c( - paste0("NPast_5_", domain_names, "_MEAN"), - paste0("NPast_10_", domain_names, "_MEAN"), - paste0("NFut_5_", domain_names, "_MEAN"), - paste0("NFut_10_", domain_names, "_MEAN"), - paste0("5.10past_", domain_names, "_MEAN"), - paste0("5.10fut_", domain_names, "_MEAN") -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 90 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0 && length(missing_source) <= 20) { - cat("\n Missing source columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} else if (length(missing_source) > 20) { - cat("\n Too many missing to list individually (", length(missing_source), "missing)\n") -} - -# Check Target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 18 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Target columns do NOT exist yet - will create them.\n") - if (length(existing_targets) > 0) { - cat(" WARNING: Some target columns already exist and will be overwritten.\n") - } -} else { - cat(" All target columns exist - will overwrite with calculated values.\n") -} - -cat("\n=== END CHECK 
===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 45) { - stop("ERROR: Too many source columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= CALCULATE DOMAIN MEANS ============= -cat("Calculating domain means for time interval differences...\n") - -# Convert source columns to numeric -for (col in source_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Calculate means for each time interval × domain combination -for (time_prefix in time_prefixes) { - # Preferences mean - pref_cols <- paste0(time_prefix, "_", items[pref_indices]) - existing_pref_cols <- pref_cols[pref_cols %in% names(df)] - if (length(existing_pref_cols) > 0) { - df[[paste0(time_prefix, "_pref_MEAN")]] <- rowMeans(df[, existing_pref_cols, drop = FALSE], na.rm = TRUE) - cat(" Processed:", paste0(time_prefix, "_pref_MEAN"), "\n") - } - - # Personality mean - pers_cols <- paste0(time_prefix, "_", items[pers_indices]) - existing_pers_cols <- pers_cols[pers_cols %in% names(df)] - if (length(existing_pers_cols) > 0) { - df[[paste0(time_prefix, "_pers_MEAN")]] <- rowMeans(df[, existing_pers_cols, drop = FALSE], na.rm = TRUE) - cat(" Processed:", paste0(time_prefix, "_pers_MEAN"), "\n") - } - - # Values mean - val_cols <- paste0(time_prefix, "_", items[val_indices]) - existing_val_cols <- val_cols[val_cols %in% names(df)] - if (length(existing_val_cols) > 0) { - df[[paste0(time_prefix, "_val_MEAN")]] <- rowMeans(df[, existing_val_cols, drop = FALSE], na.rm = TRUE) - cat(" Processed:", paste0(time_prefix, "_val_MEAN"), "\n") - } -} - -cat("\n=== CALCULATION COMPLETE ===\n") -cat(" 18 domain mean columns created.\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW & TIME INTERVAL CHECK ============= -# This function can be run multiple times to check different random rows and time intervals - -qa_check_random_row <- function(row_num = NULL, time_interval_num = NULL) { - # Pick a random row or use specified row - if (is.null(row_num)) { - random_row <- sample(seq_len(nrow(df)), 1) - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - } else { - if (row_num < 1 || row_num > nrow(df)) { - cat("ERROR: Row number must be between 1 and", nrow(df), "\n") - return() - } - random_row <- row_num - cat("\n========================================\n") - cat("QA CHECK: Specified Row #", random_row, "\n") - } - - # Pick a random time interval or use specified interval - if (is.null(time_interval_num)) { - test_interval_idx <- sample(1:6, 1) - cat("Random Time Interval #", test_interval_idx, ": ", time_prefixes[test_interval_idx], "\n") - } else { - if (time_interval_num < 1 || time_interval_num > 6) { - cat("ERROR: Time interval number must be between 1 and 6\n") - cat(" 1 = NPast_5, 2 = NPast_10, 3 = NFut_5, 4 = NFut_10, 5 = 5.10past, 6 = 5.10fut\n") - return() - } - test_interval_idx <- time_interval_num - cat("Specified Time Interval #", test_interval_idx, ": ", time_prefixes[test_interval_idx], "\n") - } - - cat("========================================\n\n") - - time_prefix <- time_prefixes[test_interval_idx] - - # Check each of the 3 domains - for (domain_idx in 1:3) { - domain_name <- domain_names[domain_idx] - - # Get the appropriate item indices - if (domain_idx == 1) { - item_indices <- pref_indices - domain_label <- "Preferences" - } else if (domain_idx == 2) { - item_indices <- pers_indices - domain_label <- "Personality" - } else { - 
item_indices <- val_indices - domain_label <- "Values" - } - - cat(sprintf("--- %s: %s ---\n", time_prefix, domain_label)) - - # Get source column names - source_cols_domain <- paste0(time_prefix, "_", items[item_indices]) - target_col <- paste0(time_prefix, "_", domain_name, "_MEAN") - - # Get values - values <- numeric(5) - cat("Source values:\n") - for (i in 1:5) { - col <- source_cols_domain[i] - val <- if (col %in% names(df)) df[random_row, col] else NA - values[i] <- val - cat(sprintf(" %s: %s\n", col, ifelse(is.na(val), "NA", sprintf("%.5f", val)))) - } - - # Calculate expected mean - valid_values <- values[!is.na(values)] - if (length(valid_values) > 0) { - expected_mean <- mean(valid_values) - actual_value <- df[random_row, target_col] - - cat(sprintf("\nCalculation:\n")) - cat(sprintf(" Sum: %s = %.5f\n", - paste(sprintf("%.5f", valid_values), collapse = " + "), - sum(valid_values))) - cat(sprintf(" Average of %d values: %.5f\n", length(valid_values), expected_mean)) - cat(sprintf(" Target (%s): %.5f\n", target_col, actual_value)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_mean - actual_value) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" No valid values to calculate mean.\n") - } - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on random row and random time interval -cat("\n\n") -qa_check_random_row() # Leave blank for random row & interval; specify parameters as needed (see examples below) - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER ROW/TIME INTERVAL ***\n") -cat("For random row AND random time interval, run:\n") -cat(" qa_check_random_row()\n") -cat("\nFor specific row (e.g., row 118) with random interval:\n") -cat(" qa_check_random_row(118)\n") -cat("\nFor random row with specific interval (e.g., 3 = NFut_5):\n") -cat(" qa_check_random_row(time_interval_num = 3)\n") -cat("\nFor specific row AND specific interval:\n") -cat(" qa_check_random_row(118, 3)\n") -cat("\n") -cat("Time Interval Numbers:\n") -cat(" 1 = NPast_5, 2 = NPast_10, 3 = NFut_5\n") -cat(" 4 = NFut_10, 5 = 5.10past, 6 = 5.10fut\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 234 to save changes.\n") -cat("\nProcessing complete! 
18 domain mean columns calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP 07 - domain means_20251008193335.r b/.history/eohi2/dataP 07 - domain means_20251008193335.r deleted file mode 100644 index d7e861c..0000000 --- a/.history/eohi2/dataP 07 - domain means_20251008193335.r +++ /dev/null @@ -1,265 +0,0 @@ -# Script to calculate domain means for time interval differences in eohi2.csv -# Averages the 5 items within each domain (pref, pers, val) for each time interval type - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the 15 item names (same order for all time periods) -items <- c( - "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Define domain groupings (indices in items vector) -pref_indices <- 1:5 -pers_indices <- 6:10 -val_indices <- 11:15 - -# Define time interval prefixes -time_prefixes <- c("NPast_5", "NPast_10", "NFut_5", "NFut_10", "X5.10past", "X5.10fut") - -# Define domain names -domain_names <- c("pref", "pers", "val") - -# Define all source columns (90 total) -source_cols <- c( - paste0("NPast_5_", items), - paste0("NPast_10_", items), - paste0("NFut_5_", items), - paste0("NFut_10_", items), - paste0("X5.10past_", items), - paste0("X5.10fut_", items) -) - -# Define all target columns (18 total = 6 time intervals × 3 domains) -target_cols <- c( - paste0("NPast_5_", domain_names, "_MEAN"), - paste0("NPast_10_", domain_names, "_MEAN"), - paste0("NFut_5_", domain_names, "_MEAN"), - paste0("NFut_10_", domain_names, "_MEAN"), - paste0("5.10past_", domain_names, "_MEAN"), - paste0("5.10fut_", domain_names, "_MEAN") -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 90 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0 && length(missing_source) <= 20) { - cat("\n Missing source columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} else if (length(missing_source) > 20) { - cat("\n Too many missing to list individually (", length(missing_source), "missing)\n") -} - -# Check Target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 18 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Target columns do NOT exist yet - will create them.\n") - if (length(existing_targets) > 0) { - cat(" WARNING: Some target columns already exist and will be overwritten.\n") - } -} else { - cat(" All target columns exist - will overwrite with calculated values.\n") -} - -cat("\n=== END CHECK 
===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 45) { - stop("ERROR: Too many source columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= CALCULATE DOMAIN MEANS ============= -cat("Calculating domain means for time interval differences...\n") - -# Convert source columns to numeric -for (col in source_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Calculate means for each time interval × domain combination -for (time_prefix in time_prefixes) { - # Preferences mean - pref_cols <- paste0(time_prefix, "_", items[pref_indices]) - existing_pref_cols <- pref_cols[pref_cols %in% names(df)] - if (length(existing_pref_cols) > 0) { - df[[paste0(time_prefix, "_pref_MEAN")]] <- rowMeans(df[, existing_pref_cols, drop = FALSE], na.rm = TRUE) - cat(" Processed:", paste0(time_prefix, "_pref_MEAN"), "\n") - } - - # Personality mean - pers_cols <- paste0(time_prefix, "_", items[pers_indices]) - existing_pers_cols <- pers_cols[pers_cols %in% names(df)] - if (length(existing_pers_cols) > 0) { - df[[paste0(time_prefix, "_pers_MEAN")]] <- rowMeans(df[, existing_pers_cols, drop = FALSE], na.rm = TRUE) - cat(" Processed:", paste0(time_prefix, "_pers_MEAN"), "\n") - } - - # Values mean - val_cols <- paste0(time_prefix, "_", items[val_indices]) - existing_val_cols <- val_cols[val_cols %in% names(df)] - if (length(existing_val_cols) > 0) { - df[[paste0(time_prefix, "_val_MEAN")]] <- rowMeans(df[, existing_val_cols, drop = FALSE], na.rm = TRUE) - cat(" Processed:", paste0(time_prefix, "_val_MEAN"), "\n") - } -} - -cat("\n=== CALCULATION COMPLETE ===\n") -cat(" 18 domain mean columns created.\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW & TIME INTERVAL CHECK ============= -# This function can be run multiple times to check different random rows and time intervals - -qa_check_random_row <- function(row_num = NULL, time_interval_num = NULL) { - # Pick a random row or use specified row - if (is.null(row_num)) { - random_row <- sample(seq_len(nrow(df)), 1) - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - } else { - if (row_num < 1 || row_num > nrow(df)) { - cat("ERROR: Row number must be between 1 and", nrow(df), "\n") - return() - } - random_row <- row_num - cat("\n========================================\n") - cat("QA CHECK: Specified Row #", random_row, "\n") - } - - # Pick a random time interval or use specified interval - if (is.null(time_interval_num)) { - test_interval_idx <- sample(1:6, 1) - cat("Random Time Interval #", test_interval_idx, ": ", time_prefixes[test_interval_idx], "\n") - } else { - if (time_interval_num < 1 || time_interval_num > 6) { - cat("ERROR: Time interval number must be between 1 and 6\n") - cat(" 1 = NPast_5, 2 = NPast_10, 3 = NFut_5, 4 = NFut_10, 5 = 5.10past, 6 = 5.10fut\n") - return() - } - test_interval_idx <- time_interval_num - cat("Specified Time Interval #", test_interval_idx, ": ", time_prefixes[test_interval_idx], "\n") - } - - cat("========================================\n\n") - - time_prefix <- time_prefixes[test_interval_idx] - - # Check each of the 3 domains - for (domain_idx in 1:3) { - domain_name <- domain_names[domain_idx] - - # Get the appropriate item indices - if (domain_idx == 1) { - item_indices <- pref_indices - domain_label <- "Preferences" - } else if (domain_idx == 2) { - item_indices <- pers_indices - domain_label <- "Personality" - } else { - 
item_indices <- val_indices - domain_label <- "Values" - } - - cat(sprintf("--- %s: %s ---\n", time_prefix, domain_label)) - - # Get source column names - source_cols_domain <- paste0(time_prefix, "_", items[item_indices]) - target_col <- paste0(time_prefix, "_", domain_name, "_MEAN") - - # Get values - values <- numeric(5) - cat("Source values:\n") - for (i in 1:5) { - col <- source_cols_domain[i] - val <- if (col %in% names(df)) df[random_row, col] else NA - values[i] <- val - cat(sprintf(" %s: %s\n", col, ifelse(is.na(val), "NA", sprintf("%.5f", val)))) - } - - # Calculate expected mean - valid_values <- values[!is.na(values)] - if (length(valid_values) > 0) { - expected_mean <- mean(valid_values) - actual_value <- df[random_row, target_col] - - cat(sprintf("\nCalculation:\n")) - cat(sprintf(" Sum: %s = %.5f\n", - paste(sprintf("%.5f", valid_values), collapse = " + "), - sum(valid_values))) - cat(sprintf(" Average of %d values: %.5f\n", length(valid_values), expected_mean)) - cat(sprintf(" Target (%s): %.5f\n", target_col, actual_value)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_mean - actual_value) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" No valid values to calculate mean.\n") - } - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on random row and random time interval -cat("\n\n") -qa_check_random_row() # Leave blank for random row & interval; specify parameters as needed (see examples below) - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER ROW/TIME INTERVAL ***\n") -cat("For random row AND random time interval, run:\n") -cat(" qa_check_random_row()\n") -cat("\nFor specific row (e.g., row 118) with random interval:\n") -cat(" qa_check_random_row(118)\n") -cat("\nFor random row with specific interval (e.g., 3 = NFut_5):\n") -cat(" qa_check_random_row(time_interval_num = 3)\n") -cat("\nFor specific row AND specific interval:\n") -cat(" qa_check_random_row(118, 3)\n") -cat("\n") -cat("Time Interval Numbers:\n") -cat(" 1 = NPast_5, 2 = NPast_10, 3 = NFut_5\n") -cat(" 4 = NFut_10, 5 = 5.10past, 6 = 5.10fut\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 234 to save changes.\n") -cat("\nProcessing complete! 
18 domain mean columns calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP 07 - domain means_20251008193341.r b/.history/eohi2/dataP 07 - domain means_20251008193341.r deleted file mode 100644 index e08d460..0000000 --- a/.history/eohi2/dataP 07 - domain means_20251008193341.r +++ /dev/null @@ -1,265 +0,0 @@ -# Script to calculate domain means for time interval differences in eohi2.csv -# Averages the 5 items within each domain (pref, pers, val) for each time interval type - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the 15 item names (same order for all time periods) -items <- c( - "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Define domain groupings (indices in items vector) -pref_indices <- 1:5 -pers_indices <- 6:10 -val_indices <- 11:15 - -# Define time interval prefixes -time_prefixes <- c("NPast_5", "NPast_10", "NFut_5", "NFut_10", "X5.10past", "X5.10fut") - -# Define domain names -domain_names <- c("pref", "pers", "val") - -# Define all source columns (90 total) -source_cols <- c( - paste0("NPast_5_", items), - paste0("NPast_10_", items), - paste0("NFut_5_", items), - paste0("NFut_10_", items), - paste0("X5.10past_", items), - paste0("X5.10fut_", items) -) - -# Define all target columns (18 total = 6 time intervals × 3 domains) -target_cols <- c( - paste0("NPast_5_", domain_names, "_MEAN"), - paste0("NPast_10_", domain_names, "_MEAN"), - paste0("NFut_5_", domain_names, "_MEAN"), - paste0("NFut_10_", domain_names, "_MEAN"), - paste0("X5.10past_", domain_names, "_MEAN"), - paste0("X5.10fut_", domain_names, "_MEAN") -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 90 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0 && length(missing_source) <= 20) { - cat("\n Missing source columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} else if (length(missing_source) > 20) { - cat("\n Too many missing to list individually (", length(missing_source), "missing)\n") -} - -# Check Target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 18 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Target columns do NOT exist yet - will create them.\n") - if (length(existing_targets) > 0) { - cat(" WARNING: Some target columns already exist and will be overwritten.\n") - } -} else { - cat(" All target columns exist - will overwrite with calculated values.\n") -} - -cat("\n=== END CHECK 
===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 45) { - stop("ERROR: Too many source columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= CALCULATE DOMAIN MEANS ============= -cat("Calculating domain means for time interval differences...\n") - -# Convert source columns to numeric -for (col in source_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Calculate means for each time interval × domain combination -for (time_prefix in time_prefixes) { - # Preferences mean - pref_cols <- paste0(time_prefix, "_", items[pref_indices]) - existing_pref_cols <- pref_cols[pref_cols %in% names(df)] - if (length(existing_pref_cols) > 0) { - df[[paste0(time_prefix, "_pref_MEAN")]] <- rowMeans(df[, existing_pref_cols, drop = FALSE], na.rm = TRUE) - cat(" Processed:", paste0(time_prefix, "_pref_MEAN"), "\n") - } - - # Personality mean - pers_cols <- paste0(time_prefix, "_", items[pers_indices]) - existing_pers_cols <- pers_cols[pers_cols %in% names(df)] - if (length(existing_pers_cols) > 0) { - df[[paste0(time_prefix, "_pers_MEAN")]] <- rowMeans(df[, existing_pers_cols, drop = FALSE], na.rm = TRUE) - cat(" Processed:", paste0(time_prefix, "_pers_MEAN"), "\n") - } - - # Values mean - val_cols <- paste0(time_prefix, "_", items[val_indices]) - existing_val_cols <- val_cols[val_cols %in% names(df)] - if (length(existing_val_cols) > 0) { - df[[paste0(time_prefix, "_val_MEAN")]] <- rowMeans(df[, existing_val_cols, drop = FALSE], na.rm = TRUE) - cat(" Processed:", paste0(time_prefix, "_val_MEAN"), "\n") - } -} - -cat("\n=== CALCULATION COMPLETE ===\n") -cat(" 18 domain mean columns created.\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW & TIME INTERVAL CHECK ============= -# This function can be run multiple times to check different random rows and time intervals - -qa_check_random_row <- function(row_num = NULL, time_interval_num = NULL) { - # Pick a random row or use specified row - if (is.null(row_num)) { - random_row <- sample(seq_len(nrow(df)), 1) - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - } else { - if (row_num < 1 || row_num > nrow(df)) { - cat("ERROR: Row number must be between 1 and", nrow(df), "\n") - return() - } - random_row <- row_num - cat("\n========================================\n") - cat("QA CHECK: Specified Row #", random_row, "\n") - } - - # Pick a random time interval or use specified interval - if (is.null(time_interval_num)) { - test_interval_idx <- sample(1:6, 1) - cat("Random Time Interval #", test_interval_idx, ": ", time_prefixes[test_interval_idx], "\n") - } else { - if (time_interval_num < 1 || time_interval_num > 6) { - cat("ERROR: Time interval number must be between 1 and 6\n") - cat(" 1 = NPast_5, 2 = NPast_10, 3 = NFut_5, 4 = NFut_10, 5 = 5.10past, 6 = 5.10fut\n") - return() - } - test_interval_idx <- time_interval_num - cat("Specified Time Interval #", test_interval_idx, ": ", time_prefixes[test_interval_idx], "\n") - } - - cat("========================================\n\n") - - time_prefix <- time_prefixes[test_interval_idx] - - # Check each of the 3 domains - for (domain_idx in 1:3) { - domain_name <- domain_names[domain_idx] - - # Get the appropriate item indices - if (domain_idx == 1) { - item_indices <- pref_indices - domain_label <- "Preferences" - } else if (domain_idx == 2) { - item_indices <- pers_indices - domain_label <- "Personality" - } else { - 
item_indices <- val_indices - domain_label <- "Values" - } - - cat(sprintf("--- %s: %s ---\n", time_prefix, domain_label)) - - # Get source column names - source_cols_domain <- paste0(time_prefix, "_", items[item_indices]) - target_col <- paste0(time_prefix, "_", domain_name, "_MEAN") - - # Get values - values <- numeric(5) - cat("Source values:\n") - for (i in 1:5) { - col <- source_cols_domain[i] - val <- if (col %in% names(df)) df[random_row, col] else NA - values[i] <- val - cat(sprintf(" %s: %s\n", col, ifelse(is.na(val), "NA", sprintf("%.5f", val)))) - } - - # Calculate expected mean - valid_values <- values[!is.na(values)] - if (length(valid_values) > 0) { - expected_mean <- mean(valid_values) - actual_value <- df[random_row, target_col] - - cat(sprintf("\nCalculation:\n")) - cat(sprintf(" Sum: %s = %.5f\n", - paste(sprintf("%.5f", valid_values), collapse = " + "), - sum(valid_values))) - cat(sprintf(" Average of %d values: %.5f\n", length(valid_values), expected_mean)) - cat(sprintf(" Target (%s): %.5f\n", target_col, actual_value)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_mean - actual_value) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" No valid values to calculate mean.\n") - } - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on random row and random time interval -cat("\n\n") -qa_check_random_row() # Leave blank for random row & interval; specify parameters as needed (see examples below) - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER ROW/TIME INTERVAL ***\n") -cat("For random row AND random time interval, run:\n") -cat(" qa_check_random_row()\n") -cat("\nFor specific row (e.g., row 118) with random interval:\n") -cat(" qa_check_random_row(118)\n") -cat("\nFor random row with specific interval (e.g., 3 = NFut_5):\n") -cat(" qa_check_random_row(time_interval_num = 3)\n") -cat("\nFor specific row AND specific interval:\n") -cat(" qa_check_random_row(118, 3)\n") -cat("\n") -cat("Time Interval Numbers:\n") -cat(" 1 = NPast_5, 2 = NPast_10, 3 = NFut_5\n") -cat(" 4 = NFut_10, 5 = 5.10past, 6 = 5.10fut\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 234 to save changes.\n") -cat("\nProcessing complete! 
18 domain mean columns calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP 07 - domain means_20251008193347.r b/.history/eohi2/dataP 07 - domain means_20251008193347.r deleted file mode 100644 index 0cf8049..0000000 --- a/.history/eohi2/dataP 07 - domain means_20251008193347.r +++ /dev/null @@ -1,265 +0,0 @@ -# Script to calculate domain means for time interval differences in eohi2.csv -# Averages the 5 items within each domain (pref, pers, val) for each time interval type - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the 15 item names (same order for all time periods) -items <- c( - "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Define domain groupings (indices in items vector) -pref_indices <- 1:5 -pers_indices <- 6:10 -val_indices <- 11:15 - -# Define time interval prefixes -time_prefixes <- c("NPast_5", "NPast_10", "NFut_5", "NFut_10", "X5.10past", "X5.10fut") - -# Define domain names -domain_names <- c("pref", "pers", "val") - -# Define all source columns (90 total) -source_cols <- c( - paste0("NPast_5_", items), - paste0("NPast_10_", items), - paste0("NFut_5_", items), - paste0("NFut_10_", items), - paste0("X5.10past_", items), - paste0("X5.10fut_", items) -) - -# Define all target columns (18 total = 6 time intervals × 3 domains) -target_cols <- c( - paste0("NPast_5_", domain_names, "_MEAN"), - paste0("NPast_10_", domain_names, "_MEAN"), - paste0("NFut_5_", domain_names, "_MEAN"), - paste0("NFut_10_", domain_names, "_MEAN"), - paste0("X5.10past_", domain_names, "_MEAN"), - paste0("X5.10fut_", domain_names, "_MEAN") -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 90 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0 && length(missing_source) <= 20) { - cat("\n Missing source columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} else if (length(missing_source) > 20) { - cat("\n Too many missing to list individually (", length(missing_source), "missing)\n") -} - -# Check Target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 18 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Target columns do NOT exist yet - will create them.\n") - if (length(existing_targets) > 0) { - cat(" WARNING: Some target columns already exist and will be overwritten.\n") - } -} else { - cat(" All target columns exist - will overwrite with calculated values.\n") -} - -cat("\n=== END CHECK 
===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 45) { - stop("ERROR: Too many source columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= CALCULATE DOMAIN MEANS ============= -cat("Calculating domain means for time interval differences...\n") - -# Convert source columns to numeric -for (col in source_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Calculate means for each time interval × domain combination -for (time_prefix in time_prefixes) { - # Preferences mean - pref_cols <- paste0(time_prefix, "_", items[pref_indices]) - existing_pref_cols <- pref_cols[pref_cols %in% names(df)] - if (length(existing_pref_cols) > 0) { - df[[paste0(time_prefix, "_pref_MEAN")]] <- rowMeans(df[, existing_pref_cols, drop = FALSE], na.rm = TRUE) - cat(" Processed:", paste0(time_prefix, "_pref_MEAN"), "\n") - } - - # Personality mean - pers_cols <- paste0(time_prefix, "_", items[pers_indices]) - existing_pers_cols <- pers_cols[pers_cols %in% names(df)] - if (length(existing_pers_cols) > 0) { - df[[paste0(time_prefix, "_pers_MEAN")]] <- rowMeans(df[, existing_pers_cols, drop = FALSE], na.rm = TRUE) - cat(" Processed:", paste0(time_prefix, "_pers_MEAN"), "\n") - } - - # Values mean - val_cols <- paste0(time_prefix, "_", items[val_indices]) - existing_val_cols <- val_cols[val_cols %in% names(df)] - if (length(existing_val_cols) > 0) { - df[[paste0(time_prefix, "_val_MEAN")]] <- rowMeans(df[, existing_val_cols, drop = FALSE], na.rm = TRUE) - cat(" Processed:", paste0(time_prefix, "_val_MEAN"), "\n") - } -} - -cat("\n=== CALCULATION COMPLETE ===\n") -cat(" 18 domain mean columns created.\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW & TIME INTERVAL CHECK ============= -# This function can be run multiple times to check different random rows and time intervals - -qa_check_random_row <- function(row_num = NULL, time_interval_num = NULL) { - # Pick a random row or use specified row - if (is.null(row_num)) { - random_row <- sample(seq_len(nrow(df)), 1) - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - } else { - if (row_num < 1 || row_num > nrow(df)) { - cat("ERROR: Row number must be between 1 and", nrow(df), "\n") - return() - } - random_row <- row_num - cat("\n========================================\n") - cat("QA CHECK: Specified Row #", random_row, "\n") - } - - # Pick a random time interval or use specified interval - if (is.null(time_interval_num)) { - test_interval_idx <- sample(1:6, 1) - cat("Random Time Interval #", test_interval_idx, ": ", time_prefixes[test_interval_idx], "\n") - } else { - if (time_interval_num < 1 || time_interval_num > 6) { - cat("ERROR: Time interval number must be between 1 and 6\n") - cat(" 1 = NPast_5, 2 = NPast_10, 3 = NFut_5, 4 = NFut_10, 5 = X5.10past, 6 = X5.10fut\n") - return() - } - test_interval_idx <- time_interval_num - cat("Specified Time Interval #", test_interval_idx, ": ", time_prefixes[test_interval_idx], "\n") - } - - cat("========================================\n\n") - - time_prefix <- time_prefixes[test_interval_idx] - - # Check each of the 3 domains - for (domain_idx in 1:3) { - domain_name <- domain_names[domain_idx] - - # Get the appropriate item indices - if (domain_idx == 1) { - item_indices <- pref_indices - domain_label <- "Preferences" - } else if (domain_idx == 2) { - item_indices <- pers_indices - domain_label <- "Personality" - } else { - 
item_indices <- val_indices - domain_label <- "Values" - } - - cat(sprintf("--- %s: %s ---\n", time_prefix, domain_label)) - - # Get source column names - source_cols_domain <- paste0(time_prefix, "_", items[item_indices]) - target_col <- paste0(time_prefix, "_", domain_name, "_MEAN") - - # Get values - values <- numeric(5) - cat("Source values:\n") - for (i in 1:5) { - col <- source_cols_domain[i] - val <- if (col %in% names(df)) df[random_row, col] else NA - values[i] <- val - cat(sprintf(" %s: %s\n", col, ifelse(is.na(val), "NA", sprintf("%.5f", val)))) - } - - # Calculate expected mean - valid_values <- values[!is.na(values)] - if (length(valid_values) > 0) { - expected_mean <- mean(valid_values) - actual_value <- df[random_row, target_col] - - cat(sprintf("\nCalculation:\n")) - cat(sprintf(" Sum: %s = %.5f\n", - paste(sprintf("%.5f", valid_values), collapse = " + "), - sum(valid_values))) - cat(sprintf(" Average of %d values: %.5f\n", length(valid_values), expected_mean)) - cat(sprintf(" Target (%s): %.5f\n", target_col, actual_value)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_mean - actual_value) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" No valid values to calculate mean.\n") - } - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on random row and random time interval -cat("\n\n") -qa_check_random_row() # Leave blank for random row & interval; specify parameters as needed (see examples below) - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER ROW/TIME INTERVAL ***\n") -cat("For random row AND random time interval, run:\n") -cat(" qa_check_random_row()\n") -cat("\nFor specific row (e.g., row 118) with random interval:\n") -cat(" qa_check_random_row(118)\n") -cat("\nFor random row with specific interval (e.g., 3 = NFut_5):\n") -cat(" qa_check_random_row(time_interval_num = 3)\n") -cat("\nFor specific row AND specific interval:\n") -cat(" qa_check_random_row(118, 3)\n") -cat("\n") -cat("Time Interval Numbers:\n") -cat(" 1 = NPast_5, 2 = NPast_10, 3 = NFut_5\n") -cat(" 4 = NFut_10, 5 = 5.10past, 6 = 5.10fut\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 234 to save changes.\n") -cat("\nProcessing complete! 
18 domain mean columns calculated (not yet saved to file).\n") diff --git a/.history/eohi2/dataP 07 - domain means_20251008193352.r b/.history/eohi2/dataP 07 - domain means_20251008193352.r deleted file mode 100644 index eead82b..0000000 --- a/.history/eohi2/dataP 07 - domain means_20251008193352.r +++ /dev/null @@ -1,265 +0,0 @@ -# Script to calculate domain means for time interval differences in eohi2.csv -# Averages the 5 items within each domain (pref, pers, val) for each time interval type - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings=NULL keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the 15 item names (same order for all time periods) -items <- c( - "pref_read", "pref_music", "pref_TV", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice" -) - -# Define domain groupings (indices in items vector) -pref_indices <- 1:5 -pers_indices <- 6:10 -val_indices <- 11:15 - -# Define time interval prefixes -time_prefixes <- c("NPast_5", "NPast_10", "NFut_5", "NFut_10", "X5.10past", "X5.10fut") - -# Define domain names -domain_names <- c("pref", "pers", "val") - -# Define all source columns (90 total) -source_cols <- c( - paste0("NPast_5_", items), - paste0("NPast_10_", items), - paste0("NFut_5_", items), - paste0("NFut_10_", items), - paste0("X5.10past_", items), - paste0("X5.10fut_", items) -) - -# Define all target columns (18 total = 6 time intervals × 3 domains) -target_cols <- c( - paste0("NPast_5_", domain_names, "_MEAN"), - paste0("NPast_10_", domain_names, "_MEAN"), - paste0("NFut_5_", domain_names, "_MEAN"), - paste0("NFut_10_", domain_names, "_MEAN"), - paste0("X5.10past_", domain_names, "_MEAN"), - paste0("X5.10fut_", domain_names, "_MEAN") -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source columns -missing_source <- source_cols[!source_cols %in% df_cols] -existing_source <- source_cols[source_cols %in% df_cols] - -cat("Source Columns:\n") -cat(" Expected: 90 columns\n") -cat(" Found:", length(existing_source), "columns\n") -cat(" Missing:", length(missing_source), "columns\n") - -if (length(missing_source) > 0 && length(missing_source) <= 20) { - cat("\n Missing source columns:\n") - for (col in missing_source) { - cat(" -", col, "\n") - } -} else if (length(missing_source) > 20) { - cat("\n Too many missing to list individually (", length(missing_source), "missing)\n") -} - -# Check Target columns -missing_targets <- target_cols[!target_cols %in% df_cols] -existing_targets <- target_cols[target_cols %in% df_cols] - -cat("\nTarget Columns:\n") -cat(" Expected: 18 columns\n") -cat(" Found:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n Target columns do NOT exist yet - will create them.\n") - if (length(existing_targets) > 0) { - cat(" WARNING: Some target columns already exist and will be overwritten.\n") - } -} else { - cat(" All target columns exist - will overwrite with calculated values.\n") -} - -cat("\n=== END CHECK 
===\n\n") - -# Stop if critical columns are missing -if (length(missing_source) > 45) { - stop("ERROR: Too many source columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# ============= CALCULATE DOMAIN MEANS ============= -cat("Calculating domain means for time interval differences...\n") - -# Convert source columns to numeric -for (col in source_cols) { - if (col %in% names(df)) { - df[[col]] <- as.numeric(df[[col]]) - } -} - -# Calculate means for each time interval × domain combination -for (time_prefix in time_prefixes) { - # Preferences mean - pref_cols <- paste0(time_prefix, "_", items[pref_indices]) - existing_pref_cols <- pref_cols[pref_cols %in% names(df)] - if (length(existing_pref_cols) > 0) { - df[[paste0(time_prefix, "_pref_MEAN")]] <- rowMeans(df[, existing_pref_cols, drop = FALSE], na.rm = TRUE) - cat(" Processed:", paste0(time_prefix, "_pref_MEAN"), "\n") - } - - # Personality mean - pers_cols <- paste0(time_prefix, "_", items[pers_indices]) - existing_pers_cols <- pers_cols[pers_cols %in% names(df)] - if (length(existing_pers_cols) > 0) { - df[[paste0(time_prefix, "_pers_MEAN")]] <- rowMeans(df[, existing_pers_cols, drop = FALSE], na.rm = TRUE) - cat(" Processed:", paste0(time_prefix, "_pers_MEAN"), "\n") - } - - # Values mean - val_cols <- paste0(time_prefix, "_", items[val_indices]) - existing_val_cols <- val_cols[val_cols %in% names(df)] - if (length(existing_val_cols) > 0) { - df[[paste0(time_prefix, "_val_MEAN")]] <- rowMeans(df[, existing_val_cols, drop = FALSE], na.rm = TRUE) - cat(" Processed:", paste0(time_prefix, "_val_MEAN"), "\n") - } -} - -cat("\n=== CALCULATION COMPLETE ===\n") -cat(" 18 domain mean columns created.\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW & TIME INTERVAL CHECK ============= -# This function can be run multiple times to check different random rows and time intervals - -qa_check_random_row <- function(row_num = NULL, time_interval_num = NULL) { - # Pick a random row or use specified row - if (is.null(row_num)) { - random_row <- sample(seq_len(nrow(df)), 1) - cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - } else { - if (row_num < 1 || row_num > nrow(df)) { - cat("ERROR: Row number must be between 1 and", nrow(df), "\n") - return() - } - random_row <- row_num - cat("\n========================================\n") - cat("QA CHECK: Specified Row #", random_row, "\n") - } - - # Pick a random time interval or use specified interval - if (is.null(time_interval_num)) { - test_interval_idx <- sample(1:6, 1) - cat("Random Time Interval #", test_interval_idx, ": ", time_prefixes[test_interval_idx], "\n") - } else { - if (time_interval_num < 1 || time_interval_num > 6) { - cat("ERROR: Time interval number must be between 1 and 6\n") - cat(" 1 = NPast_5, 2 = NPast_10, 3 = NFut_5, 4 = NFut_10, 5 = X5.10past, 6 = X5.10fut\n") - return() - } - test_interval_idx <- time_interval_num - cat("Specified Time Interval #", test_interval_idx, ": ", time_prefixes[test_interval_idx], "\n") - } - - cat("========================================\n\n") - - time_prefix <- time_prefixes[test_interval_idx] - - # Check each of the 3 domains - for (domain_idx in 1:3) { - domain_name <- domain_names[domain_idx] - - # Get the appropriate item indices - if (domain_idx == 1) { - item_indices <- pref_indices - domain_label <- "Preferences" - } else if (domain_idx == 2) { - item_indices <- pers_indices - domain_label <- "Personality" - } else { - 
item_indices <- val_indices - domain_label <- "Values" - } - - cat(sprintf("--- %s: %s ---\n", time_prefix, domain_label)) - - # Get source column names - source_cols_domain <- paste0(time_prefix, "_", items[item_indices]) - target_col <- paste0(time_prefix, "_", domain_name, "_MEAN") - - # Get values - values <- numeric(5) - cat("Source values:\n") - for (i in 1:5) { - col <- source_cols_domain[i] - val <- if (col %in% names(df)) df[random_row, col] else NA - values[i] <- val - cat(sprintf(" %s: %s\n", col, ifelse(is.na(val), "NA", sprintf("%.5f", val)))) - } - - # Calculate expected mean - valid_values <- values[!is.na(values)] - if (length(valid_values) > 0) { - expected_mean <- mean(valid_values) - actual_value <- df[random_row, target_col] - - cat(sprintf("\nCalculation:\n")) - cat(sprintf(" Sum: %s = %.5f\n", - paste(sprintf("%.5f", valid_values), collapse = " + "), - sum(valid_values))) - cat(sprintf(" Average of %d values: %.5f\n", length(valid_values), expected_mean)) - cat(sprintf(" Target (%s): %.5f\n", target_col, actual_value)) - cat(sprintf(" Match: %s\n", ifelse(abs(expected_mean - actual_value) < 0.0001, "YES ✓", "NO ✗"))) - } else { - cat(" No valid values to calculate mean.\n") - } - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on random row and random time interval -cat("\n\n") -qa_check_random_row() # Leave blank for random row & interval; specify parameters as needed (see examples below) - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER ROW/TIME INTERVAL ***\n") -cat("For random row AND random time interval, run:\n") -cat(" qa_check_random_row()\n") -cat("\nFor specific row (e.g., row 118) with random interval:\n") -cat(" qa_check_random_row(118)\n") -cat("\nFor random row with specific interval (e.g., 3 = NFut_5):\n") -cat(" qa_check_random_row(time_interval_num = 3)\n") -cat("\nFor specific row AND specific interval:\n") -cat(" qa_check_random_row(118, 3)\n") -cat("\n") -cat("Time Interval Numbers:\n") -cat(" 1 = NPast_5, 2 = NPast_10, 3 = NFut_5\n") -cat(" 4 = NFut_10, 5 = X5.10past, 6 = X5.10fut\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -# COMMENTED OUT FOR REVIEW - Uncomment when ready to save -write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\n*** WRITE TO FILE IS COMMENTED OUT ***\n") -cat("Review the output above, then uncomment line 234 to save changes.\n") -cat("\nProcessing complete! 
18 domain mean columns calculated (not yet saved to file).\n")
diff --git a/.history/eohi2/dataP 07 - domain means_20251008193557.r b/.history/eohi2/dataP 07 - domain means_20251008193557.r deleted file mode 100644 index eead82b..0000000 --- a/.history/eohi2/dataP 07 - domain means_20251008193557.r +++ /dev/null @@ -1,265 +0,0 @@ [265 deleted lines omitted: blob eead82b is byte-for-byte identical to the 20251008193352 snapshot above]
diff --git a/.history/eohi2/dataP 07 - domain means_20251008193600.r b/.history/eohi2/dataP 07 - domain means_20251008193600.r deleted file mode 100644 index eead82b..0000000 --- a/.history/eohi2/dataP 07 - domain means_20251008193600.r +++ /dev/null @@ -1,265 +0,0 @@ [265 deleted lines omitted: blob eead82b is byte-for-byte identical to the 20251008193352 snapshot above]
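The snapshots above all compute the same core step: average the five items of a domain within one time-interval prefix, tolerating both missing cells and missing columns. A minimal standalone sketch of just that technique, using invented toy column names rather than the project's:

# Toy data frame; column names are illustrative only.
toy <- data.frame(
  T1_item_a = c(1, 2, NA),
  T1_item_b = c(3, NA, 5),
  T1_item_c = c(2, 4, 6)
)
domain_cols <- paste0("T1_", c("item_a", "item_b", "item_c"))
present <- domain_cols[domain_cols %in% names(toy)]  # skip columns absent from the data
# na.rm = TRUE averages whatever items a row does have; a row that is all-NA
# across the domain comes out NaN rather than raising an error.
toy$T1_MEAN <- rowMeans(toy[, present, drop = FALSE], na.rm = TRUE)
print(toy$T1_MEAN)  # 2.0 3.0 5.5

The drop = FALSE guard matters when only one item column survives the existence check: without it the single-column subset collapses to a vector and rowMeans() fails.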
diff --git a/.history/eohi2/dataP 08 - DGEN 510 vars_20251006194349.r b/.history/eohi2/dataP 08 - DGEN 510 vars_20251006194349.r
deleted file mode 100644
index a9815f3..0000000
--- a/.history/eohi2/dataP 08 - DGEN 510 vars_20251006194349.r
+++ /dev/null
@@ -1,94 +0,0 @@
-# Script 08: Create 5_10 DGEN Variables
-# PURPOSE: Calculate absolute differences between 5-year and 10-year DGEN ratings
-# for both Past and Future directions
-# VARIABLES CREATED: 6 total (3 domains × 2 time directions)
-
-library(tidyverse)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-# Read the data
-data <- read.csv("eohi2.csv")
-
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x")))
-print(paste("Number of participants:", length(unique(data$ResponseId))))
-
-# Verify source columns exist
-source_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val",
-                 "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val",
-                 "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val",
-                 "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val")
-
-missing_vars <- source_vars[!source_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  stop(paste("ERROR: Missing source variables:", paste(missing_vars, collapse = ", ")))
-}
-
-print("All source DGEN variables found!")
-
-# Calculate 5_10 DGEN variables (absolute differences between 5-year and 10-year)
-# Formula: |DGEN_5 - DGEN_10|
-
-# PAST direction
-data$`5_10DGEN_past_pref` <- abs(data$DGEN_past_5_Pref - data$DGEN_past_10_Pref)
-data$`5_10DGEN_past_pers` <- abs(data$DGEN_past_5_Pers - data$DGEN_past_10_Pers)
-data$`5_10DGEN_past_val` <- abs(data$DGEN_past_5_Val - data$DGEN_past_10_Val)
-
-# FUTURE direction
-data$`5_10DGEN_fut_pref` <- abs(data$DGEN_fut_5_Pref - data$DGEN_fut_10_Pref)
-data$`5_10DGEN_fut_pers` <- abs(data$DGEN_fut_5_Pers - data$DGEN_fut_10_Pers)
-data$`5_10DGEN_fut_val` <- abs(data$DGEN_fut_5_Val - data$DGEN_fut_10_Val)
-
-# Verify variables were created
-target_vars <- c("5_10DGEN_past_pref", "5_10DGEN_past_pers", "5_10DGEN_past_val",
-                 "5_10DGEN_fut_pref", "5_10DGEN_fut_pers", "5_10DGEN_fut_val")
-
-print("\n=== VARIABLES CREATED ===")
-print(target_vars)
-
-# Check for missing values
-for(var in target_vars) {
-  n_missing <- sum(is.na(data[[var]]))
-  pct_missing <- round(100 * n_missing / nrow(data), 2)
-  print(sprintf("%s: %d missing (%.2f%%)", var, n_missing, pct_missing))
-}
-
-# Quality check: Display sample rows
-print("\n=== QUALITY CHECK: Sample Calculations ===")
-sample_rows <- sample(1:nrow(data), min(5, nrow(data)))
-
-for(i in sample_rows) {
-  print(sprintf("\nRow %d:", i))
-  print(sprintf(" DGEN_past_5_Pref = %.2f, DGEN_past_10_Pref = %.2f",
-                data$DGEN_past_5_Pref[i], data$DGEN_past_10_Pref[i]))
-  print(sprintf(" → 5_10DGEN_past_pref = %.2f (expected: %.2f)",
-                data$`5_10DGEN_past_pref`[i],
-                abs(data$DGEN_past_5_Pref[i] - data$DGEN_past_10_Pref[i])))
-
-  print(sprintf(" DGEN_fut_5_Pers = %.2f, DGEN_fut_10_Pers = %.2f",
-                data$DGEN_fut_5_Pers[i], data$DGEN_fut_10_Pers[i]))
-  print(sprintf(" → 5_10DGEN_fut_pers = %.2f (expected: %.2f)",
-                data$`5_10DGEN_fut_pers`[i],
-                abs(data$DGEN_fut_5_Pers[i] - data$DGEN_fut_10_Pers[i])))
-}
-
-# Descriptive statistics
-print("\n=== DESCRIPTIVE STATISTICS ===")
-desc_stats <- data %>%
-  summarise(across(all_of(target_vars),
-                   list(n = ~sum(!is.na(.)),
-                        mean = ~round(mean(., na.rm = TRUE), 5),
-                        sd = ~round(sd(., na.rm = TRUE), 5),
-                        min = ~round(min(., na.rm = TRUE), 5),
-                        max = ~round(max(., 
na.rm = TRUE), 5)), - .names = "{.col}_{.fn}")) - -print(t(desc_stats)) - -# Save to CSV -write.csv(data, "eohi2.csv", row.names = FALSE) - -print("\n=== PROCESSING COMPLETE ===") -print("Data saved to eohi2.csv") -print(paste("Total columns now:", ncol(data))) - diff --git a/.history/eohi2/dataP 08 - DGEN 510 vars_20251006194411.r b/.history/eohi2/dataP 08 - DGEN 510 vars_20251006194411.r deleted file mode 100644 index a9815f3..0000000 --- a/.history/eohi2/dataP 08 - DGEN 510 vars_20251006194411.r +++ /dev/null @@ -1,94 +0,0 @@ -# Script 08: Create 5_10 DGEN Variables -# PURPOSE: Calculate absolute differences between 5-year and 10-year DGEN ratings -# for both Past and Future directions -# VARIABLES CREATED: 6 total (3 domains × 2 time directions) - -library(tidyverse) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify source columns exist -source_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- source_vars[!source_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - stop(paste("ERROR: Missing source variables:", paste(missing_vars, collapse = ", "))) -} - -print("All source DGEN variables found!") - -# Calculate 5_10 DGEN variables (absolute differences between 5-year and 10-year) -# Formula: |DGEN_5 - DGEN_10| - -# PAST direction -data$`5_10DGEN_past_pref` <- abs(data$DGEN_past_5_Pref - data$DGEN_past_10_Pref) -data$`5_10DGEN_past_pers` <- abs(data$DGEN_past_5_Pers - data$DGEN_past_10_Pers) -data$`5_10DGEN_past_val` <- abs(data$DGEN_past_5_Val - data$DGEN_past_10_Val) - -# FUTURE direction -data$`5_10DGEN_fut_pref` <- abs(data$DGEN_fut_5_Pref - data$DGEN_fut_10_Pref) -data$`5_10DGEN_fut_pers` <- abs(data$DGEN_fut_5_Pers - data$DGEN_fut_10_Pers) -data$`5_10DGEN_fut_val` <- abs(data$DGEN_fut_5_Val - data$DGEN_fut_10_Val) - -# Verify variables were created -target_vars <- c("5_10DGEN_past_pref", "5_10DGEN_past_pers", "5_10DGEN_past_val", - "5_10DGEN_fut_pref", "5_10DGEN_fut_pers", "5_10DGEN_fut_val") - -print("\n=== VARIABLES CREATED ===") -print(target_vars) - -# Check for missing values -for(var in target_vars) { - n_missing <- sum(is.na(data[[var]])) - pct_missing <- round(100 * n_missing / nrow(data), 2) - print(sprintf("%s: %d missing (%.2f%%)", var, n_missing, pct_missing)) -} - -# Quality check: Display sample rows -print("\n=== QUALITY CHECK: Sample Calculations ===") -sample_rows <- sample(1:nrow(data), min(5, nrow(data))) - -for(i in sample_rows) { - print(sprintf("\nRow %d:", i)) - print(sprintf(" DGEN_past_5_Pref = %.2f, DGEN_past_10_Pref = %.2f", - data$DGEN_past_5_Pref[i], data$DGEN_past_10_Pref[i])) - print(sprintf(" → 5_10DGEN_past_pref = %.2f (expected: %.2f)", - data$`5_10DGEN_past_pref`[i], - abs(data$DGEN_past_5_Pref[i] - data$DGEN_past_10_Pref[i]))) - - print(sprintf(" DGEN_fut_5_Pers = %.2f, DGEN_fut_10_Pers = %.2f", - data$DGEN_fut_5_Pers[i], data$DGEN_fut_10_Pers[i])) - print(sprintf(" → 5_10DGEN_fut_pers = %.2f (expected: %.2f)", - data$`5_10DGEN_fut_pers`[i], - abs(data$DGEN_fut_5_Pers[i] - data$DGEN_fut_10_Pers[i]))) -} - -# Descriptive statistics -print("\n=== DESCRIPTIVE STATISTICS ===") -desc_stats <- data %>% - 
summarise(across(all_of(target_vars), - list(n = ~sum(!is.na(.)), - mean = ~round(mean(., na.rm = TRUE), 5), - sd = ~round(sd(., na.rm = TRUE), 5), - min = ~round(min(., na.rm = TRUE), 5), - max = ~round(max(., na.rm = TRUE), 5)), - .names = "{.col}_{.fn}")) - -print(t(desc_stats)) - -# Save to CSV -write.csv(data, "eohi2.csv", row.names = FALSE) - -print("\n=== PROCESSING COMPLETE ===") -print("Data saved to eohi2.csv") -print(paste("Total columns now:", ncol(data))) - diff --git a/.history/eohi2/dataP 08 - DGEN 510 vars_20251006194451.r b/.history/eohi2/dataP 08 - DGEN 510 vars_20251006194451.r deleted file mode 100644 index a9815f3..0000000 --- a/.history/eohi2/dataP 08 - DGEN 510 vars_20251006194451.r +++ /dev/null @@ -1,94 +0,0 @@ -# Script 08: Create 5_10 DGEN Variables -# PURPOSE: Calculate absolute differences between 5-year and 10-year DGEN ratings -# for both Past and Future directions -# VARIABLES CREATED: 6 total (3 domains × 2 time directions) - -library(tidyverse) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify source columns exist -source_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- source_vars[!source_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - stop(paste("ERROR: Missing source variables:", paste(missing_vars, collapse = ", "))) -} - -print("All source DGEN variables found!") - -# Calculate 5_10 DGEN variables (absolute differences between 5-year and 10-year) -# Formula: |DGEN_5 - DGEN_10| - -# PAST direction -data$`5_10DGEN_past_pref` <- abs(data$DGEN_past_5_Pref - data$DGEN_past_10_Pref) -data$`5_10DGEN_past_pers` <- abs(data$DGEN_past_5_Pers - data$DGEN_past_10_Pers) -data$`5_10DGEN_past_val` <- abs(data$DGEN_past_5_Val - data$DGEN_past_10_Val) - -# FUTURE direction -data$`5_10DGEN_fut_pref` <- abs(data$DGEN_fut_5_Pref - data$DGEN_fut_10_Pref) -data$`5_10DGEN_fut_pers` <- abs(data$DGEN_fut_5_Pers - data$DGEN_fut_10_Pers) -data$`5_10DGEN_fut_val` <- abs(data$DGEN_fut_5_Val - data$DGEN_fut_10_Val) - -# Verify variables were created -target_vars <- c("5_10DGEN_past_pref", "5_10DGEN_past_pers", "5_10DGEN_past_val", - "5_10DGEN_fut_pref", "5_10DGEN_fut_pers", "5_10DGEN_fut_val") - -print("\n=== VARIABLES CREATED ===") -print(target_vars) - -# Check for missing values -for(var in target_vars) { - n_missing <- sum(is.na(data[[var]])) - pct_missing <- round(100 * n_missing / nrow(data), 2) - print(sprintf("%s: %d missing (%.2f%%)", var, n_missing, pct_missing)) -} - -# Quality check: Display sample rows -print("\n=== QUALITY CHECK: Sample Calculations ===") -sample_rows <- sample(1:nrow(data), min(5, nrow(data))) - -for(i in sample_rows) { - print(sprintf("\nRow %d:", i)) - print(sprintf(" DGEN_past_5_Pref = %.2f, DGEN_past_10_Pref = %.2f", - data$DGEN_past_5_Pref[i], data$DGEN_past_10_Pref[i])) - print(sprintf(" → 5_10DGEN_past_pref = %.2f (expected: %.2f)", - data$`5_10DGEN_past_pref`[i], - abs(data$DGEN_past_5_Pref[i] - data$DGEN_past_10_Pref[i]))) - - print(sprintf(" DGEN_fut_5_Pers = %.2f, DGEN_fut_10_Pers = %.2f", - data$DGEN_fut_5_Pers[i], data$DGEN_fut_10_Pers[i])) - print(sprintf(" → 5_10DGEN_fut_pers = %.2f (expected: 
%.2f)", - data$`5_10DGEN_fut_pers`[i], - abs(data$DGEN_fut_5_Pers[i] - data$DGEN_fut_10_Pers[i]))) -} - -# Descriptive statistics -print("\n=== DESCRIPTIVE STATISTICS ===") -desc_stats <- data %>% - summarise(across(all_of(target_vars), - list(n = ~sum(!is.na(.)), - mean = ~round(mean(., na.rm = TRUE), 5), - sd = ~round(sd(., na.rm = TRUE), 5), - min = ~round(min(., na.rm = TRUE), 5), - max = ~round(max(., na.rm = TRUE), 5)), - .names = "{.col}_{.fn}")) - -print(t(desc_stats)) - -# Save to CSV -write.csv(data, "eohi2.csv", row.names = FALSE) - -print("\n=== PROCESSING COMPLETE ===") -print("Data saved to eohi2.csv") -print(paste("Total columns now:", ncol(data))) - diff --git a/.history/eohi2/dataP 08 - DGEN 510 vars_20251006195055.r b/.history/eohi2/dataP 08 - DGEN 510 vars_20251006195055.r deleted file mode 100644 index ce186a2..0000000 --- a/.history/eohi2/dataP 08 - DGEN 510 vars_20251006195055.r +++ /dev/null @@ -1,95 +0,0 @@ -# Script 08: Create 5_10 DGEN Variables -# PURPOSE: Calculate absolute differences between 5-year and 10-year DGEN ratings -# for both Past and Future directions -# VARIABLES CREATED: 6 total (3 domains × 2 time directions) - -library(tidyverse) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify source columns exist -source_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- source_vars[!source_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - stop(paste("ERROR: Missing source variables:", paste(missing_vars, collapse = ", "))) -} - -print("All source DGEN variables found!") - -# Calculate 5_10 DGEN variables (absolute differences between 5-year and 10-year) -# Formula: |DGEN_5 - DGEN_10| -# NOTE: Using X prefix because R adds it to column names starting with numbers - -# PAST direction -data$X5_10DGEN_past_pref <- abs(data$DGEN_past_5_Pref - data$DGEN_past_10_Pref) -data$X5_10DGEN_past_pers <- abs(data$DGEN_past_5_Pers - data$DGEN_past_10_Pers) -data$X5_10DGEN_past_val <- abs(data$DGEN_past_5_Val - data$DGEN_past_10_Val) - -# FUTURE direction -data$X5_10DGEN_fut_pref <- abs(data$DGEN_fut_5_Pref - data$DGEN_fut_10_Pref) -data$X5_10DGEN_fut_pers <- abs(data$DGEN_fut_5_Pers - data$DGEN_fut_10_Pers) -data$X5_10DGEN_fut_val <- abs(data$DGEN_fut_5_Val - data$DGEN_fut_10_Val) - -# Verify variables were created -target_vars <- c("X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -print("\n=== VARIABLES CREATED ===") -print(target_vars) - -# Check for missing values -for(var in target_vars) { - n_missing <- sum(is.na(data[[var]])) - pct_missing <- round(100 * n_missing / nrow(data), 2) - print(sprintf("%s: %d missing (%.2f%%)", var, n_missing, pct_missing)) -} - -# Quality check: Display sample rows -print("\n=== QUALITY CHECK: Sample Calculations ===") -sample_rows <- sample(1:nrow(data), min(5, nrow(data))) - -for(i in sample_rows) { - print(sprintf("\nRow %d:", i)) - print(sprintf(" DGEN_past_5_Pref = %.2f, DGEN_past_10_Pref = %.2f", - data$DGEN_past_5_Pref[i], data$DGEN_past_10_Pref[i])) - print(sprintf(" → 5_10DGEN_past_pref = %.2f (expected: 
%.2f)", - data$`5_10DGEN_past_pref`[i], - abs(data$DGEN_past_5_Pref[i] - data$DGEN_past_10_Pref[i]))) - - print(sprintf(" DGEN_fut_5_Pers = %.2f, DGEN_fut_10_Pers = %.2f", - data$DGEN_fut_5_Pers[i], data$DGEN_fut_10_Pers[i])) - print(sprintf(" → 5_10DGEN_fut_pers = %.2f (expected: %.2f)", - data$`5_10DGEN_fut_pers`[i], - abs(data$DGEN_fut_5_Pers[i] - data$DGEN_fut_10_Pers[i]))) -} - -# Descriptive statistics -print("\n=== DESCRIPTIVE STATISTICS ===") -desc_stats <- data %>% - summarise(across(all_of(target_vars), - list(n = ~sum(!is.na(.)), - mean = ~round(mean(., na.rm = TRUE), 5), - sd = ~round(sd(., na.rm = TRUE), 5), - min = ~round(min(., na.rm = TRUE), 5), - max = ~round(max(., na.rm = TRUE), 5)), - .names = "{.col}_{.fn}")) - -print(t(desc_stats)) - -# Save to CSV -write.csv(data, "eohi2.csv", row.names = FALSE) - -print("\n=== PROCESSING COMPLETE ===") -print("Data saved to eohi2.csv") -print(paste("Total columns now:", ncol(data))) - diff --git a/.history/eohi2/dataP 08 - DGEN 510 vars_20251006195109.r b/.history/eohi2/dataP 08 - DGEN 510 vars_20251006195109.r deleted file mode 100644 index d50239d..0000000 --- a/.history/eohi2/dataP 08 - DGEN 510 vars_20251006195109.r +++ /dev/null @@ -1,95 +0,0 @@ -# Script 08: Create 5_10 DGEN Variables -# PURPOSE: Calculate absolute differences between 5-year and 10-year DGEN ratings -# for both Past and Future directions -# VARIABLES CREATED: 6 total (3 domains × 2 time directions) - -library(tidyverse) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify source columns exist -source_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- source_vars[!source_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - stop(paste("ERROR: Missing source variables:", paste(missing_vars, collapse = ", "))) -} - -print("All source DGEN variables found!") - -# Calculate 5_10 DGEN variables (absolute differences between 5-year and 10-year) -# Formula: |DGEN_5 - DGEN_10| -# NOTE: Using X prefix because R adds it to column names starting with numbers - -# PAST direction -data$X5_10DGEN_past_pref <- abs(data$DGEN_past_5_Pref - data$DGEN_past_10_Pref) -data$X5_10DGEN_past_pers <- abs(data$DGEN_past_5_Pers - data$DGEN_past_10_Pers) -data$X5_10DGEN_past_val <- abs(data$DGEN_past_5_Val - data$DGEN_past_10_Val) - -# FUTURE direction -data$X5_10DGEN_fut_pref <- abs(data$DGEN_fut_5_Pref - data$DGEN_fut_10_Pref) -data$X5_10DGEN_fut_pers <- abs(data$DGEN_fut_5_Pers - data$DGEN_fut_10_Pers) -data$X5_10DGEN_fut_val <- abs(data$DGEN_fut_5_Val - data$DGEN_fut_10_Val) - -# Verify variables were created -target_vars <- c("X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -print("\n=== VARIABLES CREATED ===") -print(target_vars) - -# Check for missing values -for(var in target_vars) { - n_missing <- sum(is.na(data[[var]])) - pct_missing <- round(100 * n_missing / nrow(data), 2) - print(sprintf("%s: %d missing (%.2f%%)", var, n_missing, pct_missing)) -} - -# Quality check: Display sample rows -print("\n=== QUALITY CHECK: Sample Calculations ===") -sample_rows <- 
sample(1:nrow(data), min(5, nrow(data))) - -for(i in sample_rows) { - print(sprintf("\nRow %d:", i)) - print(sprintf(" DGEN_past_5_Pref = %.2f, DGEN_past_10_Pref = %.2f", - data$DGEN_past_5_Pref[i], data$DGEN_past_10_Pref[i])) - print(sprintf(" → X5_10DGEN_past_pref = %.2f (expected: %.2f)", - data$X5_10DGEN_past_pref[i], - abs(data$DGEN_past_5_Pref[i] - data$DGEN_past_10_Pref[i]))) - - print(sprintf(" DGEN_fut_5_Pers = %.2f, DGEN_fut_10_Pers = %.2f", - data$DGEN_fut_5_Pers[i], data$DGEN_fut_10_Pers[i])) - print(sprintf(" → X5_10DGEN_fut_pers = %.2f (expected: %.2f)", - data$X5_10DGEN_fut_pers[i], - abs(data$DGEN_fut_5_Pers[i] - data$DGEN_fut_10_Pers[i]))) -} - -# Descriptive statistics -print("\n=== DESCRIPTIVE STATISTICS ===") -desc_stats <- data %>% - summarise(across(all_of(target_vars), - list(n = ~sum(!is.na(.)), - mean = ~round(mean(., na.rm = TRUE), 5), - sd = ~round(sd(., na.rm = TRUE), 5), - min = ~round(min(., na.rm = TRUE), 5), - max = ~round(max(., na.rm = TRUE), 5)), - .names = "{.col}_{.fn}")) - -print(t(desc_stats)) - -# Save to CSV -write.csv(data, "eohi2.csv", row.names = FALSE) - -print("\n=== PROCESSING COMPLETE ===") -print("Data saved to eohi2.csv") -print(paste("Total columns now:", ncol(data))) - diff --git a/.history/eohi2/dataP 08 - DGEN 510 vars_20251006195118.r b/.history/eohi2/dataP 08 - DGEN 510 vars_20251006195118.r deleted file mode 100644 index d50239d..0000000 --- a/.history/eohi2/dataP 08 - DGEN 510 vars_20251006195118.r +++ /dev/null @@ -1,95 +0,0 @@ -# Script 08: Create 5_10 DGEN Variables -# PURPOSE: Calculate absolute differences between 5-year and 10-year DGEN ratings -# for both Past and Future directions -# VARIABLES CREATED: 6 total (3 domains × 2 time directions) - -library(tidyverse) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify source columns exist -source_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- source_vars[!source_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - stop(paste("ERROR: Missing source variables:", paste(missing_vars, collapse = ", "))) -} - -print("All source DGEN variables found!") - -# Calculate 5_10 DGEN variables (absolute differences between 5-year and 10-year) -# Formula: |DGEN_5 - DGEN_10| -# NOTE: Using X prefix because R adds it to column names starting with numbers - -# PAST direction -data$X5_10DGEN_past_pref <- abs(data$DGEN_past_5_Pref - data$DGEN_past_10_Pref) -data$X5_10DGEN_past_pers <- abs(data$DGEN_past_5_Pers - data$DGEN_past_10_Pers) -data$X5_10DGEN_past_val <- abs(data$DGEN_past_5_Val - data$DGEN_past_10_Val) - -# FUTURE direction -data$X5_10DGEN_fut_pref <- abs(data$DGEN_fut_5_Pref - data$DGEN_fut_10_Pref) -data$X5_10DGEN_fut_pers <- abs(data$DGEN_fut_5_Pers - data$DGEN_fut_10_Pers) -data$X5_10DGEN_fut_val <- abs(data$DGEN_fut_5_Val - data$DGEN_fut_10_Val) - -# Verify variables were created -target_vars <- c("X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -print("\n=== VARIABLES CREATED ===") -print(target_vars) - -# Check for missing values -for(var in target_vars) { - 
n_missing <- sum(is.na(data[[var]])) - pct_missing <- round(100 * n_missing / nrow(data), 2) - print(sprintf("%s: %d missing (%.2f%%)", var, n_missing, pct_missing)) -} - -# Quality check: Display sample rows -print("\n=== QUALITY CHECK: Sample Calculations ===") -sample_rows <- sample(1:nrow(data), min(5, nrow(data))) - -for(i in sample_rows) { - print(sprintf("\nRow %d:", i)) - print(sprintf(" DGEN_past_5_Pref = %.2f, DGEN_past_10_Pref = %.2f", - data$DGEN_past_5_Pref[i], data$DGEN_past_10_Pref[i])) - print(sprintf(" → X5_10DGEN_past_pref = %.2f (expected: %.2f)", - data$X5_10DGEN_past_pref[i], - abs(data$DGEN_past_5_Pref[i] - data$DGEN_past_10_Pref[i]))) - - print(sprintf(" DGEN_fut_5_Pers = %.2f, DGEN_fut_10_Pers = %.2f", - data$DGEN_fut_5_Pers[i], data$DGEN_fut_10_Pers[i])) - print(sprintf(" → X5_10DGEN_fut_pers = %.2f (expected: %.2f)", - data$X5_10DGEN_fut_pers[i], - abs(data$DGEN_fut_5_Pers[i] - data$DGEN_fut_10_Pers[i]))) -} - -# Descriptive statistics -print("\n=== DESCRIPTIVE STATISTICS ===") -desc_stats <- data %>% - summarise(across(all_of(target_vars), - list(n = ~sum(!is.na(.)), - mean = ~round(mean(., na.rm = TRUE), 5), - sd = ~round(sd(., na.rm = TRUE), 5), - min = ~round(min(., na.rm = TRUE), 5), - max = ~round(max(., na.rm = TRUE), 5)), - .names = "{.col}_{.fn}")) - -print(t(desc_stats)) - -# Save to CSV -write.csv(data, "eohi2.csv", row.names = FALSE) - -print("\n=== PROCESSING COMPLETE ===") -print("Data saved to eohi2.csv") -print(paste("Total columns now:", ncol(data))) - diff --git a/.history/eohi2/dataP 08 - DGEN 510 vars_20251006195128.r b/.history/eohi2/dataP 08 - DGEN 510 vars_20251006195128.r deleted file mode 100644 index d50239d..0000000 --- a/.history/eohi2/dataP 08 - DGEN 510 vars_20251006195128.r +++ /dev/null @@ -1,95 +0,0 @@ -# Script 08: Create 5_10 DGEN Variables -# PURPOSE: Calculate absolute differences between 5-year and 10-year DGEN ratings -# for both Past and Future directions -# VARIABLES CREATED: 6 total (3 domains × 2 time directions) - -library(tidyverse) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify source columns exist -source_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- source_vars[!source_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - stop(paste("ERROR: Missing source variables:", paste(missing_vars, collapse = ", "))) -} - -print("All source DGEN variables found!") - -# Calculate 5_10 DGEN variables (absolute differences between 5-year and 10-year) -# Formula: |DGEN_5 - DGEN_10| -# NOTE: Using X prefix because R adds it to column names starting with numbers - -# PAST direction -data$X5_10DGEN_past_pref <- abs(data$DGEN_past_5_Pref - data$DGEN_past_10_Pref) -data$X5_10DGEN_past_pers <- abs(data$DGEN_past_5_Pers - data$DGEN_past_10_Pers) -data$X5_10DGEN_past_val <- abs(data$DGEN_past_5_Val - data$DGEN_past_10_Val) - -# FUTURE direction -data$X5_10DGEN_fut_pref <- abs(data$DGEN_fut_5_Pref - data$DGEN_fut_10_Pref) -data$X5_10DGEN_fut_pers <- abs(data$DGEN_fut_5_Pers - data$DGEN_fut_10_Pers) -data$X5_10DGEN_fut_val <- abs(data$DGEN_fut_5_Val - data$DGEN_fut_10_Val) - -# Verify variables were 
created -target_vars <- c("X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -print("\n=== VARIABLES CREATED ===") -print(target_vars) - -# Check for missing values -for(var in target_vars) { - n_missing <- sum(is.na(data[[var]])) - pct_missing <- round(100 * n_missing / nrow(data), 2) - print(sprintf("%s: %d missing (%.2f%%)", var, n_missing, pct_missing)) -} - -# Quality check: Display sample rows -print("\n=== QUALITY CHECK: Sample Calculations ===") -sample_rows <- sample(1:nrow(data), min(5, nrow(data))) - -for(i in sample_rows) { - print(sprintf("\nRow %d:", i)) - print(sprintf(" DGEN_past_5_Pref = %.2f, DGEN_past_10_Pref = %.2f", - data$DGEN_past_5_Pref[i], data$DGEN_past_10_Pref[i])) - print(sprintf(" → X5_10DGEN_past_pref = %.2f (expected: %.2f)", - data$X5_10DGEN_past_pref[i], - abs(data$DGEN_past_5_Pref[i] - data$DGEN_past_10_Pref[i]))) - - print(sprintf(" DGEN_fut_5_Pers = %.2f, DGEN_fut_10_Pers = %.2f", - data$DGEN_fut_5_Pers[i], data$DGEN_fut_10_Pers[i])) - print(sprintf(" → X5_10DGEN_fut_pers = %.2f (expected: %.2f)", - data$X5_10DGEN_fut_pers[i], - abs(data$DGEN_fut_5_Pers[i] - data$DGEN_fut_10_Pers[i]))) -} - -# Descriptive statistics -print("\n=== DESCRIPTIVE STATISTICS ===") -desc_stats <- data %>% - summarise(across(all_of(target_vars), - list(n = ~sum(!is.na(.)), - mean = ~round(mean(., na.rm = TRUE), 5), - sd = ~round(sd(., na.rm = TRUE), 5), - min = ~round(min(., na.rm = TRUE), 5), - max = ~round(max(., na.rm = TRUE), 5)), - .names = "{.col}_{.fn}")) - -print(t(desc_stats)) - -# Save to CSV -write.csv(data, "eohi2.csv", row.names = FALSE) - -print("\n=== PROCESSING COMPLETE ===") -print("Data saved to eohi2.csv") -print(paste("Total columns now:", ncol(data))) - diff --git a/.history/eohi2/dataP 08 - DGEN 510 vars_20251006195318.r b/.history/eohi2/dataP 08 - DGEN 510 vars_20251006195318.r deleted file mode 100644 index d50239d..0000000 --- a/.history/eohi2/dataP 08 - DGEN 510 vars_20251006195318.r +++ /dev/null @@ -1,95 +0,0 @@ -# Script 08: Create 5_10 DGEN Variables -# PURPOSE: Calculate absolute differences between 5-year and 10-year DGEN ratings -# for both Past and Future directions -# VARIABLES CREATED: 6 total (3 domains × 2 time directions) - -library(tidyverse) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify source columns exist -source_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- source_vars[!source_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - stop(paste("ERROR: Missing source variables:", paste(missing_vars, collapse = ", "))) -} - -print("All source DGEN variables found!") - -# Calculate 5_10 DGEN variables (absolute differences between 5-year and 10-year) -# Formula: |DGEN_5 - DGEN_10| -# NOTE: Using X prefix because R adds it to column names starting with numbers - -# PAST direction -data$X5_10DGEN_past_pref <- abs(data$DGEN_past_5_Pref - data$DGEN_past_10_Pref) -data$X5_10DGEN_past_pers <- abs(data$DGEN_past_5_Pers - data$DGEN_past_10_Pers) -data$X5_10DGEN_past_val <- abs(data$DGEN_past_5_Val - data$DGEN_past_10_Val) - 
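# Why the X prefix appears (a minimal sketch, assuming the data file is later
# re-read with base R's read.csv()): the default check.names = TRUE passes the
# CSV header through make.names(), which prepends "X" to any name that starts
# with a digit, so "5_10DGEN_past_pref" comes back as "X5_10DGEN_past_pref".
make.names("5_10DGEN_past_pref")               # returns "X5_10DGEN_past_pref"
# read.csv("eohi2.csv", check.names = FALSE)   # would preserve the raw names instead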
-# FUTURE direction -data$X5_10DGEN_fut_pref <- abs(data$DGEN_fut_5_Pref - data$DGEN_fut_10_Pref) -data$X5_10DGEN_fut_pers <- abs(data$DGEN_fut_5_Pers - data$DGEN_fut_10_Pers) -data$X5_10DGEN_fut_val <- abs(data$DGEN_fut_5_Val - data$DGEN_fut_10_Val) - -# Verify variables were created -target_vars <- c("X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -print("\n=== VARIABLES CREATED ===") -print(target_vars) - -# Check for missing values -for(var in target_vars) { - n_missing <- sum(is.na(data[[var]])) - pct_missing <- round(100 * n_missing / nrow(data), 2) - print(sprintf("%s: %d missing (%.2f%%)", var, n_missing, pct_missing)) -} - -# Quality check: Display sample rows -print("\n=== QUALITY CHECK: Sample Calculations ===") -sample_rows <- sample(1:nrow(data), min(5, nrow(data))) - -for(i in sample_rows) { - print(sprintf("\nRow %d:", i)) - print(sprintf(" DGEN_past_5_Pref = %.2f, DGEN_past_10_Pref = %.2f", - data$DGEN_past_5_Pref[i], data$DGEN_past_10_Pref[i])) - print(sprintf(" → X5_10DGEN_past_pref = %.2f (expected: %.2f)", - data$X5_10DGEN_past_pref[i], - abs(data$DGEN_past_5_Pref[i] - data$DGEN_past_10_Pref[i]))) - - print(sprintf(" DGEN_fut_5_Pers = %.2f, DGEN_fut_10_Pers = %.2f", - data$DGEN_fut_5_Pers[i], data$DGEN_fut_10_Pers[i])) - print(sprintf(" → X5_10DGEN_fut_pers = %.2f (expected: %.2f)", - data$X5_10DGEN_fut_pers[i], - abs(data$DGEN_fut_5_Pers[i] - data$DGEN_fut_10_Pers[i]))) -} - -# Descriptive statistics -print("\n=== DESCRIPTIVE STATISTICS ===") -desc_stats <- data %>% - summarise(across(all_of(target_vars), - list(n = ~sum(!is.na(.)), - mean = ~round(mean(., na.rm = TRUE), 5), - sd = ~round(sd(., na.rm = TRUE), 5), - min = ~round(min(., na.rm = TRUE), 5), - max = ~round(max(., na.rm = TRUE), 5)), - .names = "{.col}_{.fn}")) - -print(t(desc_stats)) - -# Save to CSV -write.csv(data, "eohi2.csv", row.names = FALSE) - -print("\n=== PROCESSING COMPLETE ===") -print("Data saved to eohi2.csv") -print(paste("Total columns now:", ncol(data))) - diff --git a/.history/eohi2/dataP 09 - interval x direction means_20251008113501.r b/.history/eohi2/dataP 09 - interval x direction means_20251008113501.r deleted file mode 100644 index 6bd02b7..0000000 --- a/.history/eohi2/dataP 09 - interval x direction means_20251008113501.r +++ /dev/null @@ -1,223 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load data -data <- read.csv("eohi2.csv") - -# Set 1: NPast_5_mean (15 variables) -data$NPast_5_mean <- rowMeans(data[, c( - "NPast_5_pref_read", "NPast_5_pref_music", "NPast_5_pref_TV", "NPast_5_pref_nap", "NPast_5_pref_travel", - "NPast_5_pers_extravert", "NPast_5_pers_critical", "NPast_5_pers_dependable", "NPast_5_pers_anxious", "NPast_5_pers_complex", - "NPast_5_val_obey", "NPast_5_val_trad", "NPast_5_val_opinion", "NPast_5_val_performance", "NPast_5_val_justice" -)], na.rm = TRUE) - -# Set 2: NPast_10_mean (15 variables) -data$NPast_10_mean <- rowMeans(data[, c( - "NPast_10_pref_read", "NPast_10_pref_music", "NPast_10_pref_TV", "NPast_10_pref_nap", "NPast_10_pref_travel", - "NPast_10_pers_extravert", "NPast_10_pers_critical", "NPast_10_pers_dependable", "NPast_10_pers_anxious", "NPast_10_pers_complex", - "NPast_10_val_obey", "NPast_10_val_trad", "NPast_10_val_opinion", "NPast_10_val_performance", "NPast_10_val_justice" -)], na.rm = TRUE) - -# Set 3: NFut_5_mean (15 variables) -data$NFut_5_mean <- rowMeans(data[, c( - "NFut_5_pref_read", "NFut_5_pref_music", 
"NFut_5_pref_TV", "NFut_5_pref_nap", "NFut_5_pref_travel", - "NFut_5_pers_extravert", "NFut_5_pers_critical", "NFut_5_pers_dependable", "NFut_5_pers_anxious", "NFut_5_pers_complex", - "NFut_5_val_obey", "NFut_5_val_trad", "NFut_5_val_opinion", "NFut_5_val_performance", "NFut_5_val_justice" -)], na.rm = TRUE) - -# Set 4: NFut_10_mean (15 variables) -data$NFut_10_mean <- rowMeans(data[, c( - "NFut_10_pref_read", "NFut_10_pref_music", "NFut_10_pref_TV", "NFut_10_pref_nap", "NFut_10_pref_travel", - "NFut_10_pers_extravert", "NFut_10_pers_critical", "NFut_10_pers_dependable", "NFut_10_pers_anxious", "NFut_10_pers_complex", - "NFut_10_val_obey", "NFut_10_val_trad", "NFut_10_val_opinion", "NFut_10_val_performance", "NFut_10_val_justice" -)], na.rm = TRUE) - -# Set 5: X5.10past_mean (15 variables) -data$X5.10past_mean <- rowMeans(data[, c( - "X5.10past_pref_read", "X5.10past_pref_music", "X5.10past_pref_TV", "X5.10past_pref_nap", "X5.10past_pref_travel", - "X5.10past_pers_extravert", "X5.10past_pers_critical", "X5.10past_pers_dependable", "X5.10past_pers_anxious", "X5.10past_pers_complex", - "X5.10past_val_obey", "X5.10past_val_trad", "X5.10past_val_opinion", "X5.10past_val_performance", "X5.10past_val_justice" -)], na.rm = TRUE) - -# Set 6: X5.10fut_mean (15 variables) -data$X5.10fut_mean <- rowMeans(data[, c( - "X5.10fut_pref_read", "X5.10fut_pref_music", "X5.10fut_pref_TV", "X5.10fut_pref_nap", "X5.10fut_pref_travel", - "X5.10fut_pers_extravert", "X5.10fut_pers_critical", "X5.10fut_pers_dependable", "X5.10fut_pers_anxious", "X5.10fut_pers_complex", - "X5.10fut_val_obey", "X5.10fut_val_trad", "X5.10fut_val_opinion", "X5.10fut_val_performance", "X5.10fut_val_justice" -)], na.rm = TRUE) - -# Set 7: NPast_global_mean (30 variables - NPast_5 + NPast_10) -data$NPast_global_mean <- rowMeans(data[, c( - "NPast_5_pref_read", "NPast_5_pref_music", "NPast_5_pref_TV", "NPast_5_pref_nap", "NPast_5_pref_travel", - "NPast_5_pers_extravert", "NPast_5_pers_critical", "NPast_5_pers_dependable", "NPast_5_pers_anxious", "NPast_5_pers_complex", - "NPast_5_val_obey", "NPast_5_val_trad", "NPast_5_val_opinion", "NPast_5_val_performance", "NPast_5_val_justice", - "NPast_10_pref_read", "NPast_10_pref_music", "NPast_10_pref_TV", "NPast_10_pref_nap", "NPast_10_pref_travel", - "NPast_10_pers_extravert", "NPast_10_pers_critical", "NPast_10_pers_dependable", "NPast_10_pers_anxious", "NPast_10_pers_complex", - "NPast_10_val_obey", "NPast_10_val_trad", "NPast_10_val_opinion", "NPast_10_val_performance", "NPast_10_val_justice" -)], na.rm = TRUE) - -# Set 8: NFut_global_mean (30 variables - NFut_5 + NFut_10) -data$NFut_global_mean <- rowMeans(data[, c( - "NFut_5_pref_read", "NFut_5_pref_music", "NFut_5_pref_TV", "NFut_5_pref_nap", "NFut_5_pref_travel", - "NFut_5_pers_extravert", "NFut_5_pers_critical", "NFut_5_pers_dependable", "NFut_5_pers_anxious", "NFut_5_pers_complex", - "NFut_5_val_obey", "NFut_5_val_trad", "NFut_5_val_opinion", "NFut_5_val_performance", "NFut_5_val_justice", - "NFut_10_pref_read", "NFut_10_pref_music", "NFut_10_pref_TV", "NFut_10_pref_nap", "NFut_10_pref_travel", - "NFut_10_pers_extravert", "NFut_10_pers_critical", "NFut_10_pers_dependable", "NFut_10_pers_anxious", "NFut_10_pers_complex", - "NFut_10_val_obey", "NFut_10_val_trad", "NFut_10_val_opinion", "NFut_10_val_performance", "NFut_10_val_justice" -)], na.rm = TRUE) - -# Set 9: X5.10_global_mean (30 variables - X5.10past + X5.10fut) -data$X5.10_global_mean <- rowMeans(data[, c( - "X5.10past_pref_read", "X5.10past_pref_music", "X5.10past_pref_TV", 
"X5.10past_pref_nap", "X5.10past_pref_travel", - "X5.10past_pers_extravert", "X5.10past_pers_critical", "X5.10past_pers_dependable", "X5.10past_pers_anxious", "X5.10past_pers_complex", - "X5.10past_val_obey", "X5.10past_val_trad", "X5.10past_val_opinion", "X5.10past_val_performance", "X5.10past_val_justice", - "X5.10fut_pref_read", "X5.10fut_pref_music", "X5.10fut_pref_TV", "X5.10fut_pref_nap", "X5.10fut_pref_travel", - "X5.10fut_pers_extravert", "X5.10fut_pers_critical", "X5.10fut_pers_dependable", "X5.10fut_pers_anxious", "X5.10fut_pers_complex", - "X5.10fut_val_obey", "X5.10fut_val_trad", "X5.10fut_val_opinion", "X5.10fut_val_performance", "X5.10fut_val_justice" -)], na.rm = TRUE) - -# Set 10: N5_global_mean (30 variables - NPast_5 + NFut_5) -data$N5_global_mean <- rowMeans(data[, c( - "NPast_5_pref_read", "NPast_5_pref_music", "NPast_5_pref_TV", "NPast_5_pref_nap", "NPast_5_pref_travel", - "NPast_5_pers_extravert", "NPast_5_pers_critical", "NPast_5_pers_dependable", "NPast_5_pers_anxious", "NPast_5_pers_complex", - "NPast_5_val_obey", "NPast_5_val_trad", "NPast_5_val_opinion", "NPast_5_val_performance", "NPast_5_val_justice", - "NFut_5_pref_read", "NFut_5_pref_music", "NFut_5_pref_TV", "NFut_5_pref_nap", "NFut_5_pref_travel", - "NFut_5_pers_extravert", "NFut_5_pers_critical", "NFut_5_pers_dependable", "NFut_5_pers_anxious", "NFut_5_pers_complex", - "NFut_5_val_obey", "NFut_5_val_trad", "NFut_5_val_opinion", "NFut_5_val_performance", "NFut_5_val_justice" -)], na.rm = TRUE) - -# Set 11: N10_global_mean (30 variables - NPast_10 + NFut_10) -data$N10_global_mean <- rowMeans(data[, c( - "NPast_10_pref_read", "NPast_10_pref_music", "NPast_10_pref_TV", "NPast_10_pref_nap", "NPast_10_pref_travel", - "NPast_10_pers_extravert", "NPast_10_pers_critical", "NPast_10_pers_dependable", "NPast_10_pers_anxious", "NPast_10_pers_complex", - "NPast_10_val_obey", "NPast_10_val_trad", "NPast_10_val_opinion", "NPast_10_val_performance", "NPast_10_val_justice", - "NFut_10_pref_read", "NFut_10_pref_music", "NFut_10_pref_TV", "NFut_10_pref_nap", "NFut_10_pref_travel", - "NFut_10_pers_extravert", "NFut_10_pers_critical", "NFut_10_pers_dependable", "NFut_10_pers_anxious", "NFut_10_pers_complex", - "NFut_10_val_obey", "NFut_10_val_trad", "NFut_10_val_opinion", "NFut_10_val_performance", "NFut_10_val_justice" -)], na.rm = TRUE) - -# Save the data -write.csv(data, "eohi2.csv", row.names = FALSE) - -# ===== QA CODE: Check first 5 rows ===== -cat("\n=== QUALITY ASSURANCE: Checking calculations for first 5 rows ===\n\n") - -for (i in 1:min(5, nrow(data))) { - cat("--- Row", i, "---\n") - - # Set 1: NPast_5_mean - calc1 <- mean(as.numeric(data[i, c( - "NPast_5_pref_read", "NPast_5_pref_music", "NPast_5_pref_TV", "NPast_5_pref_nap", "NPast_5_pref_travel", - "NPast_5_pers_extravert", "NPast_5_pers_critical", "NPast_5_pers_dependable", "NPast_5_pers_anxious", "NPast_5_pers_complex", - "NPast_5_val_obey", "NPast_5_val_trad", "NPast_5_val_opinion", "NPast_5_val_performance", "NPast_5_val_justice" - )]), na.rm = TRUE) - cat("NPast_5_mean: Calculated =", calc1, "| Stored =", data$NPast_5_mean[i], - "| Match:", isTRUE(all.equal(calc1, data$NPast_5_mean[i])), "\n") - - # Set 2: NPast_10_mean - calc2 <- mean(as.numeric(data[i, c( - "NPast_10_pref_read", "NPast_10_pref_music", "NPast_10_pref_TV", "NPast_10_pref_nap", "NPast_10_pref_travel", - "NPast_10_pers_extravert", "NPast_10_pers_critical", "NPast_10_pers_dependable", "NPast_10_pers_anxious", "NPast_10_pers_complex", - "NPast_10_val_obey", "NPast_10_val_trad", 
"NPast_10_val_opinion", "NPast_10_val_performance", "NPast_10_val_justice" - )]), na.rm = TRUE) - cat("NPast_10_mean: Calculated =", calc2, "| Stored =", data$NPast_10_mean[i], - "| Match:", isTRUE(all.equal(calc2, data$NPast_10_mean[i])), "\n") - - # Set 3: NFut_5_mean - calc3 <- mean(as.numeric(data[i, c( - "NFut_5_pref_read", "NFut_5_pref_music", "NFut_5_pref_TV", "NFut_5_pref_nap", "NFut_5_pref_travel", - "NFut_5_pers_extravert", "NFut_5_pers_critical", "NFut_5_pers_dependable", "NFut_5_pers_anxious", "NFut_5_pers_complex", - "NFut_5_val_obey", "NFut_5_val_trad", "NFut_5_val_opinion", "NFut_5_val_performance", "NFut_5_val_justice" - )]), na.rm = TRUE) - cat("NFut_5_mean: Calculated =", calc3, "| Stored =", data$NFut_5_mean[i], - "| Match:", isTRUE(all.equal(calc3, data$NFut_5_mean[i])), "\n") - - # Set 4: NFut_10_mean - calc4 <- mean(as.numeric(data[i, c( - "NFut_10_pref_read", "NFut_10_pref_music", "NFut_10_pref_TV", "NFut_10_pref_nap", "NFut_10_pref_travel", - "NFut_10_pers_extravert", "NFut_10_pers_critical", "NFut_10_pers_dependable", "NFut_10_pers_anxious", "NFut_10_pers_complex", - "NFut_10_val_obey", "NFut_10_val_trad", "NFut_10_val_opinion", "NFut_10_val_performance", "NFut_10_val_justice" - )]), na.rm = TRUE) - cat("NFut_10_mean: Calculated =", calc4, "| Stored =", data$NFut_10_mean[i], - "| Match:", isTRUE(all.equal(calc4, data$NFut_10_mean[i])), "\n") - - # Set 5: X5.10past_mean - calc5 <- mean(as.numeric(data[i, c( - "X5.10past_pref_read", "X5.10past_pref_music", "X5.10past_pref_TV", "X5.10past_pref_nap", "X5.10past_pref_travel", - "X5.10past_pers_extravert", "X5.10past_pers_critical", "X5.10past_pers_dependable", "X5.10past_pers_anxious", "X5.10past_pers_complex", - "X5.10past_val_obey", "X5.10past_val_trad", "X5.10past_val_opinion", "X5.10past_val_performance", "X5.10past_val_justice" - )]), na.rm = TRUE) - cat("X5.10past_mean: Calculated =", calc5, "| Stored =", data$X5.10past_mean[i], - "| Match:", isTRUE(all.equal(calc5, data$X5.10past_mean[i])), "\n") - - # Set 6: X5.10fut_mean - calc6 <- mean(as.numeric(data[i, c( - "X5.10fut_pref_read", "X5.10fut_pref_music", "X5.10fut_pref_TV", "X5.10fut_pref_nap", "X5.10fut_pref_travel", - "X5.10fut_pers_extravert", "X5.10fut_pers_critical", "X5.10fut_pers_dependable", "X5.10fut_pers_anxious", "X5.10fut_pers_complex", - "X5.10fut_val_obey", "X5.10fut_val_trad", "X5.10fut_val_opinion", "X5.10fut_val_performance", "X5.10fut_val_justice" - )]), na.rm = TRUE) - cat("X5.10fut_mean: Calculated =", calc6, "| Stored =", data$X5.10fut_mean[i], - "| Match:", isTRUE(all.equal(calc6, data$X5.10fut_mean[i])), "\n") - - # Set 7: NPast_global_mean - calc7 <- mean(as.numeric(data[i, c( - "NPast_5_pref_read", "NPast_5_pref_music", "NPast_5_pref_TV", "NPast_5_pref_nap", "NPast_5_pref_travel", - "NPast_5_pers_extravert", "NPast_5_pers_critical", "NPast_5_pers_dependable", "NPast_5_pers_anxious", "NPast_5_pers_complex", - "NPast_5_val_obey", "NPast_5_val_trad", "NPast_5_val_opinion", "NPast_5_val_performance", "NPast_5_val_justice", - "NPast_10_pref_read", "NPast_10_pref_music", "NPast_10_pref_TV", "NPast_10_pref_nap", "NPast_10_pref_travel", - "NPast_10_pers_extravert", "NPast_10_pers_critical", "NPast_10_pers_dependable", "NPast_10_pers_anxious", "NPast_10_pers_complex", - "NPast_10_val_obey", "NPast_10_val_trad", "NPast_10_val_opinion", "NPast_10_val_performance", "NPast_10_val_justice" - )]), na.rm = TRUE) - cat("NPast_global_mean: Calculated =", calc7, "| Stored =", data$NPast_global_mean[i], - "| Match:", isTRUE(all.equal(calc7, 
data$NPast_global_mean[i])), "\n") - - # Set 8: NFut_global_mean - calc8 <- mean(as.numeric(data[i, c( - "NFut_5_pref_read", "NFut_5_pref_music", "NFut_5_pref_TV", "NFut_5_pref_nap", "NFut_5_pref_travel", - "NFut_5_pers_extravert", "NFut_5_pers_critical", "NFut_5_pers_dependable", "NFut_5_pers_anxious", "NFut_5_pers_complex", - "NFut_5_val_obey", "NFut_5_val_trad", "NFut_5_val_opinion", "NFut_5_val_performance", "NFut_5_val_justice", - "NFut_10_pref_read", "NFut_10_pref_music", "NFut_10_pref_TV", "NFut_10_pref_nap", "NFut_10_pref_travel", - "NFut_10_pers_extravert", "NFut_10_pers_critical", "NFut_10_pers_dependable", "NFut_10_pers_anxious", "NFut_10_pers_complex", - "NFut_10_val_obey", "NFut_10_val_trad", "NFut_10_val_opinion", "NFut_10_val_performance", "NFut_10_val_justice" - )]), na.rm = TRUE) - cat("NFut_global_mean: Calculated =", calc8, "| Stored =", data$NFut_global_mean[i], - "| Match:", isTRUE(all.equal(calc8, data$NFut_global_mean[i])), "\n") - - # Set 9: X5.10_global_mean - calc9 <- mean(as.numeric(data[i, c( - "X5.10past_pref_read", "X5.10past_pref_music", "X5.10past_pref_TV", "X5.10past_pref_nap", "X5.10past_pref_travel", - "X5.10past_pers_extravert", "X5.10past_pers_critical", "X5.10past_pers_dependable", "X5.10past_pers_anxious", "X5.10past_pers_complex", - "X5.10past_val_obey", "X5.10past_val_trad", "X5.10past_val_opinion", "X5.10past_val_performance", "X5.10past_val_justice", - "X5.10fut_pref_read", "X5.10fut_pref_music", "X5.10fut_pref_TV", "X5.10fut_pref_nap", "X5.10fut_pref_travel", - "X5.10fut_pers_extravert", "X5.10fut_pers_critical", "X5.10fut_pers_dependable", "X5.10fut_pers_anxious", "X5.10fut_pers_complex", - "X5.10fut_val_obey", "X5.10fut_val_trad", "X5.10fut_val_opinion", "X5.10fut_val_performance", "X5.10fut_val_justice" - )]), na.rm = TRUE) - cat("X5.10_global_mean: Calculated =", calc9, "| Stored =", data$X5.10_global_mean[i], - "| Match:", isTRUE(all.equal(calc9, data$X5.10_global_mean[i])), "\n") - - # Set 10: N5_global_mean - calc10 <- mean(as.numeric(data[i, c( - "NPast_5_pref_read", "NPast_5_pref_music", "NPast_5_pref_TV", "NPast_5_pref_nap", "NPast_5_pref_travel", - "NPast_5_pers_extravert", "NPast_5_pers_critical", "NPast_5_pers_dependable", "NPast_5_pers_anxious", "NPast_5_pers_complex", - "NPast_5_val_obey", "NPast_5_val_trad", "NPast_5_val_opinion", "NPast_5_val_performance", "NPast_5_val_justice", - "NFut_5_pref_read", "NFut_5_pref_music", "NFut_5_pref_TV", "NFut_5_pref_nap", "NFut_5_pref_travel", - "NFut_5_pers_extravert", "NFut_5_pers_critical", "NFut_5_pers_dependable", "NFut_5_pers_anxious", "NFut_5_pers_complex", - "NFut_5_val_obey", "NFut_5_val_trad", "NFut_5_val_opinion", "NFut_5_val_performance", "NFut_5_val_justice" - )]), na.rm = TRUE) - cat("N5_global_mean: Calculated =", calc10, "| Stored =", data$N5_global_mean[i], - "| Match:", isTRUE(all.equal(calc10, data$N5_global_mean[i])), "\n") - - # Set 11: N10_global_mean - calc11 <- mean(as.numeric(data[i, c( - "NPast_10_pref_read", "NPast_10_pref_music", "NPast_10_pref_TV", "NPast_10_pref_nap", "NPast_10_pref_travel", - "NPast_10_pers_extravert", "NPast_10_pers_critical", "NPast_10_pers_dependable", "NPast_10_pers_anxious", "NPast_10_pers_complex", - "NPast_10_val_obey", "NPast_10_val_trad", "NPast_10_val_opinion", "NPast_10_val_performance", "NPast_10_val_justice", - "NFut_10_pref_read", "NFut_10_pref_music", "NFut_10_pref_TV", "NFut_10_pref_nap", "NFut_10_pref_travel", - "NFut_10_pers_extravert", "NFut_10_pers_critical", "NFut_10_pers_dependable", "NFut_10_pers_anxious", 
"NFut_10_pers_complex", - "NFut_10_val_obey", "NFut_10_val_trad", "NFut_10_val_opinion", "NFut_10_val_performance", "NFut_10_val_justice" - )]), na.rm = TRUE) - cat("N10_global_mean: Calculated =", calc11, "| Stored =", data$N10_global_mean[i], - "| Match:", isTRUE(all.equal(calc11, data$N10_global_mean[i])), "\n\n") -} - -cat("=== QA CHECK COMPLETE ===\n") diff --git a/.history/eohi2/dataP 09 - interval x direction means_20251008113518.r b/.history/eohi2/dataP 09 - interval x direction means_20251008113518.r deleted file mode 100644 index 6bd02b7..0000000 --- a/.history/eohi2/dataP 09 - interval x direction means_20251008113518.r +++ /dev/null @@ -1,223 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load data -data <- read.csv("eohi2.csv") - -# Set 1: NPast_5_mean (15 variables) -data$NPast_5_mean <- rowMeans(data[, c( - "NPast_5_pref_read", "NPast_5_pref_music", "NPast_5_pref_TV", "NPast_5_pref_nap", "NPast_5_pref_travel", - "NPast_5_pers_extravert", "NPast_5_pers_critical", "NPast_5_pers_dependable", "NPast_5_pers_anxious", "NPast_5_pers_complex", - "NPast_5_val_obey", "NPast_5_val_trad", "NPast_5_val_opinion", "NPast_5_val_performance", "NPast_5_val_justice" -)], na.rm = TRUE) - -# Set 2: NPast_10_mean (15 variables) -data$NPast_10_mean <- rowMeans(data[, c( - "NPast_10_pref_read", "NPast_10_pref_music", "NPast_10_pref_TV", "NPast_10_pref_nap", "NPast_10_pref_travel", - "NPast_10_pers_extravert", "NPast_10_pers_critical", "NPast_10_pers_dependable", "NPast_10_pers_anxious", "NPast_10_pers_complex", - "NPast_10_val_obey", "NPast_10_val_trad", "NPast_10_val_opinion", "NPast_10_val_performance", "NPast_10_val_justice" -)], na.rm = TRUE) - -# Set 3: NFut_5_mean (15 variables) -data$NFut_5_mean <- rowMeans(data[, c( - "NFut_5_pref_read", "NFut_5_pref_music", "NFut_5_pref_TV", "NFut_5_pref_nap", "NFut_5_pref_travel", - "NFut_5_pers_extravert", "NFut_5_pers_critical", "NFut_5_pers_dependable", "NFut_5_pers_anxious", "NFut_5_pers_complex", - "NFut_5_val_obey", "NFut_5_val_trad", "NFut_5_val_opinion", "NFut_5_val_performance", "NFut_5_val_justice" -)], na.rm = TRUE) - -# Set 4: NFut_10_mean (15 variables) -data$NFut_10_mean <- rowMeans(data[, c( - "NFut_10_pref_read", "NFut_10_pref_music", "NFut_10_pref_TV", "NFut_10_pref_nap", "NFut_10_pref_travel", - "NFut_10_pers_extravert", "NFut_10_pers_critical", "NFut_10_pers_dependable", "NFut_10_pers_anxious", "NFut_10_pers_complex", - "NFut_10_val_obey", "NFut_10_val_trad", "NFut_10_val_opinion", "NFut_10_val_performance", "NFut_10_val_justice" -)], na.rm = TRUE) - -# Set 5: X5.10past_mean (15 variables) -data$X5.10past_mean <- rowMeans(data[, c( - "X5.10past_pref_read", "X5.10past_pref_music", "X5.10past_pref_TV", "X5.10past_pref_nap", "X5.10past_pref_travel", - "X5.10past_pers_extravert", "X5.10past_pers_critical", "X5.10past_pers_dependable", "X5.10past_pers_anxious", "X5.10past_pers_complex", - "X5.10past_val_obey", "X5.10past_val_trad", "X5.10past_val_opinion", "X5.10past_val_performance", "X5.10past_val_justice" -)], na.rm = TRUE) - -# Set 6: X5.10fut_mean (15 variables) -data$X5.10fut_mean <- rowMeans(data[, c( - "X5.10fut_pref_read", "X5.10fut_pref_music", "X5.10fut_pref_TV", "X5.10fut_pref_nap", "X5.10fut_pref_travel", - "X5.10fut_pers_extravert", "X5.10fut_pers_critical", "X5.10fut_pers_dependable", "X5.10fut_pers_anxious", "X5.10fut_pers_complex", - "X5.10fut_val_obey", "X5.10fut_val_trad", "X5.10fut_val_opinion", "X5.10fut_val_performance", "X5.10fut_val_justice" -)], na.rm = TRUE) - -# Set 7: 
NPast_global_mean (30 variables - NPast_5 + NPast_10) -data$NPast_global_mean <- rowMeans(data[, c( - "NPast_5_pref_read", "NPast_5_pref_music", "NPast_5_pref_TV", "NPast_5_pref_nap", "NPast_5_pref_travel", - "NPast_5_pers_extravert", "NPast_5_pers_critical", "NPast_5_pers_dependable", "NPast_5_pers_anxious", "NPast_5_pers_complex", - "NPast_5_val_obey", "NPast_5_val_trad", "NPast_5_val_opinion", "NPast_5_val_performance", "NPast_5_val_justice", - "NPast_10_pref_read", "NPast_10_pref_music", "NPast_10_pref_TV", "NPast_10_pref_nap", "NPast_10_pref_travel", - "NPast_10_pers_extravert", "NPast_10_pers_critical", "NPast_10_pers_dependable", "NPast_10_pers_anxious", "NPast_10_pers_complex", - "NPast_10_val_obey", "NPast_10_val_trad", "NPast_10_val_opinion", "NPast_10_val_performance", "NPast_10_val_justice" -)], na.rm = TRUE) - -# Set 8: NFut_global_mean (30 variables - NFut_5 + NFut_10) -data$NFut_global_mean <- rowMeans(data[, c( - "NFut_5_pref_read", "NFut_5_pref_music", "NFut_5_pref_TV", "NFut_5_pref_nap", "NFut_5_pref_travel", - "NFut_5_pers_extravert", "NFut_5_pers_critical", "NFut_5_pers_dependable", "NFut_5_pers_anxious", "NFut_5_pers_complex", - "NFut_5_val_obey", "NFut_5_val_trad", "NFut_5_val_opinion", "NFut_5_val_performance", "NFut_5_val_justice", - "NFut_10_pref_read", "NFut_10_pref_music", "NFut_10_pref_TV", "NFut_10_pref_nap", "NFut_10_pref_travel", - "NFut_10_pers_extravert", "NFut_10_pers_critical", "NFut_10_pers_dependable", "NFut_10_pers_anxious", "NFut_10_pers_complex", - "NFut_10_val_obey", "NFut_10_val_trad", "NFut_10_val_opinion", "NFut_10_val_performance", "NFut_10_val_justice" -)], na.rm = TRUE) - -# Set 9: X5.10_global_mean (30 variables - X5.10past + X5.10fut) -data$X5.10_global_mean <- rowMeans(data[, c( - "X5.10past_pref_read", "X5.10past_pref_music", "X5.10past_pref_TV", "X5.10past_pref_nap", "X5.10past_pref_travel", - "X5.10past_pers_extravert", "X5.10past_pers_critical", "X5.10past_pers_dependable", "X5.10past_pers_anxious", "X5.10past_pers_complex", - "X5.10past_val_obey", "X5.10past_val_trad", "X5.10past_val_opinion", "X5.10past_val_performance", "X5.10past_val_justice", - "X5.10fut_pref_read", "X5.10fut_pref_music", "X5.10fut_pref_TV", "X5.10fut_pref_nap", "X5.10fut_pref_travel", - "X5.10fut_pers_extravert", "X5.10fut_pers_critical", "X5.10fut_pers_dependable", "X5.10fut_pers_anxious", "X5.10fut_pers_complex", - "X5.10fut_val_obey", "X5.10fut_val_trad", "X5.10fut_val_opinion", "X5.10fut_val_performance", "X5.10fut_val_justice" -)], na.rm = TRUE) - -# Set 10: N5_global_mean (30 variables - NPast_5 + NFut_5) -data$N5_global_mean <- rowMeans(data[, c( - "NPast_5_pref_read", "NPast_5_pref_music", "NPast_5_pref_TV", "NPast_5_pref_nap", "NPast_5_pref_travel", - "NPast_5_pers_extravert", "NPast_5_pers_critical", "NPast_5_pers_dependable", "NPast_5_pers_anxious", "NPast_5_pers_complex", - "NPast_5_val_obey", "NPast_5_val_trad", "NPast_5_val_opinion", "NPast_5_val_performance", "NPast_5_val_justice", - "NFut_5_pref_read", "NFut_5_pref_music", "NFut_5_pref_TV", "NFut_5_pref_nap", "NFut_5_pref_travel", - "NFut_5_pers_extravert", "NFut_5_pers_critical", "NFut_5_pers_dependable", "NFut_5_pers_anxious", "NFut_5_pers_complex", - "NFut_5_val_obey", "NFut_5_val_trad", "NFut_5_val_opinion", "NFut_5_val_performance", "NFut_5_val_justice" -)], na.rm = TRUE) - -# Set 11: N10_global_mean (30 variables - NPast_10 + NFut_10) -data$N10_global_mean <- rowMeans(data[, c( - "NPast_10_pref_read", "NPast_10_pref_music", "NPast_10_pref_TV", "NPast_10_pref_nap", "NPast_10_pref_travel", - 
"NPast_10_pers_extravert", "NPast_10_pers_critical", "NPast_10_pers_dependable", "NPast_10_pers_anxious", "NPast_10_pers_complex", - "NPast_10_val_obey", "NPast_10_val_trad", "NPast_10_val_opinion", "NPast_10_val_performance", "NPast_10_val_justice", - "NFut_10_pref_read", "NFut_10_pref_music", "NFut_10_pref_TV", "NFut_10_pref_nap", "NFut_10_pref_travel", - "NFut_10_pers_extravert", "NFut_10_pers_critical", "NFut_10_pers_dependable", "NFut_10_pers_anxious", "NFut_10_pers_complex", - "NFut_10_val_obey", "NFut_10_val_trad", "NFut_10_val_opinion", "NFut_10_val_performance", "NFut_10_val_justice" -)], na.rm = TRUE) - -# Save the data -write.csv(data, "eohi2.csv", row.names = FALSE) - -# ===== QA CODE: Check first 5 rows ===== -cat("\n=== QUALITY ASSURANCE: Checking calculations for first 5 rows ===\n\n") - -for (i in 1:min(5, nrow(data))) { - cat("--- Row", i, "---\n") - - # Set 1: NPast_5_mean - calc1 <- mean(as.numeric(data[i, c( - "NPast_5_pref_read", "NPast_5_pref_music", "NPast_5_pref_TV", "NPast_5_pref_nap", "NPast_5_pref_travel", - "NPast_5_pers_extravert", "NPast_5_pers_critical", "NPast_5_pers_dependable", "NPast_5_pers_anxious", "NPast_5_pers_complex", - "NPast_5_val_obey", "NPast_5_val_trad", "NPast_5_val_opinion", "NPast_5_val_performance", "NPast_5_val_justice" - )]), na.rm = TRUE) - cat("NPast_5_mean: Calculated =", calc1, "| Stored =", data$NPast_5_mean[i], - "| Match:", isTRUE(all.equal(calc1, data$NPast_5_mean[i])), "\n") - - # Set 2: NPast_10_mean - calc2 <- mean(as.numeric(data[i, c( - "NPast_10_pref_read", "NPast_10_pref_music", "NPast_10_pref_TV", "NPast_10_pref_nap", "NPast_10_pref_travel", - "NPast_10_pers_extravert", "NPast_10_pers_critical", "NPast_10_pers_dependable", "NPast_10_pers_anxious", "NPast_10_pers_complex", - "NPast_10_val_obey", "NPast_10_val_trad", "NPast_10_val_opinion", "NPast_10_val_performance", "NPast_10_val_justice" - )]), na.rm = TRUE) - cat("NPast_10_mean: Calculated =", calc2, "| Stored =", data$NPast_10_mean[i], - "| Match:", isTRUE(all.equal(calc2, data$NPast_10_mean[i])), "\n") - - # Set 3: NFut_5_mean - calc3 <- mean(as.numeric(data[i, c( - "NFut_5_pref_read", "NFut_5_pref_music", "NFut_5_pref_TV", "NFut_5_pref_nap", "NFut_5_pref_travel", - "NFut_5_pers_extravert", "NFut_5_pers_critical", "NFut_5_pers_dependable", "NFut_5_pers_anxious", "NFut_5_pers_complex", - "NFut_5_val_obey", "NFut_5_val_trad", "NFut_5_val_opinion", "NFut_5_val_performance", "NFut_5_val_justice" - )]), na.rm = TRUE) - cat("NFut_5_mean: Calculated =", calc3, "| Stored =", data$NFut_5_mean[i], - "| Match:", isTRUE(all.equal(calc3, data$NFut_5_mean[i])), "\n") - - # Set 4: NFut_10_mean - calc4 <- mean(as.numeric(data[i, c( - "NFut_10_pref_read", "NFut_10_pref_music", "NFut_10_pref_TV", "NFut_10_pref_nap", "NFut_10_pref_travel", - "NFut_10_pers_extravert", "NFut_10_pers_critical", "NFut_10_pers_dependable", "NFut_10_pers_anxious", "NFut_10_pers_complex", - "NFut_10_val_obey", "NFut_10_val_trad", "NFut_10_val_opinion", "NFut_10_val_performance", "NFut_10_val_justice" - )]), na.rm = TRUE) - cat("NFut_10_mean: Calculated =", calc4, "| Stored =", data$NFut_10_mean[i], - "| Match:", isTRUE(all.equal(calc4, data$NFut_10_mean[i])), "\n") - - # Set 5: X5.10past_mean - calc5 <- mean(as.numeric(data[i, c( - "X5.10past_pref_read", "X5.10past_pref_music", "X5.10past_pref_TV", "X5.10past_pref_nap", "X5.10past_pref_travel", - "X5.10past_pers_extravert", "X5.10past_pers_critical", "X5.10past_pers_dependable", "X5.10past_pers_anxious", "X5.10past_pers_complex", - "X5.10past_val_obey", 
"X5.10past_val_trad", "X5.10past_val_opinion", "X5.10past_val_performance", "X5.10past_val_justice" - )]), na.rm = TRUE) - cat("X5.10past_mean: Calculated =", calc5, "| Stored =", data$X5.10past_mean[i], - "| Match:", isTRUE(all.equal(calc5, data$X5.10past_mean[i])), "\n") - - # Set 6: X5.10fut_mean - calc6 <- mean(as.numeric(data[i, c( - "X5.10fut_pref_read", "X5.10fut_pref_music", "X5.10fut_pref_TV", "X5.10fut_pref_nap", "X5.10fut_pref_travel", - "X5.10fut_pers_extravert", "X5.10fut_pers_critical", "X5.10fut_pers_dependable", "X5.10fut_pers_anxious", "X5.10fut_pers_complex", - "X5.10fut_val_obey", "X5.10fut_val_trad", "X5.10fut_val_opinion", "X5.10fut_val_performance", "X5.10fut_val_justice" - )]), na.rm = TRUE) - cat("X5.10fut_mean: Calculated =", calc6, "| Stored =", data$X5.10fut_mean[i], - "| Match:", isTRUE(all.equal(calc6, data$X5.10fut_mean[i])), "\n") - - # Set 7: NPast_global_mean - calc7 <- mean(as.numeric(data[i, c( - "NPast_5_pref_read", "NPast_5_pref_music", "NPast_5_pref_TV", "NPast_5_pref_nap", "NPast_5_pref_travel", - "NPast_5_pers_extravert", "NPast_5_pers_critical", "NPast_5_pers_dependable", "NPast_5_pers_anxious", "NPast_5_pers_complex", - "NPast_5_val_obey", "NPast_5_val_trad", "NPast_5_val_opinion", "NPast_5_val_performance", "NPast_5_val_justice", - "NPast_10_pref_read", "NPast_10_pref_music", "NPast_10_pref_TV", "NPast_10_pref_nap", "NPast_10_pref_travel", - "NPast_10_pers_extravert", "NPast_10_pers_critical", "NPast_10_pers_dependable", "NPast_10_pers_anxious", "NPast_10_pers_complex", - "NPast_10_val_obey", "NPast_10_val_trad", "NPast_10_val_opinion", "NPast_10_val_performance", "NPast_10_val_justice" - )]), na.rm = TRUE) - cat("NPast_global_mean: Calculated =", calc7, "| Stored =", data$NPast_global_mean[i], - "| Match:", isTRUE(all.equal(calc7, data$NPast_global_mean[i])), "\n") - - # Set 8: NFut_global_mean - calc8 <- mean(as.numeric(data[i, c( - "NFut_5_pref_read", "NFut_5_pref_music", "NFut_5_pref_TV", "NFut_5_pref_nap", "NFut_5_pref_travel", - "NFut_5_pers_extravert", "NFut_5_pers_critical", "NFut_5_pers_dependable", "NFut_5_pers_anxious", "NFut_5_pers_complex", - "NFut_5_val_obey", "NFut_5_val_trad", "NFut_5_val_opinion", "NFut_5_val_performance", "NFut_5_val_justice", - "NFut_10_pref_read", "NFut_10_pref_music", "NFut_10_pref_TV", "NFut_10_pref_nap", "NFut_10_pref_travel", - "NFut_10_pers_extravert", "NFut_10_pers_critical", "NFut_10_pers_dependable", "NFut_10_pers_anxious", "NFut_10_pers_complex", - "NFut_10_val_obey", "NFut_10_val_trad", "NFut_10_val_opinion", "NFut_10_val_performance", "NFut_10_val_justice" - )]), na.rm = TRUE) - cat("NFut_global_mean: Calculated =", calc8, "| Stored =", data$NFut_global_mean[i], - "| Match:", isTRUE(all.equal(calc8, data$NFut_global_mean[i])), "\n") - - # Set 9: X5.10_global_mean - calc9 <- mean(as.numeric(data[i, c( - "X5.10past_pref_read", "X5.10past_pref_music", "X5.10past_pref_TV", "X5.10past_pref_nap", "X5.10past_pref_travel", - "X5.10past_pers_extravert", "X5.10past_pers_critical", "X5.10past_pers_dependable", "X5.10past_pers_anxious", "X5.10past_pers_complex", - "X5.10past_val_obey", "X5.10past_val_trad", "X5.10past_val_opinion", "X5.10past_val_performance", "X5.10past_val_justice", - "X5.10fut_pref_read", "X5.10fut_pref_music", "X5.10fut_pref_TV", "X5.10fut_pref_nap", "X5.10fut_pref_travel", - "X5.10fut_pers_extravert", "X5.10fut_pers_critical", "X5.10fut_pers_dependable", "X5.10fut_pers_anxious", "X5.10fut_pers_complex", - "X5.10fut_val_obey", "X5.10fut_val_trad", "X5.10fut_val_opinion", 
"X5.10fut_val_performance", "X5.10fut_val_justice" - )]), na.rm = TRUE) - cat("X5.10_global_mean: Calculated =", calc9, "| Stored =", data$X5.10_global_mean[i], - "| Match:", isTRUE(all.equal(calc9, data$X5.10_global_mean[i])), "\n") - - # Set 10: N5_global_mean - calc10 <- mean(as.numeric(data[i, c( - "NPast_5_pref_read", "NPast_5_pref_music", "NPast_5_pref_TV", "NPast_5_pref_nap", "NPast_5_pref_travel", - "NPast_5_pers_extravert", "NPast_5_pers_critical", "NPast_5_pers_dependable", "NPast_5_pers_anxious", "NPast_5_pers_complex", - "NPast_5_val_obey", "NPast_5_val_trad", "NPast_5_val_opinion", "NPast_5_val_performance", "NPast_5_val_justice", - "NFut_5_pref_read", "NFut_5_pref_music", "NFut_5_pref_TV", "NFut_5_pref_nap", "NFut_5_pref_travel", - "NFut_5_pers_extravert", "NFut_5_pers_critical", "NFut_5_pers_dependable", "NFut_5_pers_anxious", "NFut_5_pers_complex", - "NFut_5_val_obey", "NFut_5_val_trad", "NFut_5_val_opinion", "NFut_5_val_performance", "NFut_5_val_justice" - )]), na.rm = TRUE) - cat("N5_global_mean: Calculated =", calc10, "| Stored =", data$N5_global_mean[i], - "| Match:", isTRUE(all.equal(calc10, data$N5_global_mean[i])), "\n") - - # Set 11: N10_global_mean - calc11 <- mean(as.numeric(data[i, c( - "NPast_10_pref_read", "NPast_10_pref_music", "NPast_10_pref_TV", "NPast_10_pref_nap", "NPast_10_pref_travel", - "NPast_10_pers_extravert", "NPast_10_pers_critical", "NPast_10_pers_dependable", "NPast_10_pers_anxious", "NPast_10_pers_complex", - "NPast_10_val_obey", "NPast_10_val_trad", "NPast_10_val_opinion", "NPast_10_val_performance", "NPast_10_val_justice", - "NFut_10_pref_read", "NFut_10_pref_music", "NFut_10_pref_TV", "NFut_10_pref_nap", "NFut_10_pref_travel", - "NFut_10_pers_extravert", "NFut_10_pers_critical", "NFut_10_pers_dependable", "NFut_10_pers_anxious", "NFut_10_pers_complex", - "NFut_10_val_obey", "NFut_10_val_trad", "NFut_10_val_opinion", "NFut_10_val_performance", "NFut_10_val_justice" - )]), na.rm = TRUE) - cat("N10_global_mean: Calculated =", calc11, "| Stored =", data$N10_global_mean[i], - "| Match:", isTRUE(all.equal(calc11, data$N10_global_mean[i])), "\n\n") -} - -cat("=== QA CHECK COMPLETE ===\n") diff --git a/.history/eohi2/dataP 09 - interval x direction means_20251008113613.r b/.history/eohi2/dataP 09 - interval x direction means_20251008113613.r deleted file mode 100644 index 6bd02b7..0000000 --- a/.history/eohi2/dataP 09 - interval x direction means_20251008113613.r +++ /dev/null @@ -1,223 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load data -data <- read.csv("eohi2.csv") - -# Set 1: NPast_5_mean (15 variables) -data$NPast_5_mean <- rowMeans(data[, c( - "NPast_5_pref_read", "NPast_5_pref_music", "NPast_5_pref_TV", "NPast_5_pref_nap", "NPast_5_pref_travel", - "NPast_5_pers_extravert", "NPast_5_pers_critical", "NPast_5_pers_dependable", "NPast_5_pers_anxious", "NPast_5_pers_complex", - "NPast_5_val_obey", "NPast_5_val_trad", "NPast_5_val_opinion", "NPast_5_val_performance", "NPast_5_val_justice" -)], na.rm = TRUE) - -# Set 2: NPast_10_mean (15 variables) -data$NPast_10_mean <- rowMeans(data[, c( - "NPast_10_pref_read", "NPast_10_pref_music", "NPast_10_pref_TV", "NPast_10_pref_nap", "NPast_10_pref_travel", - "NPast_10_pers_extravert", "NPast_10_pers_critical", "NPast_10_pers_dependable", "NPast_10_pers_anxious", "NPast_10_pers_complex", - "NPast_10_val_obey", "NPast_10_val_trad", "NPast_10_val_opinion", "NPast_10_val_performance", "NPast_10_val_justice" -)], na.rm = TRUE) - -# Set 3: NFut_5_mean (15 variables) 
-data$NFut_5_mean <- rowMeans(data[, c( - "NFut_5_pref_read", "NFut_5_pref_music", "NFut_5_pref_TV", "NFut_5_pref_nap", "NFut_5_pref_travel", - "NFut_5_pers_extravert", "NFut_5_pers_critical", "NFut_5_pers_dependable", "NFut_5_pers_anxious", "NFut_5_pers_complex", - "NFut_5_val_obey", "NFut_5_val_trad", "NFut_5_val_opinion", "NFut_5_val_performance", "NFut_5_val_justice" -)], na.rm = TRUE) - -# Set 4: NFut_10_mean (15 variables) -data$NFut_10_mean <- rowMeans(data[, c( - "NFut_10_pref_read", "NFut_10_pref_music", "NFut_10_pref_TV", "NFut_10_pref_nap", "NFut_10_pref_travel", - "NFut_10_pers_extravert", "NFut_10_pers_critical", "NFut_10_pers_dependable", "NFut_10_pers_anxious", "NFut_10_pers_complex", - "NFut_10_val_obey", "NFut_10_val_trad", "NFut_10_val_opinion", "NFut_10_val_performance", "NFut_10_val_justice" -)], na.rm = TRUE) - -# Set 5: X5.10past_mean (15 variables) -data$X5.10past_mean <- rowMeans(data[, c( - "X5.10past_pref_read", "X5.10past_pref_music", "X5.10past_pref_TV", "X5.10past_pref_nap", "X5.10past_pref_travel", - "X5.10past_pers_extravert", "X5.10past_pers_critical", "X5.10past_pers_dependable", "X5.10past_pers_anxious", "X5.10past_pers_complex", - "X5.10past_val_obey", "X5.10past_val_trad", "X5.10past_val_opinion", "X5.10past_val_performance", "X5.10past_val_justice" -)], na.rm = TRUE) - -# Set 6: X5.10fut_mean (15 variables) -data$X5.10fut_mean <- rowMeans(data[, c( - "X5.10fut_pref_read", "X5.10fut_pref_music", "X5.10fut_pref_TV", "X5.10fut_pref_nap", "X5.10fut_pref_travel", - "X5.10fut_pers_extravert", "X5.10fut_pers_critical", "X5.10fut_pers_dependable", "X5.10fut_pers_anxious", "X5.10fut_pers_complex", - "X5.10fut_val_obey", "X5.10fut_val_trad", "X5.10fut_val_opinion", "X5.10fut_val_performance", "X5.10fut_val_justice" -)], na.rm = TRUE) - -# Set 7: NPast_global_mean (30 variables - NPast_5 + NPast_10) -data$NPast_global_mean <- rowMeans(data[, c( - "NPast_5_pref_read", "NPast_5_pref_music", "NPast_5_pref_TV", "NPast_5_pref_nap", "NPast_5_pref_travel", - "NPast_5_pers_extravert", "NPast_5_pers_critical", "NPast_5_pers_dependable", "NPast_5_pers_anxious", "NPast_5_pers_complex", - "NPast_5_val_obey", "NPast_5_val_trad", "NPast_5_val_opinion", "NPast_5_val_performance", "NPast_5_val_justice", - "NPast_10_pref_read", "NPast_10_pref_music", "NPast_10_pref_TV", "NPast_10_pref_nap", "NPast_10_pref_travel", - "NPast_10_pers_extravert", "NPast_10_pers_critical", "NPast_10_pers_dependable", "NPast_10_pers_anxious", "NPast_10_pers_complex", - "NPast_10_val_obey", "NPast_10_val_trad", "NPast_10_val_opinion", "NPast_10_val_performance", "NPast_10_val_justice" -)], na.rm = TRUE) - -# Set 8: NFut_global_mean (30 variables - NFut_5 + NFut_10) -data$NFut_global_mean <- rowMeans(data[, c( - "NFut_5_pref_read", "NFut_5_pref_music", "NFut_5_pref_TV", "NFut_5_pref_nap", "NFut_5_pref_travel", - "NFut_5_pers_extravert", "NFut_5_pers_critical", "NFut_5_pers_dependable", "NFut_5_pers_anxious", "NFut_5_pers_complex", - "NFut_5_val_obey", "NFut_5_val_trad", "NFut_5_val_opinion", "NFut_5_val_performance", "NFut_5_val_justice", - "NFut_10_pref_read", "NFut_10_pref_music", "NFut_10_pref_TV", "NFut_10_pref_nap", "NFut_10_pref_travel", - "NFut_10_pers_extravert", "NFut_10_pers_critical", "NFut_10_pers_dependable", "NFut_10_pers_anxious", "NFut_10_pers_complex", - "NFut_10_val_obey", "NFut_10_val_trad", "NFut_10_val_opinion", "NFut_10_val_performance", "NFut_10_val_justice" -)], na.rm = TRUE) - -# Set 9: X5.10_global_mean (30 variables - X5.10past + X5.10fut) -data$X5.10_global_mean <- 
rowMeans(data[, c( - "X5.10past_pref_read", "X5.10past_pref_music", "X5.10past_pref_TV", "X5.10past_pref_nap", "X5.10past_pref_travel", - "X5.10past_pers_extravert", "X5.10past_pers_critical", "X5.10past_pers_dependable", "X5.10past_pers_anxious", "X5.10past_pers_complex", - "X5.10past_val_obey", "X5.10past_val_trad", "X5.10past_val_opinion", "X5.10past_val_performance", "X5.10past_val_justice", - "X5.10fut_pref_read", "X5.10fut_pref_music", "X5.10fut_pref_TV", "X5.10fut_pref_nap", "X5.10fut_pref_travel", - "X5.10fut_pers_extravert", "X5.10fut_pers_critical", "X5.10fut_pers_dependable", "X5.10fut_pers_anxious", "X5.10fut_pers_complex", - "X5.10fut_val_obey", "X5.10fut_val_trad", "X5.10fut_val_opinion", "X5.10fut_val_performance", "X5.10fut_val_justice" -)], na.rm = TRUE) - -# Set 10: N5_global_mean (30 variables - NPast_5 + NFut_5) -data$N5_global_mean <- rowMeans(data[, c( - "NPast_5_pref_read", "NPast_5_pref_music", "NPast_5_pref_TV", "NPast_5_pref_nap", "NPast_5_pref_travel", - "NPast_5_pers_extravert", "NPast_5_pers_critical", "NPast_5_pers_dependable", "NPast_5_pers_anxious", "NPast_5_pers_complex", - "NPast_5_val_obey", "NPast_5_val_trad", "NPast_5_val_opinion", "NPast_5_val_performance", "NPast_5_val_justice", - "NFut_5_pref_read", "NFut_5_pref_music", "NFut_5_pref_TV", "NFut_5_pref_nap", "NFut_5_pref_travel", - "NFut_5_pers_extravert", "NFut_5_pers_critical", "NFut_5_pers_dependable", "NFut_5_pers_anxious", "NFut_5_pers_complex", - "NFut_5_val_obey", "NFut_5_val_trad", "NFut_5_val_opinion", "NFut_5_val_performance", "NFut_5_val_justice" -)], na.rm = TRUE) - -# Set 11: N10_global_mean (30 variables - NPast_10 + NFut_10) -data$N10_global_mean <- rowMeans(data[, c( - "NPast_10_pref_read", "NPast_10_pref_music", "NPast_10_pref_TV", "NPast_10_pref_nap", "NPast_10_pref_travel", - "NPast_10_pers_extravert", "NPast_10_pers_critical", "NPast_10_pers_dependable", "NPast_10_pers_anxious", "NPast_10_pers_complex", - "NPast_10_val_obey", "NPast_10_val_trad", "NPast_10_val_opinion", "NPast_10_val_performance", "NPast_10_val_justice", - "NFut_10_pref_read", "NFut_10_pref_music", "NFut_10_pref_TV", "NFut_10_pref_nap", "NFut_10_pref_travel", - "NFut_10_pers_extravert", "NFut_10_pers_critical", "NFut_10_pers_dependable", "NFut_10_pers_anxious", "NFut_10_pers_complex", - "NFut_10_val_obey", "NFut_10_val_trad", "NFut_10_val_opinion", "NFut_10_val_performance", "NFut_10_val_justice" -)], na.rm = TRUE) - -# Save the data -write.csv(data, "eohi2.csv", row.names = FALSE) - -# ===== QA CODE: Check first 5 rows ===== -cat("\n=== QUALITY ASSURANCE: Checking calculations for first 5 rows ===\n\n") - -for (i in 1:min(5, nrow(data))) { - cat("--- Row", i, "---\n") - - # Set 1: NPast_5_mean - calc1 <- mean(as.numeric(data[i, c( - "NPast_5_pref_read", "NPast_5_pref_music", "NPast_5_pref_TV", "NPast_5_pref_nap", "NPast_5_pref_travel", - "NPast_5_pers_extravert", "NPast_5_pers_critical", "NPast_5_pers_dependable", "NPast_5_pers_anxious", "NPast_5_pers_complex", - "NPast_5_val_obey", "NPast_5_val_trad", "NPast_5_val_opinion", "NPast_5_val_performance", "NPast_5_val_justice" - )]), na.rm = TRUE) - cat("NPast_5_mean: Calculated =", calc1, "| Stored =", data$NPast_5_mean[i], - "| Match:", isTRUE(all.equal(calc1, data$NPast_5_mean[i])), "\n") - - # Set 2: NPast_10_mean - calc2 <- mean(as.numeric(data[i, c( - "NPast_10_pref_read", "NPast_10_pref_music", "NPast_10_pref_TV", "NPast_10_pref_nap", "NPast_10_pref_travel", - "NPast_10_pers_extravert", "NPast_10_pers_critical", "NPast_10_pers_dependable", 
"NPast_10_pers_anxious", "NPast_10_pers_complex", - "NPast_10_val_obey", "NPast_10_val_trad", "NPast_10_val_opinion", "NPast_10_val_performance", "NPast_10_val_justice" - )]), na.rm = TRUE) - cat("NPast_10_mean: Calculated =", calc2, "| Stored =", data$NPast_10_mean[i], - "| Match:", isTRUE(all.equal(calc2, data$NPast_10_mean[i])), "\n") - - # Set 3: NFut_5_mean - calc3 <- mean(as.numeric(data[i, c( - "NFut_5_pref_read", "NFut_5_pref_music", "NFut_5_pref_TV", "NFut_5_pref_nap", "NFut_5_pref_travel", - "NFut_5_pers_extravert", "NFut_5_pers_critical", "NFut_5_pers_dependable", "NFut_5_pers_anxious", "NFut_5_pers_complex", - "NFut_5_val_obey", "NFut_5_val_trad", "NFut_5_val_opinion", "NFut_5_val_performance", "NFut_5_val_justice" - )]), na.rm = TRUE) - cat("NFut_5_mean: Calculated =", calc3, "| Stored =", data$NFut_5_mean[i], - "| Match:", isTRUE(all.equal(calc3, data$NFut_5_mean[i])), "\n") - - # Set 4: NFut_10_mean - calc4 <- mean(as.numeric(data[i, c( - "NFut_10_pref_read", "NFut_10_pref_music", "NFut_10_pref_TV", "NFut_10_pref_nap", "NFut_10_pref_travel", - "NFut_10_pers_extravert", "NFut_10_pers_critical", "NFut_10_pers_dependable", "NFut_10_pers_anxious", "NFut_10_pers_complex", - "NFut_10_val_obey", "NFut_10_val_trad", "NFut_10_val_opinion", "NFut_10_val_performance", "NFut_10_val_justice" - )]), na.rm = TRUE) - cat("NFut_10_mean: Calculated =", calc4, "| Stored =", data$NFut_10_mean[i], - "| Match:", isTRUE(all.equal(calc4, data$NFut_10_mean[i])), "\n") - - # Set 5: X5.10past_mean - calc5 <- mean(as.numeric(data[i, c( - "X5.10past_pref_read", "X5.10past_pref_music", "X5.10past_pref_TV", "X5.10past_pref_nap", "X5.10past_pref_travel", - "X5.10past_pers_extravert", "X5.10past_pers_critical", "X5.10past_pers_dependable", "X5.10past_pers_anxious", "X5.10past_pers_complex", - "X5.10past_val_obey", "X5.10past_val_trad", "X5.10past_val_opinion", "X5.10past_val_performance", "X5.10past_val_justice" - )]), na.rm = TRUE) - cat("X5.10past_mean: Calculated =", calc5, "| Stored =", data$X5.10past_mean[i], - "| Match:", isTRUE(all.equal(calc5, data$X5.10past_mean[i])), "\n") - - # Set 6: X5.10fut_mean - calc6 <- mean(as.numeric(data[i, c( - "X5.10fut_pref_read", "X5.10fut_pref_music", "X5.10fut_pref_TV", "X5.10fut_pref_nap", "X5.10fut_pref_travel", - "X5.10fut_pers_extravert", "X5.10fut_pers_critical", "X5.10fut_pers_dependable", "X5.10fut_pers_anxious", "X5.10fut_pers_complex", - "X5.10fut_val_obey", "X5.10fut_val_trad", "X5.10fut_val_opinion", "X5.10fut_val_performance", "X5.10fut_val_justice" - )]), na.rm = TRUE) - cat("X5.10fut_mean: Calculated =", calc6, "| Stored =", data$X5.10fut_mean[i], - "| Match:", isTRUE(all.equal(calc6, data$X5.10fut_mean[i])), "\n") - - # Set 7: NPast_global_mean - calc7 <- mean(as.numeric(data[i, c( - "NPast_5_pref_read", "NPast_5_pref_music", "NPast_5_pref_TV", "NPast_5_pref_nap", "NPast_5_pref_travel", - "NPast_5_pers_extravert", "NPast_5_pers_critical", "NPast_5_pers_dependable", "NPast_5_pers_anxious", "NPast_5_pers_complex", - "NPast_5_val_obey", "NPast_5_val_trad", "NPast_5_val_opinion", "NPast_5_val_performance", "NPast_5_val_justice", - "NPast_10_pref_read", "NPast_10_pref_music", "NPast_10_pref_TV", "NPast_10_pref_nap", "NPast_10_pref_travel", - "NPast_10_pers_extravert", "NPast_10_pers_critical", "NPast_10_pers_dependable", "NPast_10_pers_anxious", "NPast_10_pers_complex", - "NPast_10_val_obey", "NPast_10_val_trad", "NPast_10_val_opinion", "NPast_10_val_performance", "NPast_10_val_justice" - )]), na.rm = TRUE) - cat("NPast_global_mean: Calculated =", calc7, 
"| Stored =", data$NPast_global_mean[i], - "| Match:", isTRUE(all.equal(calc7, data$NPast_global_mean[i])), "\n") - - # Set 8: NFut_global_mean - calc8 <- mean(as.numeric(data[i, c( - "NFut_5_pref_read", "NFut_5_pref_music", "NFut_5_pref_TV", "NFut_5_pref_nap", "NFut_5_pref_travel", - "NFut_5_pers_extravert", "NFut_5_pers_critical", "NFut_5_pers_dependable", "NFut_5_pers_anxious", "NFut_5_pers_complex", - "NFut_5_val_obey", "NFut_5_val_trad", "NFut_5_val_opinion", "NFut_5_val_performance", "NFut_5_val_justice", - "NFut_10_pref_read", "NFut_10_pref_music", "NFut_10_pref_TV", "NFut_10_pref_nap", "NFut_10_pref_travel", - "NFut_10_pers_extravert", "NFut_10_pers_critical", "NFut_10_pers_dependable", "NFut_10_pers_anxious", "NFut_10_pers_complex", - "NFut_10_val_obey", "NFut_10_val_trad", "NFut_10_val_opinion", "NFut_10_val_performance", "NFut_10_val_justice" - )]), na.rm = TRUE) - cat("NFut_global_mean: Calculated =", calc8, "| Stored =", data$NFut_global_mean[i], - "| Match:", isTRUE(all.equal(calc8, data$NFut_global_mean[i])), "\n") - - # Set 9: X5.10_global_mean - calc9 <- mean(as.numeric(data[i, c( - "X5.10past_pref_read", "X5.10past_pref_music", "X5.10past_pref_TV", "X5.10past_pref_nap", "X5.10past_pref_travel", - "X5.10past_pers_extravert", "X5.10past_pers_critical", "X5.10past_pers_dependable", "X5.10past_pers_anxious", "X5.10past_pers_complex", - "X5.10past_val_obey", "X5.10past_val_trad", "X5.10past_val_opinion", "X5.10past_val_performance", "X5.10past_val_justice", - "X5.10fut_pref_read", "X5.10fut_pref_music", "X5.10fut_pref_TV", "X5.10fut_pref_nap", "X5.10fut_pref_travel", - "X5.10fut_pers_extravert", "X5.10fut_pers_critical", "X5.10fut_pers_dependable", "X5.10fut_pers_anxious", "X5.10fut_pers_complex", - "X5.10fut_val_obey", "X5.10fut_val_trad", "X5.10fut_val_opinion", "X5.10fut_val_performance", "X5.10fut_val_justice" - )]), na.rm = TRUE) - cat("X5.10_global_mean: Calculated =", calc9, "| Stored =", data$X5.10_global_mean[i], - "| Match:", isTRUE(all.equal(calc9, data$X5.10_global_mean[i])), "\n") - - # Set 10: N5_global_mean - calc10 <- mean(as.numeric(data[i, c( - "NPast_5_pref_read", "NPast_5_pref_music", "NPast_5_pref_TV", "NPast_5_pref_nap", "NPast_5_pref_travel", - "NPast_5_pers_extravert", "NPast_5_pers_critical", "NPast_5_pers_dependable", "NPast_5_pers_anxious", "NPast_5_pers_complex", - "NPast_5_val_obey", "NPast_5_val_trad", "NPast_5_val_opinion", "NPast_5_val_performance", "NPast_5_val_justice", - "NFut_5_pref_read", "NFut_5_pref_music", "NFut_5_pref_TV", "NFut_5_pref_nap", "NFut_5_pref_travel", - "NFut_5_pers_extravert", "NFut_5_pers_critical", "NFut_5_pers_dependable", "NFut_5_pers_anxious", "NFut_5_pers_complex", - "NFut_5_val_obey", "NFut_5_val_trad", "NFut_5_val_opinion", "NFut_5_val_performance", "NFut_5_val_justice" - )]), na.rm = TRUE) - cat("N5_global_mean: Calculated =", calc10, "| Stored =", data$N5_global_mean[i], - "| Match:", isTRUE(all.equal(calc10, data$N5_global_mean[i])), "\n") - - # Set 11: N10_global_mean - calc11 <- mean(as.numeric(data[i, c( - "NPast_10_pref_read", "NPast_10_pref_music", "NPast_10_pref_TV", "NPast_10_pref_nap", "NPast_10_pref_travel", - "NPast_10_pers_extravert", "NPast_10_pers_critical", "NPast_10_pers_dependable", "NPast_10_pers_anxious", "NPast_10_pers_complex", - "NPast_10_val_obey", "NPast_10_val_trad", "NPast_10_val_opinion", "NPast_10_val_performance", "NPast_10_val_justice", - "NFut_10_pref_read", "NFut_10_pref_music", "NFut_10_pref_TV", "NFut_10_pref_nap", "NFut_10_pref_travel", - "NFut_10_pers_extravert", 
"NFut_10_pers_critical", "NFut_10_pers_dependable", "NFut_10_pers_anxious", "NFut_10_pers_complex", - "NFut_10_val_obey", "NFut_10_val_trad", "NFut_10_val_opinion", "NFut_10_val_performance", "NFut_10_val_justice" - )]), na.rm = TRUE) - cat("N10_global_mean: Calculated =", calc11, "| Stored =", data$N10_global_mean[i], - "| Match:", isTRUE(all.equal(calc11, data$N10_global_mean[i])), "\n\n") -} - -cat("=== QA CHECK COMPLETE ===\n") diff --git a/.history/eohi2/dataP 10 - DGEN mean vars_20251008121818.r b/.history/eohi2/dataP 10 - DGEN mean vars_20251008121818.r deleted file mode 100644 index 2e5647b..0000000 --- a/.history/eohi2/dataP 10 - DGEN mean vars_20251008121818.r +++ /dev/null @@ -1,115 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load data -data <- read.csv("eohi2.csv") - -# Set 1: DGEN_past_5.10_mean (3 variables) -data$DGEN_past_5.10_mean <- rowMeans(data[, c( - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val" -)], na.rm = TRUE) - -# Set 2: DGEN_fut_5.10_mean (3 variables) -data$DGEN_fut_5.10_mean <- rowMeans(data[, c( - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val" -)], na.rm = TRUE) - -# Set 3: DGENpast_global_mean (9 variables) -data$DGENpast_global_mean <- rowMeans(data[, c( - "DGEN_past_5_Pref", "DGEN_past_10_Pref", "X5_10DGEN_past_pref", - "DGEN_past_5_Pers", "DGEN_past_10_Pers", "X5_10DGEN_past_pers", - "DGEN_past_5_Val", "DGEN_past_10_Val", "X5_10DGEN_past_val" -)], na.rm = TRUE) - -# Set 4: DGENfut_global_mean (9 variables) -data$DGENfut_global_mean <- rowMeans(data[, c( - "DGEN_fut_5_Pref", "DGEN_fut_10_Pref", "X5_10DGEN_fut_pref", - "DGEN_fut_5_Pers", "DGEN_fut_10_Pers", "X5_10DGEN_fut_pers", - "DGEN_fut_5_Val", "DGEN_fut_10_Val", "X5_10DGEN_fut_val" -)], na.rm = TRUE) - -# Set 5: DGEN_5_global_mean (6 variables) -data$DGEN_5_global_mean <- rowMeans(data[, c( - "DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val" -)], na.rm = TRUE) - -# Set 6: DGEN_10_global_mean (6 variables) -data$DGEN_10_global_mean <- rowMeans(data[, c( - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val" -)], na.rm = TRUE) - -# Set 7: DGEN_5.10_global_mean (6 variables) -data$DGEN_5.10_global_mean <- rowMeans(data[, c( - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val" -)], na.rm = TRUE) - -# Save the data -write.csv(data, "eohi2.csv", row.names = FALSE) - -# ===== QA CODE: Check first 5 rows ===== -cat("\n=== QUALITY ASSURANCE: Checking calculations for first 5 rows ===\n\n") - -for (i in 1:min(5, nrow(data))) { - cat("--- Row", i, "---\n") - - # Set 1: DGEN_past_5.10_mean - calc1 <- mean(as.numeric(data[i, c( - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val" - )]), na.rm = TRUE) - cat("DGEN_past_5.10_mean: Calculated =", calc1, "| Stored =", data$DGEN_past_5.10_mean[i], - "| Match:", isTRUE(all.equal(calc1, data$DGEN_past_5.10_mean[i])), "\n") - - # Set 2: DGEN_fut_5.10_mean - calc2 <- mean(as.numeric(data[i, c( - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val" - )]), na.rm = TRUE) - cat("DGEN_fut_5.10_mean: Calculated =", calc2, "| Stored =", data$DGEN_fut_5.10_mean[i], - "| Match:", isTRUE(all.equal(calc2, data$DGEN_fut_5.10_mean[i])), "\n") - - # Set 3: DGENpast_global_mean - calc3 <- mean(as.numeric(data[i, c( - "DGEN_past_5_Pref", "DGEN_past_10_Pref", 
"X5_10DGEN_past_pref", - "DGEN_past_5_Pers", "DGEN_past_10_Pers", "X5_10DGEN_past_pers", - "DGEN_past_5_Val", "DGEN_past_10_Val", "X5_10DGEN_past_val" - )]), na.rm = TRUE) - cat("DGENpast_global_mean: Calculated =", calc3, "| Stored =", data$DGENpast_global_mean[i], - "| Match:", isTRUE(all.equal(calc3, data$DGENpast_global_mean[i])), "\n") - - # Set 4: DGENfut_global_mean - calc4 <- mean(as.numeric(data[i, c( - "DGEN_fut_5_Pref", "DGEN_fut_10_Pref", "X5_10DGEN_fut_pref", - "DGEN_fut_5_Pers", "DGEN_fut_10_Pers", "X5_10DGEN_fut_pers", - "DGEN_fut_5_Val", "DGEN_fut_10_Val", "X5_10DGEN_fut_val" - )]), na.rm = TRUE) - cat("DGENfut_global_mean: Calculated =", calc4, "| Stored =", data$DGENfut_global_mean[i], - "| Match:", isTRUE(all.equal(calc4, data$DGENfut_global_mean[i])), "\n") - - # Set 5: DGEN_5_global_mean - calc5 <- mean(as.numeric(data[i, c( - "DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val" - )]), na.rm = TRUE) - cat("DGEN_5_global_mean: Calculated =", calc5, "| Stored =", data$DGEN_5_global_mean[i], - "| Match:", isTRUE(all.equal(calc5, data$DGEN_5_global_mean[i])), "\n") - - # Set 6: DGEN_10_global_mean - calc6 <- mean(as.numeric(data[i, c( - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val" - )]), na.rm = TRUE) - cat("DGEN_10_global_mean: Calculated =", calc6, "| Stored =", data$DGEN_10_global_mean[i], - "| Match:", isTRUE(all.equal(calc6, data$DGEN_10_global_mean[i])), "\n") - - # Set 7: DGEN_5.10_global_mean - calc7 <- mean(as.numeric(data[i, c( - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val" - )]), na.rm = TRUE) - cat("DGEN_5.10_global_mean: Calculated =", calc7, "| Stored =", data$DGEN_5.10_global_mean[i], - "| Match:", isTRUE(all.equal(calc7, data$DGEN_5.10_global_mean[i])), "\n\n") -} - -cat("=== QA CHECK COMPLETE ===\n") diff --git a/.history/eohi2/dataP 10 - DGEN mean vars_20251008121822.r b/.history/eohi2/dataP 10 - DGEN mean vars_20251008121822.r deleted file mode 100644 index 2e5647b..0000000 --- a/.history/eohi2/dataP 10 - DGEN mean vars_20251008121822.r +++ /dev/null @@ -1,115 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load data -data <- read.csv("eohi2.csv") - -# Set 1: DGEN_past_5.10_mean (3 variables) -data$DGEN_past_5.10_mean <- rowMeans(data[, c( - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val" -)], na.rm = TRUE) - -# Set 2: DGEN_fut_5.10_mean (3 variables) -data$DGEN_fut_5.10_mean <- rowMeans(data[, c( - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val" -)], na.rm = TRUE) - -# Set 3: DGENpast_global_mean (9 variables) -data$DGENpast_global_mean <- rowMeans(data[, c( - "DGEN_past_5_Pref", "DGEN_past_10_Pref", "X5_10DGEN_past_pref", - "DGEN_past_5_Pers", "DGEN_past_10_Pers", "X5_10DGEN_past_pers", - "DGEN_past_5_Val", "DGEN_past_10_Val", "X5_10DGEN_past_val" -)], na.rm = TRUE) - -# Set 4: DGENfut_global_mean (9 variables) -data$DGENfut_global_mean <- rowMeans(data[, c( - "DGEN_fut_5_Pref", "DGEN_fut_10_Pref", "X5_10DGEN_fut_pref", - "DGEN_fut_5_Pers", "DGEN_fut_10_Pers", "X5_10DGEN_fut_pers", - "DGEN_fut_5_Val", "DGEN_fut_10_Val", "X5_10DGEN_fut_val" -)], na.rm = TRUE) - -# Set 5: DGEN_5_global_mean (6 variables) -data$DGEN_5_global_mean <- rowMeans(data[, c( - "DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", 
"DGEN_fut_5_Val" -)], na.rm = TRUE) - -# Set 6: DGEN_10_global_mean (6 variables) -data$DGEN_10_global_mean <- rowMeans(data[, c( - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val" -)], na.rm = TRUE) - -# Set 7: DGEN_5.10_global_mean (6 variables) -data$DGEN_5.10_global_mean <- rowMeans(data[, c( - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val" -)], na.rm = TRUE) - -# Save the data -write.csv(data, "eohi2.csv", row.names = FALSE) - -# ===== QA CODE: Check first 5 rows ===== -cat("\n=== QUALITY ASSURANCE: Checking calculations for first 5 rows ===\n\n") - -for (i in 1:min(5, nrow(data))) { - cat("--- Row", i, "---\n") - - # Set 1: DGEN_past_5.10_mean - calc1 <- mean(as.numeric(data[i, c( - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val" - )]), na.rm = TRUE) - cat("DGEN_past_5.10_mean: Calculated =", calc1, "| Stored =", data$DGEN_past_5.10_mean[i], - "| Match:", isTRUE(all.equal(calc1, data$DGEN_past_5.10_mean[i])), "\n") - - # Set 2: DGEN_fut_5.10_mean - calc2 <- mean(as.numeric(data[i, c( - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val" - )]), na.rm = TRUE) - cat("DGEN_fut_5.10_mean: Calculated =", calc2, "| Stored =", data$DGEN_fut_5.10_mean[i], - "| Match:", isTRUE(all.equal(calc2, data$DGEN_fut_5.10_mean[i])), "\n") - - # Set 3: DGENpast_global_mean - calc3 <- mean(as.numeric(data[i, c( - "DGEN_past_5_Pref", "DGEN_past_10_Pref", "X5_10DGEN_past_pref", - "DGEN_past_5_Pers", "DGEN_past_10_Pers", "X5_10DGEN_past_pers", - "DGEN_past_5_Val", "DGEN_past_10_Val", "X5_10DGEN_past_val" - )]), na.rm = TRUE) - cat("DGENpast_global_mean: Calculated =", calc3, "| Stored =", data$DGENpast_global_mean[i], - "| Match:", isTRUE(all.equal(calc3, data$DGENpast_global_mean[i])), "\n") - - # Set 4: DGENfut_global_mean - calc4 <- mean(as.numeric(data[i, c( - "DGEN_fut_5_Pref", "DGEN_fut_10_Pref", "X5_10DGEN_fut_pref", - "DGEN_fut_5_Pers", "DGEN_fut_10_Pers", "X5_10DGEN_fut_pers", - "DGEN_fut_5_Val", "DGEN_fut_10_Val", "X5_10DGEN_fut_val" - )]), na.rm = TRUE) - cat("DGENfut_global_mean: Calculated =", calc4, "| Stored =", data$DGENfut_global_mean[i], - "| Match:", isTRUE(all.equal(calc4, data$DGENfut_global_mean[i])), "\n") - - # Set 5: DGEN_5_global_mean - calc5 <- mean(as.numeric(data[i, c( - "DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val" - )]), na.rm = TRUE) - cat("DGEN_5_global_mean: Calculated =", calc5, "| Stored =", data$DGEN_5_global_mean[i], - "| Match:", isTRUE(all.equal(calc5, data$DGEN_5_global_mean[i])), "\n") - - # Set 6: DGEN_10_global_mean - calc6 <- mean(as.numeric(data[i, c( - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val" - )]), na.rm = TRUE) - cat("DGEN_10_global_mean: Calculated =", calc6, "| Stored =", data$DGEN_10_global_mean[i], - "| Match:", isTRUE(all.equal(calc6, data$DGEN_10_global_mean[i])), "\n") - - # Set 7: DGEN_5.10_global_mean - calc7 <- mean(as.numeric(data[i, c( - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val" - )]), na.rm = TRUE) - cat("DGEN_5.10_global_mean: Calculated =", calc7, "| Stored =", data$DGEN_5.10_global_mean[i], - "| Match:", isTRUE(all.equal(calc7, data$DGEN_5.10_global_mean[i])), "\n\n") -} - -cat("=== QA CHECK COMPLETE ===\n") diff --git 
diff --git a/.history/eohi2/dataP 10 - DGEN mean vars_20251008121849.r b/.history/eohi2/dataP 10 - DGEN mean vars_20251008121849.r
deleted file mode 100644
index 2e5647b..0000000
diff --git a/.history/eohi2/dataP 11 - CORRECT ehi vars_20251008152253.r b/.history/eohi2/dataP 11 - CORRECT ehi vars_20251008152253.r
deleted file mode 100644
index e69de29..0000000
diff --git a/.history/eohi2/dataP 11 - CORRECT ehi vars_20251008163033.r b/.history/eohi2/dataP 11 - CORRECT ehi vars_20251008163033.r
deleted file mode 100644
index e69de29..0000000
diff --git a/.history/eohi2/dataP 11 - CORRECT ehi vars_20251008163045.r b/.history/eohi2/dataP 11 - CORRECT ehi vars_20251008163045.r
deleted file mode 100644
index 4e6df17..0000000
--- a/.history/eohi2/dataP 11 - CORRECT ehi vars_20251008163045.r
+++ /dev/null
@@ -1,5 +0,0 @@
-options(scipen = 999)
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-# Load data
-data <- read.csv("eohi2.csv")
\ No newline at end of file
diff --git a/.history/eohi2/dataP 11 - CORRECT ehi vars_20251008163815.r b/.history/eohi2/dataP 11 - CORRECT ehi vars_20251008163815.r
deleted file mode 100644
index 96e4994..0000000
--- a/.history/eohi2/dataP 11 - CORRECT ehi vars_20251008163815.r
+++ /dev/null
@@ -1,235 +0,0 @@
-options(scipen = 999)
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-# Load data
-data <- read.csv("eohi2.csv")
-
-# Create EHI difference variables (NPast - NFut) for different time intervals
-
-# === 5-YEAR PAST-FUTURE PAIRS ===
-# Preferences
-data$ehi5_pref_read <- data$NPast_5_pref_read - data$NFut_5_pref_read
-data$ehi5_pref_music <- data$NPast_5_pref_music - data$NFut_5_pref_music
-data$ehi5_pref_TV <- data$NPast_5_pref_TV - data$NFut_5_pref_TV
-data$ehi5_pref_nap <- data$NPast_5_pref_nap - data$NFut_5_pref_nap
-data$ehi5_pref_travel <- data$NPast_5_pref_travel - data$NFut_5_pref_travel
-
-# Personality
-data$ehi5_pers_extravert <- data$NPast_5_pers_extravert - data$NFut_5_pers_extravert
-data$ehi5_pers_critical <- data$NPast_5_pers_critical - data$NFut_5_pers_critical
-data$ehi5_pers_dependable <- data$NPast_5_pers_dependable - data$NFut_5_pers_dependable
-data$ehi5_pers_anxious <- data$NPast_5_pers_anxious -
data$NFut_5_pers_anxious -data$ehi5_pers_complex <- data$NPast_5_pers_complex - data$NFut_5_pers_complex - -# Values -data$ehi5_val_obey <- data$NPast_5_val_obey - data$NFut_5_val_obey -data$ehi5_val_trad <- data$NPast_5_val_trad - data$NFut_5_val_trad -data$ehi5_val_opinion <- data$NPast_5_val_opinion - data$NFut_5_val_opinion -data$ehi5_val_performance <- data$NPast_5_val_performance - data$NFut_5_val_performance -data$ehi5_val_justice <- data$NPast_5_val_justice - data$NFut_5_val_justice - -# === 10-YEAR PAST-FUTURE PAIRS === -# Preferences -data$ehi10_pref_read <- data$NPast_10_pref_read - data$NFut_10_pref_read -data$ehi10_pref_music <- data$NPast_10_pref_music - data$NFut_10_pref_music -data$ehi10_pref_TV <- data$NPast_10_pref_TV - data$NFut_10_pref_TV -data$ehi10_pref_nap <- data$NPast_10_pref_nap - data$NFut_10_pref_nap -data$ehi10_pref_travel <- data$NPast_10_pref_travel - data$NFut_10_pref_travel - -# Personality -data$ehi10_pers_extravert <- data$NPast_10_pers_extravert - data$NFut_10_pers_extravert -data$ehi10_pers_critical <- data$NPast_10_pers_critical - data$NFut_10_pers_critical -data$ehi10_pers_dependable <- data$NPast_10_pers_dependable - data$NFut_10_pers_dependable -data$ehi10_pers_anxious <- data$NPast_10_pers_anxious - data$NFut_10_pers_anxious -data$ehi10_pers_complex <- data$NPast_10_pers_complex - data$NFut_10_pers_complex - -# Values -data$ehi10_val_obey <- data$NPast_10_val_obey - data$NFut_10_val_obey -data$ehi10_val_trad <- data$NPast_10_val_trad - data$NFut_10_val_trad -data$ehi10_val_opinion <- data$NPast_10_val_opinion - data$NFut_10_val_opinion -data$ehi10_val_performance <- data$NPast_10_val_performance - data$NFut_10_val_performance -data$ehi10_val_justice <- data$NPast_10_val_justice - data$NFut_10_val_justice - -# === 5-10 YEAR CHANGE VARIABLES === -# Preferences -data$ehi5.10_pref_read <- data$X5.10past_pref_read - data$X5.10fut_pref_read -data$ehi5.10_pref_music <- data$X5.10past_pref_music - data$X5.10fut_pref_music -data$ehi5.10_pref_TV <- data$X5.10past_pref_TV - data$X5.10fut_pref_TV -data$ehi5.10_pref_nap <- data$X5.10past_pref_nap - data$X5.10fut_pref_nap -data$ehi5.10_pref_travel <- data$X5.10past_pref_travel - data$X5.10fut_pref_travel - -# Personality -data$ehi5.10_pers_extravert <- data$X5.10past_pers_extravert - data$X5.10fut_pers_extravert -data$ehi5.10_pers_critical <- data$X5.10past_pers_critical - data$X5.10fut_pers_critical -data$ehi5.10_pers_dependable <- data$X5.10past_pers_dependable - data$X5.10fut_pers_dependable -data$ehi5.10_pers_anxious <- data$X5.10past_pers_anxious - data$X5.10fut_pers_anxious -data$ehi5.10_pers_complex <- data$X5.10past_pers_complex - data$X5.10fut_pers_complex - -# Values -data$ehi5.10_val_obey <- data$X5.10past_val_obey - data$X5.10fut_val_obey -data$ehi5.10_val_trad <- data$X5.10past_val_trad - data$X5.10fut_val_trad -data$ehi5.10_val_opinion <- data$X5.10past_val_opinion - data$X5.10fut_val_opinion -data$ehi5.10_val_performance <- data$X5.10past_val_performance - data$X5.10fut_val_performance -data$ehi5.10_val_justice <- data$X5.10past_val_justice - data$X5.10fut_val_justice - -# QA: Verify calculations - FIRST 5 ROWS with detailed output -cat("\n=== QUALITY ASSURANCE CHECK - FIRST 5 ROWS ===\n\n") - -cat("--- 5-YEAR VARIABLES ---\n") -for (i in 1:5) { - cat(sprintf("\nRow %d:\n", i)) - cat(sprintf(" pref_read: %g - %g = %g | ehi5_pref_read = %g %s\n", - data$NPast_5_pref_read[i], data$NFut_5_pref_read[i], - data$NPast_5_pref_read[i] - data$NFut_5_pref_read[i], - data$ehi5_pref_read[i], - 
ifelse(abs((data$NPast_5_pref_read[i] - data$NFut_5_pref_read[i]) - data$ehi5_pref_read[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" pref_music: %g - %g = %g | ehi5_pref_music = %g %s\n", - data$NPast_5_pref_music[i], data$NFut_5_pref_music[i], - data$NPast_5_pref_music[i] - data$NFut_5_pref_music[i], - data$ehi5_pref_music[i], - ifelse(abs((data$NPast_5_pref_music[i] - data$NFut_5_pref_music[i]) - data$ehi5_pref_music[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" pers_extravert: %g - %g = %g | ehi5_pers_extravert = %g %s\n", - data$NPast_5_pers_extravert[i], data$NFut_5_pers_extravert[i], - data$NPast_5_pers_extravert[i] - data$NFut_5_pers_extravert[i], - data$ehi5_pers_extravert[i], - ifelse(abs((data$NPast_5_pers_extravert[i] - data$NFut_5_pers_extravert[i]) - data$ehi5_pers_extravert[i]) < 1e-10, "✓", "✗"))) -} - -cat("\n--- 10-YEAR VARIABLES ---\n") -for (i in 1:5) { - cat(sprintf("\nRow %d:\n", i)) - cat(sprintf(" pref_read: %g - %g = %g | ehi10_pref_read = %g %s\n", - data$NPast_10_pref_read[i], data$NFut_10_pref_read[i], - data$NPast_10_pref_read[i] - data$NFut_10_pref_read[i], - data$ehi10_pref_read[i], - ifelse(abs((data$NPast_10_pref_read[i] - data$NFut_10_pref_read[i]) - data$ehi10_pref_read[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" pref_music: %g - %g = %g | ehi10_pref_music = %g %s\n", - data$NPast_10_pref_music[i], data$NFut_10_pref_music[i], - data$NPast_10_pref_music[i] - data$NFut_10_pref_music[i], - data$ehi10_pref_music[i], - ifelse(abs((data$NPast_10_pref_music[i] - data$NFut_10_pref_music[i]) - data$ehi10_pref_music[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" pers_extravert: %g - %g = %g | ehi10_pers_extravert = %g %s\n", - data$NPast_10_pers_extravert[i], data$NFut_10_pers_extravert[i], - data$NPast_10_pers_extravert[i] - data$NFut_10_pers_extravert[i], - data$ehi10_pers_extravert[i], - ifelse(abs((data$NPast_10_pers_extravert[i] - data$NFut_10_pers_extravert[i]) - data$ehi10_pers_extravert[i]) < 1e-10, "✓", "✗"))) -} - -cat("\n--- 5-10 YEAR CHANGE VARIABLES ---\n") -for (i in 1:5) { - cat(sprintf("\nRow %d:\n", i)) - cat(sprintf(" pref_read: %g - %g = %g | ehi5.10_pref_read = %g %s\n", - data$X5.10past_pref_read[i], data$X5.10fut_pref_read[i], - data$X5.10past_pref_read[i] - data$X5.10fut_pref_read[i], - data$ehi5.10_pref_read[i], - ifelse(abs((data$X5.10past_pref_read[i] - data$X5.10fut_pref_read[i]) - data$ehi5.10_pref_read[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" pref_music: %g - %g = %g | ehi5.10_pref_music = %g %s\n", - data$X5.10past_pref_music[i], data$X5.10fut_pref_music[i], - data$X5.10past_pref_music[i] - data$X5.10fut_pref_music[i], - data$ehi5.10_pref_music[i], - ifelse(abs((data$X5.10past_pref_music[i] - data$X5.10fut_pref_music[i]) - data$ehi5.10_pref_music[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" pers_extravert: %g - %g = %g | ehi5.10_pers_extravert = %g %s\n", - data$X5.10past_pers_extravert[i], data$X5.10fut_pers_extravert[i], - data$X5.10past_pers_extravert[i] - data$X5.10fut_pers_extravert[i], - data$ehi5.10_pers_extravert[i], - ifelse(abs((data$X5.10past_pers_extravert[i] - data$X5.10fut_pers_extravert[i]) - data$ehi5.10_pers_extravert[i]) < 1e-10, "✓", "✗"))) -} - -# Full QA check for all rows and all variables -cat("\n\n=== OVERALL QA CHECK (ALL ROWS, ALL VARIABLES) ===\n") - -qa_pairs <- list( - # 5-year pairs - list(npast = "NPast_5_pref_read", nfut = "NFut_5_pref_read", target = "ehi5_pref_read"), - list(npast = "NPast_5_pref_music", nfut = "NFut_5_pref_music", target = "ehi5_pref_music"), - list(npast = "NPast_5_pref_TV", nfut = "NFut_5_pref_TV", target 
= "ehi5_pref_TV"), - list(npast = "NPast_5_pref_nap", nfut = "NFut_5_pref_nap", target = "ehi5_pref_nap"), - list(npast = "NPast_5_pref_travel", nfut = "NFut_5_pref_travel", target = "ehi5_pref_travel"), - list(npast = "NPast_5_pers_extravert", nfut = "NFut_5_pers_extravert", target = "ehi5_pers_extravert"), - list(npast = "NPast_5_pers_critical", nfut = "NFut_5_pers_critical", target = "ehi5_pers_critical"), - list(npast = "NPast_5_pers_dependable", nfut = "NFut_5_pers_dependable", target = "ehi5_pers_dependable"), - list(npast = "NPast_5_pers_anxious", nfut = "NFut_5_pers_anxious", target = "ehi5_pers_anxious"), - list(npast = "NPast_5_pers_complex", nfut = "NFut_5_pers_complex", target = "ehi5_pers_complex"), - list(npast = "NPast_5_val_obey", nfut = "NFut_5_val_obey", target = "ehi5_val_obey"), - list(npast = "NPast_5_val_trad", nfut = "NFut_5_val_trad", target = "ehi5_val_trad"), - list(npast = "NPast_5_val_opinion", nfut = "NFut_5_val_opinion", target = "ehi5_val_opinion"), - list(npast = "NPast_5_val_performance", nfut = "NFut_5_val_performance", target = "ehi5_val_performance"), - list(npast = "NPast_5_val_justice", nfut = "NFut_5_val_justice", target = "ehi5_val_justice"), - - # 10-year pairs - list(npast = "NPast_10_pref_read", nfut = "NFut_10_pref_read", target = "ehi10_pref_read"), - list(npast = "NPast_10_pref_music", nfut = "NFut_10_pref_music", target = "ehi10_pref_music"), - list(npast = "NPast_10_pref_TV", nfut = "NFut_10_pref_TV", target = "ehi10_pref_TV"), - list(npast = "NPast_10_pref_nap", nfut = "NFut_10_pref_nap", target = "ehi10_pref_nap"), - list(npast = "NPast_10_pref_travel", nfut = "NFut_10_pref_travel", target = "ehi10_pref_travel"), - list(npast = "NPast_10_pers_extravert", nfut = "NFut_10_pers_extravert", target = "ehi10_pers_extravert"), - list(npast = "NPast_10_pers_critical", nfut = "NFut_10_pers_critical", target = "ehi10_pers_critical"), - list(npast = "NPast_10_pers_dependable", nfut = "NFut_10_pers_dependable", target = "ehi10_pers_dependable"), - list(npast = "NPast_10_pers_anxious", nfut = "NFut_10_pers_anxious", target = "ehi10_pers_anxious"), - list(npast = "NPast_10_pers_complex", nfut = "NFut_10_pers_complex", target = "ehi10_pers_complex"), - list(npast = "NPast_10_val_obey", nfut = "NFut_10_val_obey", target = "ehi10_val_obey"), - list(npast = "NPast_10_val_trad", nfut = "NFut_10_val_trad", target = "ehi10_val_trad"), - list(npast = "NPast_10_val_opinion", nfut = "NFut_10_val_opinion", target = "ehi10_val_opinion"), - list(npast = "NPast_10_val_performance", nfut = "NFut_10_val_performance", target = "ehi10_val_performance"), - list(npast = "NPast_10_val_justice", nfut = "NFut_10_val_justice", target = "ehi10_val_justice"), - - # 5-10 year change pairs - list(npast = "X5.10past_pref_read", nfut = "X5.10fut_pref_read", target = "ehi5.10_pref_read"), - list(npast = "X5.10past_pref_music", nfut = "X5.10fut_pref_music", target = "ehi5.10_pref_music"), - list(npast = "X5.10past_pref_TV", nfut = "X5.10fut_pref_TV", target = "ehi5.10_pref_TV"), - list(npast = "X5.10past_pref_nap", nfut = "X5.10fut_pref_nap", target = "ehi5.10_pref_nap"), - list(npast = "X5.10past_pref_travel", nfut = "X5.10fut_pref_travel", target = "ehi5.10_pref_travel"), - list(npast = "X5.10past_pers_extravert", nfut = "X5.10fut_pers_extravert", target = "ehi5.10_pers_extravert"), - list(npast = "X5.10past_pers_critical", nfut = "X5.10fut_pers_critical", target = "ehi5.10_pers_critical"), - list(npast = "X5.10past_pers_dependable", nfut = "X5.10fut_pers_dependable", target = 
"ehi5.10_pers_dependable"), - list(npast = "X5.10past_pers_anxious", nfut = "X5.10fut_pers_anxious", target = "ehi5.10_pers_anxious"), - list(npast = "X5.10past_pers_complex", nfut = "X5.10fut_pers_complex", target = "ehi5.10_pers_complex"), - list(npast = "X5.10past_val_obey", nfut = "X5.10fut_val_obey", target = "ehi5.10_val_obey"), - list(npast = "X5.10past_val_trad", nfut = "X5.10fut_val_trad", target = "ehi5.10_val_trad"), - list(npast = "X5.10past_val_opinion", nfut = "X5.10fut_val_opinion", target = "ehi5.10_val_opinion"), - list(npast = "X5.10past_val_performance", nfut = "X5.10fut_val_performance", target = "ehi5.10_val_performance"), - list(npast = "X5.10past_val_justice", nfut = "X5.10fut_val_justice", target = "ehi5.10_val_justice") -) - -all_checks_passed <- TRUE - -for (pair in qa_pairs) { - # Calculate expected difference - expected_diff <- data[[pair$npast]] - data[[pair$nfut]] - - # Get actual value in target variable - actual_value <- data[[pair$target]] - - # Compare (allowing for floating point precision issues) - discrepancies <- which(abs(expected_diff - actual_value) > 1e-10) - - if (length(discrepancies) > 0) { - cat(sprintf("FAIL: %s\n", pair$target)) - cat(sprintf(" Found %d discrepancies in rows: %s\n", - length(discrepancies), - paste(head(discrepancies, 10), collapse = ", "))) - - # Show first discrepancy details - row_num <- discrepancies[1] - cat(sprintf(" Example (row %d): %s (%g) - %s (%g) = %g, but %s = %g\n", - row_num, - pair$npast, data[[pair$npast]][row_num], - pair$nfut, data[[pair$nfut]][row_num], - expected_diff[row_num], - pair$target, actual_value[row_num])) - all_checks_passed <- FALSE - } else { - cat(sprintf("PASS: %s (n = %d)\n", pair$target, nrow(data))) - } -} - -cat("\n") -if (all_checks_passed) { - cat("*** ALL QA CHECKS PASSED ***\n") -} else { - cat("*** SOME QA CHECKS FAILED - REVIEW ABOVE ***\n") -} - -# Save updated dataset -write.csv(data, "eohi2.csv", row.names = FALSE) -cat("\nDataset saved to eohi2.csv\n") \ No newline at end of file diff --git a/.history/eohi2/dataP 11 - CORRECT ehi vars_20251008163817.r b/.history/eohi2/dataP 11 - CORRECT ehi vars_20251008163817.r deleted file mode 100644 index 96e4994..0000000 --- a/.history/eohi2/dataP 11 - CORRECT ehi vars_20251008163817.r +++ /dev/null @@ -1,235 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load data -data <- read.csv("eohi2.csv") - -# Create EHI difference variables (NPast - NFut) for different time intervals - -# === 5-YEAR PAST-FUTURE PAIRS === -# Preferences -data$ehi5_pref_read <- data$NPast_5_pref_read - data$NFut_5_pref_read -data$ehi5_pref_music <- data$NPast_5_pref_music - data$NFut_5_pref_music -data$ehi5_pref_TV <- data$NPast_5_pref_TV - data$NFut_5_pref_TV -data$ehi5_pref_nap <- data$NPast_5_pref_nap - data$NFut_5_pref_nap -data$ehi5_pref_travel <- data$NPast_5_pref_travel - data$NFut_5_pref_travel - -# Personality -data$ehi5_pers_extravert <- data$NPast_5_pers_extravert - data$NFut_5_pers_extravert -data$ehi5_pers_critical <- data$NPast_5_pers_critical - data$NFut_5_pers_critical -data$ehi5_pers_dependable <- data$NPast_5_pers_dependable - data$NFut_5_pers_dependable -data$ehi5_pers_anxious <- data$NPast_5_pers_anxious - data$NFut_5_pers_anxious -data$ehi5_pers_complex <- data$NPast_5_pers_complex - data$NFut_5_pers_complex - -# Values -data$ehi5_val_obey <- data$NPast_5_val_obey - data$NFut_5_val_obey -data$ehi5_val_trad <- data$NPast_5_val_trad - data$NFut_5_val_trad -data$ehi5_val_opinion <- data$NPast_5_val_opinion - 
data$NFut_5_val_opinion -data$ehi5_val_performance <- data$NPast_5_val_performance - data$NFut_5_val_performance -data$ehi5_val_justice <- data$NPast_5_val_justice - data$NFut_5_val_justice - -# === 10-YEAR PAST-FUTURE PAIRS === -# Preferences -data$ehi10_pref_read <- data$NPast_10_pref_read - data$NFut_10_pref_read -data$ehi10_pref_music <- data$NPast_10_pref_music - data$NFut_10_pref_music -data$ehi10_pref_TV <- data$NPast_10_pref_TV - data$NFut_10_pref_TV -data$ehi10_pref_nap <- data$NPast_10_pref_nap - data$NFut_10_pref_nap -data$ehi10_pref_travel <- data$NPast_10_pref_travel - data$NFut_10_pref_travel - -# Personality -data$ehi10_pers_extravert <- data$NPast_10_pers_extravert - data$NFut_10_pers_extravert -data$ehi10_pers_critical <- data$NPast_10_pers_critical - data$NFut_10_pers_critical -data$ehi10_pers_dependable <- data$NPast_10_pers_dependable - data$NFut_10_pers_dependable -data$ehi10_pers_anxious <- data$NPast_10_pers_anxious - data$NFut_10_pers_anxious -data$ehi10_pers_complex <- data$NPast_10_pers_complex - data$NFut_10_pers_complex - -# Values -data$ehi10_val_obey <- data$NPast_10_val_obey - data$NFut_10_val_obey -data$ehi10_val_trad <- data$NPast_10_val_trad - data$NFut_10_val_trad -data$ehi10_val_opinion <- data$NPast_10_val_opinion - data$NFut_10_val_opinion -data$ehi10_val_performance <- data$NPast_10_val_performance - data$NFut_10_val_performance -data$ehi10_val_justice <- data$NPast_10_val_justice - data$NFut_10_val_justice - -# === 5-10 YEAR CHANGE VARIABLES === -# Preferences -data$ehi5.10_pref_read <- data$X5.10past_pref_read - data$X5.10fut_pref_read -data$ehi5.10_pref_music <- data$X5.10past_pref_music - data$X5.10fut_pref_music -data$ehi5.10_pref_TV <- data$X5.10past_pref_TV - data$X5.10fut_pref_TV -data$ehi5.10_pref_nap <- data$X5.10past_pref_nap - data$X5.10fut_pref_nap -data$ehi5.10_pref_travel <- data$X5.10past_pref_travel - data$X5.10fut_pref_travel - -# Personality -data$ehi5.10_pers_extravert <- data$X5.10past_pers_extravert - data$X5.10fut_pers_extravert -data$ehi5.10_pers_critical <- data$X5.10past_pers_critical - data$X5.10fut_pers_critical -data$ehi5.10_pers_dependable <- data$X5.10past_pers_dependable - data$X5.10fut_pers_dependable -data$ehi5.10_pers_anxious <- data$X5.10past_pers_anxious - data$X5.10fut_pers_anxious -data$ehi5.10_pers_complex <- data$X5.10past_pers_complex - data$X5.10fut_pers_complex - -# Values -data$ehi5.10_val_obey <- data$X5.10past_val_obey - data$X5.10fut_val_obey -data$ehi5.10_val_trad <- data$X5.10past_val_trad - data$X5.10fut_val_trad -data$ehi5.10_val_opinion <- data$X5.10past_val_opinion - data$X5.10fut_val_opinion -data$ehi5.10_val_performance <- data$X5.10past_val_performance - data$X5.10fut_val_performance -data$ehi5.10_val_justice <- data$X5.10past_val_justice - data$X5.10fut_val_justice - -# QA: Verify calculations - FIRST 5 ROWS with detailed output -cat("\n=== QUALITY ASSURANCE CHECK - FIRST 5 ROWS ===\n\n") - -cat("--- 5-YEAR VARIABLES ---\n") -for (i in 1:5) { - cat(sprintf("\nRow %d:\n", i)) - cat(sprintf(" pref_read: %g - %g = %g | ehi5_pref_read = %g %s\n", - data$NPast_5_pref_read[i], data$NFut_5_pref_read[i], - data$NPast_5_pref_read[i] - data$NFut_5_pref_read[i], - data$ehi5_pref_read[i], - ifelse(abs((data$NPast_5_pref_read[i] - data$NFut_5_pref_read[i]) - data$ehi5_pref_read[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" pref_music: %g - %g = %g | ehi5_pref_music = %g %s\n", - data$NPast_5_pref_music[i], data$NFut_5_pref_music[i], - data$NPast_5_pref_music[i] - data$NFut_5_pref_music[i], - 
data$ehi5_pref_music[i], - ifelse(abs((data$NPast_5_pref_music[i] - data$NFut_5_pref_music[i]) - data$ehi5_pref_music[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" pers_extravert: %g - %g = %g | ehi5_pers_extravert = %g %s\n", - data$NPast_5_pers_extravert[i], data$NFut_5_pers_extravert[i], - data$NPast_5_pers_extravert[i] - data$NFut_5_pers_extravert[i], - data$ehi5_pers_extravert[i], - ifelse(abs((data$NPast_5_pers_extravert[i] - data$NFut_5_pers_extravert[i]) - data$ehi5_pers_extravert[i]) < 1e-10, "✓", "✗"))) -} - -cat("\n--- 10-YEAR VARIABLES ---\n") -for (i in 1:5) { - cat(sprintf("\nRow %d:\n", i)) - cat(sprintf(" pref_read: %g - %g = %g | ehi10_pref_read = %g %s\n", - data$NPast_10_pref_read[i], data$NFut_10_pref_read[i], - data$NPast_10_pref_read[i] - data$NFut_10_pref_read[i], - data$ehi10_pref_read[i], - ifelse(abs((data$NPast_10_pref_read[i] - data$NFut_10_pref_read[i]) - data$ehi10_pref_read[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" pref_music: %g - %g = %g | ehi10_pref_music = %g %s\n", - data$NPast_10_pref_music[i], data$NFut_10_pref_music[i], - data$NPast_10_pref_music[i] - data$NFut_10_pref_music[i], - data$ehi10_pref_music[i], - ifelse(abs((data$NPast_10_pref_music[i] - data$NFut_10_pref_music[i]) - data$ehi10_pref_music[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" pers_extravert: %g - %g = %g | ehi10_pers_extravert = %g %s\n", - data$NPast_10_pers_extravert[i], data$NFut_10_pers_extravert[i], - data$NPast_10_pers_extravert[i] - data$NFut_10_pers_extravert[i], - data$ehi10_pers_extravert[i], - ifelse(abs((data$NPast_10_pers_extravert[i] - data$NFut_10_pers_extravert[i]) - data$ehi10_pers_extravert[i]) < 1e-10, "✓", "✗"))) -} - -cat("\n--- 5-10 YEAR CHANGE VARIABLES ---\n") -for (i in 1:5) { - cat(sprintf("\nRow %d:\n", i)) - cat(sprintf(" pref_read: %g - %g = %g | ehi5.10_pref_read = %g %s\n", - data$X5.10past_pref_read[i], data$X5.10fut_pref_read[i], - data$X5.10past_pref_read[i] - data$X5.10fut_pref_read[i], - data$ehi5.10_pref_read[i], - ifelse(abs((data$X5.10past_pref_read[i] - data$X5.10fut_pref_read[i]) - data$ehi5.10_pref_read[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" pref_music: %g - %g = %g | ehi5.10_pref_music = %g %s\n", - data$X5.10past_pref_music[i], data$X5.10fut_pref_music[i], - data$X5.10past_pref_music[i] - data$X5.10fut_pref_music[i], - data$ehi5.10_pref_music[i], - ifelse(abs((data$X5.10past_pref_music[i] - data$X5.10fut_pref_music[i]) - data$ehi5.10_pref_music[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" pers_extravert: %g - %g = %g | ehi5.10_pers_extravert = %g %s\n", - data$X5.10past_pers_extravert[i], data$X5.10fut_pers_extravert[i], - data$X5.10past_pers_extravert[i] - data$X5.10fut_pers_extravert[i], - data$ehi5.10_pers_extravert[i], - ifelse(abs((data$X5.10past_pers_extravert[i] - data$X5.10fut_pers_extravert[i]) - data$ehi5.10_pers_extravert[i]) < 1e-10, "✓", "✗"))) -} - -# Full QA check for all rows and all variables -cat("\n\n=== OVERALL QA CHECK (ALL ROWS, ALL VARIABLES) ===\n") - -qa_pairs <- list( - # 5-year pairs - list(npast = "NPast_5_pref_read", nfut = "NFut_5_pref_read", target = "ehi5_pref_read"), - list(npast = "NPast_5_pref_music", nfut = "NFut_5_pref_music", target = "ehi5_pref_music"), - list(npast = "NPast_5_pref_TV", nfut = "NFut_5_pref_TV", target = "ehi5_pref_TV"), - list(npast = "NPast_5_pref_nap", nfut = "NFut_5_pref_nap", target = "ehi5_pref_nap"), - list(npast = "NPast_5_pref_travel", nfut = "NFut_5_pref_travel", target = "ehi5_pref_travel"), - list(npast = "NPast_5_pers_extravert", nfut = "NFut_5_pers_extravert", target = 
"ehi5_pers_extravert"), - list(npast = "NPast_5_pers_critical", nfut = "NFut_5_pers_critical", target = "ehi5_pers_critical"), - list(npast = "NPast_5_pers_dependable", nfut = "NFut_5_pers_dependable", target = "ehi5_pers_dependable"), - list(npast = "NPast_5_pers_anxious", nfut = "NFut_5_pers_anxious", target = "ehi5_pers_anxious"), - list(npast = "NPast_5_pers_complex", nfut = "NFut_5_pers_complex", target = "ehi5_pers_complex"), - list(npast = "NPast_5_val_obey", nfut = "NFut_5_val_obey", target = "ehi5_val_obey"), - list(npast = "NPast_5_val_trad", nfut = "NFut_5_val_trad", target = "ehi5_val_trad"), - list(npast = "NPast_5_val_opinion", nfut = "NFut_5_val_opinion", target = "ehi5_val_opinion"), - list(npast = "NPast_5_val_performance", nfut = "NFut_5_val_performance", target = "ehi5_val_performance"), - list(npast = "NPast_5_val_justice", nfut = "NFut_5_val_justice", target = "ehi5_val_justice"), - - # 10-year pairs - list(npast = "NPast_10_pref_read", nfut = "NFut_10_pref_read", target = "ehi10_pref_read"), - list(npast = "NPast_10_pref_music", nfut = "NFut_10_pref_music", target = "ehi10_pref_music"), - list(npast = "NPast_10_pref_TV", nfut = "NFut_10_pref_TV", target = "ehi10_pref_TV"), - list(npast = "NPast_10_pref_nap", nfut = "NFut_10_pref_nap", target = "ehi10_pref_nap"), - list(npast = "NPast_10_pref_travel", nfut = "NFut_10_pref_travel", target = "ehi10_pref_travel"), - list(npast = "NPast_10_pers_extravert", nfut = "NFut_10_pers_extravert", target = "ehi10_pers_extravert"), - list(npast = "NPast_10_pers_critical", nfut = "NFut_10_pers_critical", target = "ehi10_pers_critical"), - list(npast = "NPast_10_pers_dependable", nfut = "NFut_10_pers_dependable", target = "ehi10_pers_dependable"), - list(npast = "NPast_10_pers_anxious", nfut = "NFut_10_pers_anxious", target = "ehi10_pers_anxious"), - list(npast = "NPast_10_pers_complex", nfut = "NFut_10_pers_complex", target = "ehi10_pers_complex"), - list(npast = "NPast_10_val_obey", nfut = "NFut_10_val_obey", target = "ehi10_val_obey"), - list(npast = "NPast_10_val_trad", nfut = "NFut_10_val_trad", target = "ehi10_val_trad"), - list(npast = "NPast_10_val_opinion", nfut = "NFut_10_val_opinion", target = "ehi10_val_opinion"), - list(npast = "NPast_10_val_performance", nfut = "NFut_10_val_performance", target = "ehi10_val_performance"), - list(npast = "NPast_10_val_justice", nfut = "NFut_10_val_justice", target = "ehi10_val_justice"), - - # 5-10 year change pairs - list(npast = "X5.10past_pref_read", nfut = "X5.10fut_pref_read", target = "ehi5.10_pref_read"), - list(npast = "X5.10past_pref_music", nfut = "X5.10fut_pref_music", target = "ehi5.10_pref_music"), - list(npast = "X5.10past_pref_TV", nfut = "X5.10fut_pref_TV", target = "ehi5.10_pref_TV"), - list(npast = "X5.10past_pref_nap", nfut = "X5.10fut_pref_nap", target = "ehi5.10_pref_nap"), - list(npast = "X5.10past_pref_travel", nfut = "X5.10fut_pref_travel", target = "ehi5.10_pref_travel"), - list(npast = "X5.10past_pers_extravert", nfut = "X5.10fut_pers_extravert", target = "ehi5.10_pers_extravert"), - list(npast = "X5.10past_pers_critical", nfut = "X5.10fut_pers_critical", target = "ehi5.10_pers_critical"), - list(npast = "X5.10past_pers_dependable", nfut = "X5.10fut_pers_dependable", target = "ehi5.10_pers_dependable"), - list(npast = "X5.10past_pers_anxious", nfut = "X5.10fut_pers_anxious", target = "ehi5.10_pers_anxious"), - list(npast = "X5.10past_pers_complex", nfut = "X5.10fut_pers_complex", target = "ehi5.10_pers_complex"), - list(npast = "X5.10past_val_obey", nfut = 
"X5.10fut_val_obey", target = "ehi5.10_val_obey"), - list(npast = "X5.10past_val_trad", nfut = "X5.10fut_val_trad", target = "ehi5.10_val_trad"), - list(npast = "X5.10past_val_opinion", nfut = "X5.10fut_val_opinion", target = "ehi5.10_val_opinion"), - list(npast = "X5.10past_val_performance", nfut = "X5.10fut_val_performance", target = "ehi5.10_val_performance"), - list(npast = "X5.10past_val_justice", nfut = "X5.10fut_val_justice", target = "ehi5.10_val_justice") -) - -all_checks_passed <- TRUE - -for (pair in qa_pairs) { - # Calculate expected difference - expected_diff <- data[[pair$npast]] - data[[pair$nfut]] - - # Get actual value in target variable - actual_value <- data[[pair$target]] - - # Compare (allowing for floating point precision issues) - discrepancies <- which(abs(expected_diff - actual_value) > 1e-10) - - if (length(discrepancies) > 0) { - cat(sprintf("FAIL: %s\n", pair$target)) - cat(sprintf(" Found %d discrepancies in rows: %s\n", - length(discrepancies), - paste(head(discrepancies, 10), collapse = ", "))) - - # Show first discrepancy details - row_num <- discrepancies[1] - cat(sprintf(" Example (row %d): %s (%g) - %s (%g) = %g, but %s = %g\n", - row_num, - pair$npast, data[[pair$npast]][row_num], - pair$nfut, data[[pair$nfut]][row_num], - expected_diff[row_num], - pair$target, actual_value[row_num])) - all_checks_passed <- FALSE - } else { - cat(sprintf("PASS: %s (n = %d)\n", pair$target, nrow(data))) - } -} - -cat("\n") -if (all_checks_passed) { - cat("*** ALL QA CHECKS PASSED ***\n") -} else { - cat("*** SOME QA CHECKS FAILED - REVIEW ABOVE ***\n") -} - -# Save updated dataset -write.csv(data, "eohi2.csv", row.names = FALSE) -cat("\nDataset saved to eohi2.csv\n") \ No newline at end of file diff --git a/.history/eohi2/datap 12 - CORRECT DGEN ehi vars_20251008164446.r b/.history/eohi2/datap 12 - CORRECT DGEN ehi vars_20251008164446.r deleted file mode 100644 index e69de29..0000000 diff --git a/.history/eohi2/datap 12 - CORRECT DGEN ehi vars_20251008164447.r b/.history/eohi2/datap 12 - CORRECT DGEN ehi vars_20251008164447.r deleted file mode 100644 index 96e4994..0000000 --- a/.history/eohi2/datap 12 - CORRECT DGEN ehi vars_20251008164447.r +++ /dev/null @@ -1,235 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load data -data <- read.csv("eohi2.csv") - -# Create EHI difference variables (NPast - NFut) for different time intervals - -# === 5-YEAR PAST-FUTURE PAIRS === -# Preferences -data$ehi5_pref_read <- data$NPast_5_pref_read - data$NFut_5_pref_read -data$ehi5_pref_music <- data$NPast_5_pref_music - data$NFut_5_pref_music -data$ehi5_pref_TV <- data$NPast_5_pref_TV - data$NFut_5_pref_TV -data$ehi5_pref_nap <- data$NPast_5_pref_nap - data$NFut_5_pref_nap -data$ehi5_pref_travel <- data$NPast_5_pref_travel - data$NFut_5_pref_travel - -# Personality -data$ehi5_pers_extravert <- data$NPast_5_pers_extravert - data$NFut_5_pers_extravert -data$ehi5_pers_critical <- data$NPast_5_pers_critical - data$NFut_5_pers_critical -data$ehi5_pers_dependable <- data$NPast_5_pers_dependable - data$NFut_5_pers_dependable -data$ehi5_pers_anxious <- data$NPast_5_pers_anxious - data$NFut_5_pers_anxious -data$ehi5_pers_complex <- data$NPast_5_pers_complex - data$NFut_5_pers_complex - -# Values -data$ehi5_val_obey <- data$NPast_5_val_obey - data$NFut_5_val_obey -data$ehi5_val_trad <- data$NPast_5_val_trad - data$NFut_5_val_trad -data$ehi5_val_opinion <- data$NPast_5_val_opinion - data$NFut_5_val_opinion -data$ehi5_val_performance <- 
data$NPast_5_val_performance - data$NFut_5_val_performance -data$ehi5_val_justice <- data$NPast_5_val_justice - data$NFut_5_val_justice - -# === 10-YEAR PAST-FUTURE PAIRS === -# Preferences -data$ehi10_pref_read <- data$NPast_10_pref_read - data$NFut_10_pref_read -data$ehi10_pref_music <- data$NPast_10_pref_music - data$NFut_10_pref_music -data$ehi10_pref_TV <- data$NPast_10_pref_TV - data$NFut_10_pref_TV -data$ehi10_pref_nap <- data$NPast_10_pref_nap - data$NFut_10_pref_nap -data$ehi10_pref_travel <- data$NPast_10_pref_travel - data$NFut_10_pref_travel - -# Personality -data$ehi10_pers_extravert <- data$NPast_10_pers_extravert - data$NFut_10_pers_extravert -data$ehi10_pers_critical <- data$NPast_10_pers_critical - data$NFut_10_pers_critical -data$ehi10_pers_dependable <- data$NPast_10_pers_dependable - data$NFut_10_pers_dependable -data$ehi10_pers_anxious <- data$NPast_10_pers_anxious - data$NFut_10_pers_anxious -data$ehi10_pers_complex <- data$NPast_10_pers_complex - data$NFut_10_pers_complex - -# Values -data$ehi10_val_obey <- data$NPast_10_val_obey - data$NFut_10_val_obey -data$ehi10_val_trad <- data$NPast_10_val_trad - data$NFut_10_val_trad -data$ehi10_val_opinion <- data$NPast_10_val_opinion - data$NFut_10_val_opinion -data$ehi10_val_performance <- data$NPast_10_val_performance - data$NFut_10_val_performance -data$ehi10_val_justice <- data$NPast_10_val_justice - data$NFut_10_val_justice - -# === 5-10 YEAR CHANGE VARIABLES === -# Preferences -data$ehi5.10_pref_read <- data$X5.10past_pref_read - data$X5.10fut_pref_read -data$ehi5.10_pref_music <- data$X5.10past_pref_music - data$X5.10fut_pref_music -data$ehi5.10_pref_TV <- data$X5.10past_pref_TV - data$X5.10fut_pref_TV -data$ehi5.10_pref_nap <- data$X5.10past_pref_nap - data$X5.10fut_pref_nap -data$ehi5.10_pref_travel <- data$X5.10past_pref_travel - data$X5.10fut_pref_travel - -# Personality -data$ehi5.10_pers_extravert <- data$X5.10past_pers_extravert - data$X5.10fut_pers_extravert -data$ehi5.10_pers_critical <- data$X5.10past_pers_critical - data$X5.10fut_pers_critical -data$ehi5.10_pers_dependable <- data$X5.10past_pers_dependable - data$X5.10fut_pers_dependable -data$ehi5.10_pers_anxious <- data$X5.10past_pers_anxious - data$X5.10fut_pers_anxious -data$ehi5.10_pers_complex <- data$X5.10past_pers_complex - data$X5.10fut_pers_complex - -# Values -data$ehi5.10_val_obey <- data$X5.10past_val_obey - data$X5.10fut_val_obey -data$ehi5.10_val_trad <- data$X5.10past_val_trad - data$X5.10fut_val_trad -data$ehi5.10_val_opinion <- data$X5.10past_val_opinion - data$X5.10fut_val_opinion -data$ehi5.10_val_performance <- data$X5.10past_val_performance - data$X5.10fut_val_performance -data$ehi5.10_val_justice <- data$X5.10past_val_justice - data$X5.10fut_val_justice - -# QA: Verify calculations - FIRST 5 ROWS with detailed output -cat("\n=== QUALITY ASSURANCE CHECK - FIRST 5 ROWS ===\n\n") - -cat("--- 5-YEAR VARIABLES ---\n") -for (i in 1:5) { - cat(sprintf("\nRow %d:\n", i)) - cat(sprintf(" pref_read: %g - %g = %g | ehi5_pref_read = %g %s\n", - data$NPast_5_pref_read[i], data$NFut_5_pref_read[i], - data$NPast_5_pref_read[i] - data$NFut_5_pref_read[i], - data$ehi5_pref_read[i], - ifelse(abs((data$NPast_5_pref_read[i] - data$NFut_5_pref_read[i]) - data$ehi5_pref_read[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" pref_music: %g - %g = %g | ehi5_pref_music = %g %s\n", - data$NPast_5_pref_music[i], data$NFut_5_pref_music[i], - data$NPast_5_pref_music[i] - data$NFut_5_pref_music[i], - data$ehi5_pref_music[i], - ifelse(abs((data$NPast_5_pref_music[i] - 
data$NFut_5_pref_music[i]) - data$ehi5_pref_music[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" pers_extravert: %g - %g = %g | ehi5_pers_extravert = %g %s\n", - data$NPast_5_pers_extravert[i], data$NFut_5_pers_extravert[i], - data$NPast_5_pers_extravert[i] - data$NFut_5_pers_extravert[i], - data$ehi5_pers_extravert[i], - ifelse(abs((data$NPast_5_pers_extravert[i] - data$NFut_5_pers_extravert[i]) - data$ehi5_pers_extravert[i]) < 1e-10, "✓", "✗"))) -} - -cat("\n--- 10-YEAR VARIABLES ---\n") -for (i in 1:5) { - cat(sprintf("\nRow %d:\n", i)) - cat(sprintf(" pref_read: %g - %g = %g | ehi10_pref_read = %g %s\n", - data$NPast_10_pref_read[i], data$NFut_10_pref_read[i], - data$NPast_10_pref_read[i] - data$NFut_10_pref_read[i], - data$ehi10_pref_read[i], - ifelse(abs((data$NPast_10_pref_read[i] - data$NFut_10_pref_read[i]) - data$ehi10_pref_read[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" pref_music: %g - %g = %g | ehi10_pref_music = %g %s\n", - data$NPast_10_pref_music[i], data$NFut_10_pref_music[i], - data$NPast_10_pref_music[i] - data$NFut_10_pref_music[i], - data$ehi10_pref_music[i], - ifelse(abs((data$NPast_10_pref_music[i] - data$NFut_10_pref_music[i]) - data$ehi10_pref_music[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" pers_extravert: %g - %g = %g | ehi10_pers_extravert = %g %s\n", - data$NPast_10_pers_extravert[i], data$NFut_10_pers_extravert[i], - data$NPast_10_pers_extravert[i] - data$NFut_10_pers_extravert[i], - data$ehi10_pers_extravert[i], - ifelse(abs((data$NPast_10_pers_extravert[i] - data$NFut_10_pers_extravert[i]) - data$ehi10_pers_extravert[i]) < 1e-10, "✓", "✗"))) -} - -cat("\n--- 5-10 YEAR CHANGE VARIABLES ---\n") -for (i in 1:5) { - cat(sprintf("\nRow %d:\n", i)) - cat(sprintf(" pref_read: %g - %g = %g | ehi5.10_pref_read = %g %s\n", - data$X5.10past_pref_read[i], data$X5.10fut_pref_read[i], - data$X5.10past_pref_read[i] - data$X5.10fut_pref_read[i], - data$ehi5.10_pref_read[i], - ifelse(abs((data$X5.10past_pref_read[i] - data$X5.10fut_pref_read[i]) - data$ehi5.10_pref_read[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" pref_music: %g - %g = %g | ehi5.10_pref_music = %g %s\n", - data$X5.10past_pref_music[i], data$X5.10fut_pref_music[i], - data$X5.10past_pref_music[i] - data$X5.10fut_pref_music[i], - data$ehi5.10_pref_music[i], - ifelse(abs((data$X5.10past_pref_music[i] - data$X5.10fut_pref_music[i]) - data$ehi5.10_pref_music[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" pers_extravert: %g - %g = %g | ehi5.10_pers_extravert = %g %s\n", - data$X5.10past_pers_extravert[i], data$X5.10fut_pers_extravert[i], - data$X5.10past_pers_extravert[i] - data$X5.10fut_pers_extravert[i], - data$ehi5.10_pers_extravert[i], - ifelse(abs((data$X5.10past_pers_extravert[i] - data$X5.10fut_pers_extravert[i]) - data$ehi5.10_pers_extravert[i]) < 1e-10, "✓", "✗"))) -} - -# Full QA check for all rows and all variables -cat("\n\n=== OVERALL QA CHECK (ALL ROWS, ALL VARIABLES) ===\n") - -qa_pairs <- list( - # 5-year pairs - list(npast = "NPast_5_pref_read", nfut = "NFut_5_pref_read", target = "ehi5_pref_read"), - list(npast = "NPast_5_pref_music", nfut = "NFut_5_pref_music", target = "ehi5_pref_music"), - list(npast = "NPast_5_pref_TV", nfut = "NFut_5_pref_TV", target = "ehi5_pref_TV"), - list(npast = "NPast_5_pref_nap", nfut = "NFut_5_pref_nap", target = "ehi5_pref_nap"), - list(npast = "NPast_5_pref_travel", nfut = "NFut_5_pref_travel", target = "ehi5_pref_travel"), - list(npast = "NPast_5_pers_extravert", nfut = "NFut_5_pers_extravert", target = "ehi5_pers_extravert"), - list(npast = "NPast_5_pers_critical", nfut = 
"NFut_5_pers_critical", target = "ehi5_pers_critical"), - list(npast = "NPast_5_pers_dependable", nfut = "NFut_5_pers_dependable", target = "ehi5_pers_dependable"), - list(npast = "NPast_5_pers_anxious", nfut = "NFut_5_pers_anxious", target = "ehi5_pers_anxious"), - list(npast = "NPast_5_pers_complex", nfut = "NFut_5_pers_complex", target = "ehi5_pers_complex"), - list(npast = "NPast_5_val_obey", nfut = "NFut_5_val_obey", target = "ehi5_val_obey"), - list(npast = "NPast_5_val_trad", nfut = "NFut_5_val_trad", target = "ehi5_val_trad"), - list(npast = "NPast_5_val_opinion", nfut = "NFut_5_val_opinion", target = "ehi5_val_opinion"), - list(npast = "NPast_5_val_performance", nfut = "NFut_5_val_performance", target = "ehi5_val_performance"), - list(npast = "NPast_5_val_justice", nfut = "NFut_5_val_justice", target = "ehi5_val_justice"), - - # 10-year pairs - list(npast = "NPast_10_pref_read", nfut = "NFut_10_pref_read", target = "ehi10_pref_read"), - list(npast = "NPast_10_pref_music", nfut = "NFut_10_pref_music", target = "ehi10_pref_music"), - list(npast = "NPast_10_pref_TV", nfut = "NFut_10_pref_TV", target = "ehi10_pref_TV"), - list(npast = "NPast_10_pref_nap", nfut = "NFut_10_pref_nap", target = "ehi10_pref_nap"), - list(npast = "NPast_10_pref_travel", nfut = "NFut_10_pref_travel", target = "ehi10_pref_travel"), - list(npast = "NPast_10_pers_extravert", nfut = "NFut_10_pers_extravert", target = "ehi10_pers_extravert"), - list(npast = "NPast_10_pers_critical", nfut = "NFut_10_pers_critical", target = "ehi10_pers_critical"), - list(npast = "NPast_10_pers_dependable", nfut = "NFut_10_pers_dependable", target = "ehi10_pers_dependable"), - list(npast = "NPast_10_pers_anxious", nfut = "NFut_10_pers_anxious", target = "ehi10_pers_anxious"), - list(npast = "NPast_10_pers_complex", nfut = "NFut_10_pers_complex", target = "ehi10_pers_complex"), - list(npast = "NPast_10_val_obey", nfut = "NFut_10_val_obey", target = "ehi10_val_obey"), - list(npast = "NPast_10_val_trad", nfut = "NFut_10_val_trad", target = "ehi10_val_trad"), - list(npast = "NPast_10_val_opinion", nfut = "NFut_10_val_opinion", target = "ehi10_val_opinion"), - list(npast = "NPast_10_val_performance", nfut = "NFut_10_val_performance", target = "ehi10_val_performance"), - list(npast = "NPast_10_val_justice", nfut = "NFut_10_val_justice", target = "ehi10_val_justice"), - - # 5-10 year change pairs - list(npast = "X5.10past_pref_read", nfut = "X5.10fut_pref_read", target = "ehi5.10_pref_read"), - list(npast = "X5.10past_pref_music", nfut = "X5.10fut_pref_music", target = "ehi5.10_pref_music"), - list(npast = "X5.10past_pref_TV", nfut = "X5.10fut_pref_TV", target = "ehi5.10_pref_TV"), - list(npast = "X5.10past_pref_nap", nfut = "X5.10fut_pref_nap", target = "ehi5.10_pref_nap"), - list(npast = "X5.10past_pref_travel", nfut = "X5.10fut_pref_travel", target = "ehi5.10_pref_travel"), - list(npast = "X5.10past_pers_extravert", nfut = "X5.10fut_pers_extravert", target = "ehi5.10_pers_extravert"), - list(npast = "X5.10past_pers_critical", nfut = "X5.10fut_pers_critical", target = "ehi5.10_pers_critical"), - list(npast = "X5.10past_pers_dependable", nfut = "X5.10fut_pers_dependable", target = "ehi5.10_pers_dependable"), - list(npast = "X5.10past_pers_anxious", nfut = "X5.10fut_pers_anxious", target = "ehi5.10_pers_anxious"), - list(npast = "X5.10past_pers_complex", nfut = "X5.10fut_pers_complex", target = "ehi5.10_pers_complex"), - list(npast = "X5.10past_val_obey", nfut = "X5.10fut_val_obey", target = "ehi5.10_val_obey"), - list(npast = 
"X5.10past_val_trad", nfut = "X5.10fut_val_trad", target = "ehi5.10_val_trad"), - list(npast = "X5.10past_val_opinion", nfut = "X5.10fut_val_opinion", target = "ehi5.10_val_opinion"), - list(npast = "X5.10past_val_performance", nfut = "X5.10fut_val_performance", target = "ehi5.10_val_performance"), - list(npast = "X5.10past_val_justice", nfut = "X5.10fut_val_justice", target = "ehi5.10_val_justice") -) - -all_checks_passed <- TRUE - -for (pair in qa_pairs) { - # Calculate expected difference - expected_diff <- data[[pair$npast]] - data[[pair$nfut]] - - # Get actual value in target variable - actual_value <- data[[pair$target]] - - # Compare (allowing for floating point precision issues) - discrepancies <- which(abs(expected_diff - actual_value) > 1e-10) - - if (length(discrepancies) > 0) { - cat(sprintf("FAIL: %s\n", pair$target)) - cat(sprintf(" Found %d discrepancies in rows: %s\n", - length(discrepancies), - paste(head(discrepancies, 10), collapse = ", "))) - - # Show first discrepancy details - row_num <- discrepancies[1] - cat(sprintf(" Example (row %d): %s (%g) - %s (%g) = %g, but %s = %g\n", - row_num, - pair$npast, data[[pair$npast]][row_num], - pair$nfut, data[[pair$nfut]][row_num], - expected_diff[row_num], - pair$target, actual_value[row_num])) - all_checks_passed <- FALSE - } else { - cat(sprintf("PASS: %s (n = %d)\n", pair$target, nrow(data))) - } -} - -cat("\n") -if (all_checks_passed) { - cat("*** ALL QA CHECKS PASSED ***\n") -} else { - cat("*** SOME QA CHECKS FAILED - REVIEW ABOVE ***\n") -} - -# Save updated dataset -write.csv(data, "eohi2.csv", row.names = FALSE) -cat("\nDataset saved to eohi2.csv\n") \ No newline at end of file diff --git a/.history/eohi2/datap 12 - CORRECT DGEN ehi vars_20251008164616.r b/.history/eohi2/datap 12 - CORRECT DGEN ehi vars_20251008164616.r deleted file mode 100644 index 921f701..0000000 --- a/.history/eohi2/datap 12 - CORRECT DGEN ehi vars_20251008164616.r +++ /dev/null @@ -1,179 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load data -data <- read.csv("eohi2.csv") - -# Create DGEN EHI difference variables (Past - Future) for different time intervals - -# === 5-YEAR DGEN PAST-FUTURE PAIRS === -data$ehiDGEN_5_Pref <- data$DGEN_past_5_Pref - data$DGEN_fut_5_Pref -data$ehiDGEN_5_Pers <- data$DGEN_past_5_Pers - data$DGEN_fut_5_Pers -data$ehiDGEN_5_Val <- data$DGEN_past_5_Val - data$DGEN_fut_5_Val - -# === 10-YEAR DGEN PAST-FUTURE PAIRS === -data$ehiDGEN_10_Pref <- data$DGEN_past_10_Pref - data$DGEN_fut_10_Pref -data$ehiDGEN_10_Pers <- data$DGEN_past_10_Pers - data$DGEN_fut_10_Pers -data$ehiDGEN_10_Val <- data$DGEN_past_10_Val - data$DGEN_fut_10_Val - -# QA: Verify calculations - FIRST 5 ROWS with detailed output -cat("\n=== QUALITY ASSURANCE CHECK - FIRST 5 ROWS ===\n\n") - -cat("--- 5-YEAR VARIABLES ---\n") -for (i in 1:5) { - cat(sprintf("\nRow %d:\n", i)) - cat(sprintf(" pref_read: %g - %g = %g | ehi5_pref_read = %g %s\n", - data$NPast_5_pref_read[i], data$NFut_5_pref_read[i], - data$NPast_5_pref_read[i] - data$NFut_5_pref_read[i], - data$ehi5_pref_read[i], - ifelse(abs((data$NPast_5_pref_read[i] - data$NFut_5_pref_read[i]) - data$ehi5_pref_read[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" pref_music: %g - %g = %g | ehi5_pref_music = %g %s\n", - data$NPast_5_pref_music[i], data$NFut_5_pref_music[i], - data$NPast_5_pref_music[i] - data$NFut_5_pref_music[i], - data$ehi5_pref_music[i], - ifelse(abs((data$NPast_5_pref_music[i] - data$NFut_5_pref_music[i]) - data$ehi5_pref_music[i]) < 1e-10, "✓", "✗"))) - 
cat(sprintf(" pers_extravert: %g - %g = %g | ehi5_pers_extravert = %g %s\n", - data$NPast_5_pers_extravert[i], data$NFut_5_pers_extravert[i], - data$NPast_5_pers_extravert[i] - data$NFut_5_pers_extravert[i], - data$ehi5_pers_extravert[i], - ifelse(abs((data$NPast_5_pers_extravert[i] - data$NFut_5_pers_extravert[i]) - data$ehi5_pers_extravert[i]) < 1e-10, "✓", "✗"))) -} - -cat("\n--- 10-YEAR VARIABLES ---\n") -for (i in 1:5) { - cat(sprintf("\nRow %d:\n", i)) - cat(sprintf(" pref_read: %g - %g = %g | ehi10_pref_read = %g %s\n", - data$NPast_10_pref_read[i], data$NFut_10_pref_read[i], - data$NPast_10_pref_read[i] - data$NFut_10_pref_read[i], - data$ehi10_pref_read[i], - ifelse(abs((data$NPast_10_pref_read[i] - data$NFut_10_pref_read[i]) - data$ehi10_pref_read[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" pref_music: %g - %g = %g | ehi10_pref_music = %g %s\n", - data$NPast_10_pref_music[i], data$NFut_10_pref_music[i], - data$NPast_10_pref_music[i] - data$NFut_10_pref_music[i], - data$ehi10_pref_music[i], - ifelse(abs((data$NPast_10_pref_music[i] - data$NFut_10_pref_music[i]) - data$ehi10_pref_music[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" pers_extravert: %g - %g = %g | ehi10_pers_extravert = %g %s\n", - data$NPast_10_pers_extravert[i], data$NFut_10_pers_extravert[i], - data$NPast_10_pers_extravert[i] - data$NFut_10_pers_extravert[i], - data$ehi10_pers_extravert[i], - ifelse(abs((data$NPast_10_pers_extravert[i] - data$NFut_10_pers_extravert[i]) - data$ehi10_pers_extravert[i]) < 1e-10, "✓", "✗"))) -} - -cat("\n--- 5-10 YEAR CHANGE VARIABLES ---\n") -for (i in 1:5) { - cat(sprintf("\nRow %d:\n", i)) - cat(sprintf(" pref_read: %g - %g = %g | ehi5.10_pref_read = %g %s\n", - data$X5.10past_pref_read[i], data$X5.10fut_pref_read[i], - data$X5.10past_pref_read[i] - data$X5.10fut_pref_read[i], - data$ehi5.10_pref_read[i], - ifelse(abs((data$X5.10past_pref_read[i] - data$X5.10fut_pref_read[i]) - data$ehi5.10_pref_read[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" pref_music: %g - %g = %g | ehi5.10_pref_music = %g %s\n", - data$X5.10past_pref_music[i], data$X5.10fut_pref_music[i], - data$X5.10past_pref_music[i] - data$X5.10fut_pref_music[i], - data$ehi5.10_pref_music[i], - ifelse(abs((data$X5.10past_pref_music[i] - data$X5.10fut_pref_music[i]) - data$ehi5.10_pref_music[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" pers_extravert: %g - %g = %g | ehi5.10_pers_extravert = %g %s\n", - data$X5.10past_pers_extravert[i], data$X5.10fut_pers_extravert[i], - data$X5.10past_pers_extravert[i] - data$X5.10fut_pers_extravert[i], - data$ehi5.10_pers_extravert[i], - ifelse(abs((data$X5.10past_pers_extravert[i] - data$X5.10fut_pers_extravert[i]) - data$ehi5.10_pers_extravert[i]) < 1e-10, "✓", "✗"))) -} - -# Full QA check for all rows and all variables -cat("\n\n=== OVERALL QA CHECK (ALL ROWS, ALL VARIABLES) ===\n") - -qa_pairs <- list( - # 5-year pairs - list(npast = "NPast_5_pref_read", nfut = "NFut_5_pref_read", target = "ehi5_pref_read"), - list(npast = "NPast_5_pref_music", nfut = "NFut_5_pref_music", target = "ehi5_pref_music"), - list(npast = "NPast_5_pref_TV", nfut = "NFut_5_pref_TV", target = "ehi5_pref_TV"), - list(npast = "NPast_5_pref_nap", nfut = "NFut_5_pref_nap", target = "ehi5_pref_nap"), - list(npast = "NPast_5_pref_travel", nfut = "NFut_5_pref_travel", target = "ehi5_pref_travel"), - list(npast = "NPast_5_pers_extravert", nfut = "NFut_5_pers_extravert", target = "ehi5_pers_extravert"), - list(npast = "NPast_5_pers_critical", nfut = "NFut_5_pers_critical", target = "ehi5_pers_critical"), - list(npast = 
"NPast_5_pers_dependable", nfut = "NFut_5_pers_dependable", target = "ehi5_pers_dependable"), - list(npast = "NPast_5_pers_anxious", nfut = "NFut_5_pers_anxious", target = "ehi5_pers_anxious"), - list(npast = "NPast_5_pers_complex", nfut = "NFut_5_pers_complex", target = "ehi5_pers_complex"), - list(npast = "NPast_5_val_obey", nfut = "NFut_5_val_obey", target = "ehi5_val_obey"), - list(npast = "NPast_5_val_trad", nfut = "NFut_5_val_trad", target = "ehi5_val_trad"), - list(npast = "NPast_5_val_opinion", nfut = "NFut_5_val_opinion", target = "ehi5_val_opinion"), - list(npast = "NPast_5_val_performance", nfut = "NFut_5_val_performance", target = "ehi5_val_performance"), - list(npast = "NPast_5_val_justice", nfut = "NFut_5_val_justice", target = "ehi5_val_justice"), - - # 10-year pairs - list(npast = "NPast_10_pref_read", nfut = "NFut_10_pref_read", target = "ehi10_pref_read"), - list(npast = "NPast_10_pref_music", nfut = "NFut_10_pref_music", target = "ehi10_pref_music"), - list(npast = "NPast_10_pref_TV", nfut = "NFut_10_pref_TV", target = "ehi10_pref_TV"), - list(npast = "NPast_10_pref_nap", nfut = "NFut_10_pref_nap", target = "ehi10_pref_nap"), - list(npast = "NPast_10_pref_travel", nfut = "NFut_10_pref_travel", target = "ehi10_pref_travel"), - list(npast = "NPast_10_pers_extravert", nfut = "NFut_10_pers_extravert", target = "ehi10_pers_extravert"), - list(npast = "NPast_10_pers_critical", nfut = "NFut_10_pers_critical", target = "ehi10_pers_critical"), - list(npast = "NPast_10_pers_dependable", nfut = "NFut_10_pers_dependable", target = "ehi10_pers_dependable"), - list(npast = "NPast_10_pers_anxious", nfut = "NFut_10_pers_anxious", target = "ehi10_pers_anxious"), - list(npast = "NPast_10_pers_complex", nfut = "NFut_10_pers_complex", target = "ehi10_pers_complex"), - list(npast = "NPast_10_val_obey", nfut = "NFut_10_val_obey", target = "ehi10_val_obey"), - list(npast = "NPast_10_val_trad", nfut = "NFut_10_val_trad", target = "ehi10_val_trad"), - list(npast = "NPast_10_val_opinion", nfut = "NFut_10_val_opinion", target = "ehi10_val_opinion"), - list(npast = "NPast_10_val_performance", nfut = "NFut_10_val_performance", target = "ehi10_val_performance"), - list(npast = "NPast_10_val_justice", nfut = "NFut_10_val_justice", target = "ehi10_val_justice"), - - # 5-10 year change pairs - list(npast = "X5.10past_pref_read", nfut = "X5.10fut_pref_read", target = "ehi5.10_pref_read"), - list(npast = "X5.10past_pref_music", nfut = "X5.10fut_pref_music", target = "ehi5.10_pref_music"), - list(npast = "X5.10past_pref_TV", nfut = "X5.10fut_pref_TV", target = "ehi5.10_pref_TV"), - list(npast = "X5.10past_pref_nap", nfut = "X5.10fut_pref_nap", target = "ehi5.10_pref_nap"), - list(npast = "X5.10past_pref_travel", nfut = "X5.10fut_pref_travel", target = "ehi5.10_pref_travel"), - list(npast = "X5.10past_pers_extravert", nfut = "X5.10fut_pers_extravert", target = "ehi5.10_pers_extravert"), - list(npast = "X5.10past_pers_critical", nfut = "X5.10fut_pers_critical", target = "ehi5.10_pers_critical"), - list(npast = "X5.10past_pers_dependable", nfut = "X5.10fut_pers_dependable", target = "ehi5.10_pers_dependable"), - list(npast = "X5.10past_pers_anxious", nfut = "X5.10fut_pers_anxious", target = "ehi5.10_pers_anxious"), - list(npast = "X5.10past_pers_complex", nfut = "X5.10fut_pers_complex", target = "ehi5.10_pers_complex"), - list(npast = "X5.10past_val_obey", nfut = "X5.10fut_val_obey", target = "ehi5.10_val_obey"), - list(npast = "X5.10past_val_trad", nfut = "X5.10fut_val_trad", target = "ehi5.10_val_trad"), - 
list(npast = "X5.10past_val_opinion", nfut = "X5.10fut_val_opinion", target = "ehi5.10_val_opinion"), - list(npast = "X5.10past_val_performance", nfut = "X5.10fut_val_performance", target = "ehi5.10_val_performance"), - list(npast = "X5.10past_val_justice", nfut = "X5.10fut_val_justice", target = "ehi5.10_val_justice") -) - -all_checks_passed <- TRUE - -for (pair in qa_pairs) { - # Calculate expected difference - expected_diff <- data[[pair$npast]] - data[[pair$nfut]] - - # Get actual value in target variable - actual_value <- data[[pair$target]] - - # Compare (allowing for floating point precision issues) - discrepancies <- which(abs(expected_diff - actual_value) > 1e-10) - - if (length(discrepancies) > 0) { - cat(sprintf("FAIL: %s\n", pair$target)) - cat(sprintf(" Found %d discrepancies in rows: %s\n", - length(discrepancies), - paste(head(discrepancies, 10), collapse = ", "))) - - # Show first discrepancy details - row_num <- discrepancies[1] - cat(sprintf(" Example (row %d): %s (%g) - %s (%g) = %g, but %s = %g\n", - row_num, - pair$npast, data[[pair$npast]][row_num], - pair$nfut, data[[pair$nfut]][row_num], - expected_diff[row_num], - pair$target, actual_value[row_num])) - all_checks_passed <- FALSE - } else { - cat(sprintf("PASS: %s (n = %d)\n", pair$target, nrow(data))) - } -} - -cat("\n") -if (all_checks_passed) { - cat("*** ALL QA CHECKS PASSED ***\n") -} else { - cat("*** SOME QA CHECKS FAILED - REVIEW ABOVE ***\n") -} - -# Save updated dataset -write.csv(data, "eohi2.csv", row.names = FALSE) -cat("\nDataset saved to eohi2.csv\n") \ No newline at end of file diff --git a/.history/eohi2/datap 12 - CORRECT DGEN ehi vars_20251008164646.r b/.history/eohi2/datap 12 - CORRECT DGEN ehi vars_20251008164646.r deleted file mode 100644 index e569dee..0000000 --- a/.history/eohi2/datap 12 - CORRECT DGEN ehi vars_20251008164646.r +++ /dev/null @@ -1,159 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load data -data <- read.csv("eohi2.csv") - -# Create DGEN EHI difference variables (Past - Future) for different time intervals - -# === 5-YEAR DGEN PAST-FUTURE PAIRS === -data$ehiDGEN_5_Pref <- data$DGEN_past_5_Pref - data$DGEN_fut_5_Pref -data$ehiDGEN_5_Pers <- data$DGEN_past_5_Pers - data$DGEN_fut_5_Pers -data$ehiDGEN_5_Val <- data$DGEN_past_5_Val - data$DGEN_fut_5_Val - -# === 10-YEAR DGEN PAST-FUTURE PAIRS === -data$ehiDGEN_10_Pref <- data$DGEN_past_10_Pref - data$DGEN_fut_10_Pref -data$ehiDGEN_10_Pers <- data$DGEN_past_10_Pers - data$DGEN_fut_10_Pers -data$ehiDGEN_10_Val <- data$DGEN_past_10_Val - data$DGEN_fut_10_Val - -# QA: Verify calculations - FIRST 5 ROWS with detailed output -cat("\n=== QUALITY ASSURANCE CHECK - FIRST 5 ROWS ===\n\n") - -cat("--- 5-YEAR DGEN VARIABLES ---\n") -for (i in 1:5) { - cat(sprintf("\nRow %d:\n", i)) - cat(sprintf(" Pref: %g - %g = %g | ehiDGEN_5_Pref = %g %s\n", - data$DGEN_past_5_Pref[i], data$DGEN_fut_5_Pref[i], - data$DGEN_past_5_Pref[i] - data$DGEN_fut_5_Pref[i], - data$ehiDGEN_5_Pref[i], - ifelse(abs((data$DGEN_past_5_Pref[i] - data$DGEN_fut_5_Pref[i]) - data$ehiDGEN_5_Pref[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" Pers: %g - %g = %g | ehiDGEN_5_Pers = %g %s\n", - data$DGEN_past_5_Pers[i], data$DGEN_fut_5_Pers[i], - data$DGEN_past_5_Pers[i] - data$DGEN_fut_5_Pers[i], - data$ehiDGEN_5_Pers[i], - ifelse(abs((data$DGEN_past_5_Pers[i] - data$DGEN_fut_5_Pers[i]) - data$ehiDGEN_5_Pers[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" Val: %g - %g = %g | ehiDGEN_5_Val = %g %s\n", - data$DGEN_past_5_Val[i], data$DGEN_fut_5_Val[i], - 
data$DGEN_past_5_Val[i] - data$DGEN_fut_5_Val[i], - data$ehiDGEN_5_Val[i], - ifelse(abs((data$DGEN_past_5_Val[i] - data$DGEN_fut_5_Val[i]) - data$ehiDGEN_5_Val[i]) < 1e-10, "✓", "✗"))) -} - -cat("\n--- 10-YEAR DGEN VARIABLES ---\n") -for (i in 1:5) { - cat(sprintf("\nRow %d:\n", i)) - cat(sprintf(" Pref: %g - %g = %g | ehiDGEN_10_Pref = %g %s\n", - data$DGEN_past_10_Pref[i], data$DGEN_fut_10_Pref[i], - data$DGEN_past_10_Pref[i] - data$DGEN_fut_10_Pref[i], - data$ehiDGEN_10_Pref[i], - ifelse(abs((data$DGEN_past_10_Pref[i] - data$DGEN_fut_10_Pref[i]) - data$ehiDGEN_10_Pref[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" Pers: %g - %g = %g | ehiDGEN_10_Pers = %g %s\n", - data$DGEN_past_10_Pers[i], data$DGEN_fut_10_Pers[i], - data$DGEN_past_10_Pers[i] - data$DGEN_fut_10_Pers[i], - data$ehiDGEN_10_Pers[i], - ifelse(abs((data$DGEN_past_10_Pers[i] - data$DGEN_fut_10_Pers[i]) - data$ehiDGEN_10_Pers[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" Val: %g - %g = %g | ehiDGEN_10_Val = %g %s\n", - data$DGEN_past_10_Val[i], data$DGEN_fut_10_Val[i], - data$DGEN_past_10_Val[i] - data$DGEN_fut_10_Val[i], - data$ehiDGEN_10_Val[i], - ifelse(abs((data$DGEN_past_10_Val[i] - data$DGEN_fut_10_Val[i]) - data$ehiDGEN_10_Val[i]) < 1e-10, "✓", "✗"))) -} - -# Full QA check for all rows and all variables -cat("\n\n=== OVERALL QA CHECK (ALL ROWS, ALL VARIABLES) ===\n") - -qa_pairs <- list( - # 5-year pairs - list(npast = "NPast_5_pref_read", nfut = "NFut_5_pref_read", target = "ehi5_pref_read"), - list(npast = "NPast_5_pref_music", nfut = "NFut_5_pref_music", target = "ehi5_pref_music"), - list(npast = "NPast_5_pref_TV", nfut = "NFut_5_pref_TV", target = "ehi5_pref_TV"), - list(npast = "NPast_5_pref_nap", nfut = "NFut_5_pref_nap", target = "ehi5_pref_nap"), - list(npast = "NPast_5_pref_travel", nfut = "NFut_5_pref_travel", target = "ehi5_pref_travel"), - list(npast = "NPast_5_pers_extravert", nfut = "NFut_5_pers_extravert", target = "ehi5_pers_extravert"), - list(npast = "NPast_5_pers_critical", nfut = "NFut_5_pers_critical", target = "ehi5_pers_critical"), - list(npast = "NPast_5_pers_dependable", nfut = "NFut_5_pers_dependable", target = "ehi5_pers_dependable"), - list(npast = "NPast_5_pers_anxious", nfut = "NFut_5_pers_anxious", target = "ehi5_pers_anxious"), - list(npast = "NPast_5_pers_complex", nfut = "NFut_5_pers_complex", target = "ehi5_pers_complex"), - list(npast = "NPast_5_val_obey", nfut = "NFut_5_val_obey", target = "ehi5_val_obey"), - list(npast = "NPast_5_val_trad", nfut = "NFut_5_val_trad", target = "ehi5_val_trad"), - list(npast = "NPast_5_val_opinion", nfut = "NFut_5_val_opinion", target = "ehi5_val_opinion"), - list(npast = "NPast_5_val_performance", nfut = "NFut_5_val_performance", target = "ehi5_val_performance"), - list(npast = "NPast_5_val_justice", nfut = "NFut_5_val_justice", target = "ehi5_val_justice"), - - # 10-year pairs - list(npast = "NPast_10_pref_read", nfut = "NFut_10_pref_read", target = "ehi10_pref_read"), - list(npast = "NPast_10_pref_music", nfut = "NFut_10_pref_music", target = "ehi10_pref_music"), - list(npast = "NPast_10_pref_TV", nfut = "NFut_10_pref_TV", target = "ehi10_pref_TV"), - list(npast = "NPast_10_pref_nap", nfut = "NFut_10_pref_nap", target = "ehi10_pref_nap"), - list(npast = "NPast_10_pref_travel", nfut = "NFut_10_pref_travel", target = "ehi10_pref_travel"), - list(npast = "NPast_10_pers_extravert", nfut = "NFut_10_pers_extravert", target = "ehi10_pers_extravert"), - list(npast = "NPast_10_pers_critical", nfut = "NFut_10_pers_critical", target = "ehi10_pers_critical"), - 
list(npast = "NPast_10_pers_dependable", nfut = "NFut_10_pers_dependable", target = "ehi10_pers_dependable"), - list(npast = "NPast_10_pers_anxious", nfut = "NFut_10_pers_anxious", target = "ehi10_pers_anxious"), - list(npast = "NPast_10_pers_complex", nfut = "NFut_10_pers_complex", target = "ehi10_pers_complex"), - list(npast = "NPast_10_val_obey", nfut = "NFut_10_val_obey", target = "ehi10_val_obey"), - list(npast = "NPast_10_val_trad", nfut = "NFut_10_val_trad", target = "ehi10_val_trad"), - list(npast = "NPast_10_val_opinion", nfut = "NFut_10_val_opinion", target = "ehi10_val_opinion"), - list(npast = "NPast_10_val_performance", nfut = "NFut_10_val_performance", target = "ehi10_val_performance"), - list(npast = "NPast_10_val_justice", nfut = "NFut_10_val_justice", target = "ehi10_val_justice"), - - # 5-10 year change pairs - list(npast = "X5.10past_pref_read", nfut = "X5.10fut_pref_read", target = "ehi5.10_pref_read"), - list(npast = "X5.10past_pref_music", nfut = "X5.10fut_pref_music", target = "ehi5.10_pref_music"), - list(npast = "X5.10past_pref_TV", nfut = "X5.10fut_pref_TV", target = "ehi5.10_pref_TV"), - list(npast = "X5.10past_pref_nap", nfut = "X5.10fut_pref_nap", target = "ehi5.10_pref_nap"), - list(npast = "X5.10past_pref_travel", nfut = "X5.10fut_pref_travel", target = "ehi5.10_pref_travel"), - list(npast = "X5.10past_pers_extravert", nfut = "X5.10fut_pers_extravert", target = "ehi5.10_pers_extravert"), - list(npast = "X5.10past_pers_critical", nfut = "X5.10fut_pers_critical", target = "ehi5.10_pers_critical"), - list(npast = "X5.10past_pers_dependable", nfut = "X5.10fut_pers_dependable", target = "ehi5.10_pers_dependable"), - list(npast = "X5.10past_pers_anxious", nfut = "X5.10fut_pers_anxious", target = "ehi5.10_pers_anxious"), - list(npast = "X5.10past_pers_complex", nfut = "X5.10fut_pers_complex", target = "ehi5.10_pers_complex"), - list(npast = "X5.10past_val_obey", nfut = "X5.10fut_val_obey", target = "ehi5.10_val_obey"), - list(npast = "X5.10past_val_trad", nfut = "X5.10fut_val_trad", target = "ehi5.10_val_trad"), - list(npast = "X5.10past_val_opinion", nfut = "X5.10fut_val_opinion", target = "ehi5.10_val_opinion"), - list(npast = "X5.10past_val_performance", nfut = "X5.10fut_val_performance", target = "ehi5.10_val_performance"), - list(npast = "X5.10past_val_justice", nfut = "X5.10fut_val_justice", target = "ehi5.10_val_justice") -) - -all_checks_passed <- TRUE - -for (pair in qa_pairs) { - # Calculate expected difference - expected_diff <- data[[pair$npast]] - data[[pair$nfut]] - - # Get actual value in target variable - actual_value <- data[[pair$target]] - - # Compare (allowing for floating point precision issues) - discrepancies <- which(abs(expected_diff - actual_value) > 1e-10) - - if (length(discrepancies) > 0) { - cat(sprintf("FAIL: %s\n", pair$target)) - cat(sprintf(" Found %d discrepancies in rows: %s\n", - length(discrepancies), - paste(head(discrepancies, 10), collapse = ", "))) - - # Show first discrepancy details - row_num <- discrepancies[1] - cat(sprintf(" Example (row %d): %s (%g) - %s (%g) = %g, but %s = %g\n", - row_num, - pair$npast, data[[pair$npast]][row_num], - pair$nfut, data[[pair$nfut]][row_num], - expected_diff[row_num], - pair$target, actual_value[row_num])) - all_checks_passed <- FALSE - } else { - cat(sprintf("PASS: %s (n = %d)\n", pair$target, nrow(data))) - } -} - -cat("\n") -if (all_checks_passed) { - cat("*** ALL QA CHECKS PASSED ***\n") -} else { - cat("*** SOME QA CHECKS FAILED - REVIEW ABOVE ***\n") -} - -# Save updated dataset 
-write.csv(data, "eohi2.csv", row.names = FALSE) -cat("\nDataset saved to eohi2.csv\n") \ No newline at end of file diff --git a/.history/eohi2/datap 12 - CORRECT DGEN ehi vars_20251008164712.r b/.history/eohi2/datap 12 - CORRECT DGEN ehi vars_20251008164712.r deleted file mode 100644 index 5274cdd..0000000 --- a/.history/eohi2/datap 12 - CORRECT DGEN ehi vars_20251008164712.r +++ /dev/null @@ -1,118 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load data -data <- read.csv("eohi2.csv") - -# Create DGEN EHI difference variables (Past - Future) for different time intervals - -# === 5-YEAR DGEN PAST-FUTURE PAIRS === -data$ehiDGEN_5_Pref <- data$DGEN_past_5_Pref - data$DGEN_fut_5_Pref -data$ehiDGEN_5_Pers <- data$DGEN_past_5_Pers - data$DGEN_fut_5_Pers -data$ehiDGEN_5_Val <- data$DGEN_past_5_Val - data$DGEN_fut_5_Val - -# === 10-YEAR DGEN PAST-FUTURE PAIRS === -data$ehiDGEN_10_Pref <- data$DGEN_past_10_Pref - data$DGEN_fut_10_Pref -data$ehiDGEN_10_Pers <- data$DGEN_past_10_Pers - data$DGEN_fut_10_Pers -data$ehiDGEN_10_Val <- data$DGEN_past_10_Val - data$DGEN_fut_10_Val - -# QA: Verify calculations - FIRST 5 ROWS with detailed output -cat("\n=== QUALITY ASSURANCE CHECK - FIRST 5 ROWS ===\n\n") - -cat("--- 5-YEAR DGEN VARIABLES ---\n") -for (i in 1:5) { - cat(sprintf("\nRow %d:\n", i)) - cat(sprintf(" Pref: %g - %g = %g | ehiDGEN_5_Pref = %g %s\n", - data$DGEN_past_5_Pref[i], data$DGEN_fut_5_Pref[i], - data$DGEN_past_5_Pref[i] - data$DGEN_fut_5_Pref[i], - data$ehiDGEN_5_Pref[i], - ifelse(abs((data$DGEN_past_5_Pref[i] - data$DGEN_fut_5_Pref[i]) - data$ehiDGEN_5_Pref[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" Pers: %g - %g = %g | ehiDGEN_5_Pers = %g %s\n", - data$DGEN_past_5_Pers[i], data$DGEN_fut_5_Pers[i], - data$DGEN_past_5_Pers[i] - data$DGEN_fut_5_Pers[i], - data$ehiDGEN_5_Pers[i], - ifelse(abs((data$DGEN_past_5_Pers[i] - data$DGEN_fut_5_Pers[i]) - data$ehiDGEN_5_Pers[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" Val: %g - %g = %g | ehiDGEN_5_Val = %g %s\n", - data$DGEN_past_5_Val[i], data$DGEN_fut_5_Val[i], - data$DGEN_past_5_Val[i] - data$DGEN_fut_5_Val[i], - data$ehiDGEN_5_Val[i], - ifelse(abs((data$DGEN_past_5_Val[i] - data$DGEN_fut_5_Val[i]) - data$ehiDGEN_5_Val[i]) < 1e-10, "✓", "✗"))) -} - -cat("\n--- 10-YEAR DGEN VARIABLES ---\n") -for (i in 1:5) { - cat(sprintf("\nRow %d:\n", i)) - cat(sprintf(" Pref: %g - %g = %g | ehiDGEN_10_Pref = %g %s\n", - data$DGEN_past_10_Pref[i], data$DGEN_fut_10_Pref[i], - data$DGEN_past_10_Pref[i] - data$DGEN_fut_10_Pref[i], - data$ehiDGEN_10_Pref[i], - ifelse(abs((data$DGEN_past_10_Pref[i] - data$DGEN_fut_10_Pref[i]) - data$ehiDGEN_10_Pref[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" Pers: %g - %g = %g | ehiDGEN_10_Pers = %g %s\n", - data$DGEN_past_10_Pers[i], data$DGEN_fut_10_Pers[i], - data$DGEN_past_10_Pers[i] - data$DGEN_fut_10_Pers[i], - data$ehiDGEN_10_Pers[i], - ifelse(abs((data$DGEN_past_10_Pers[i] - data$DGEN_fut_10_Pers[i]) - data$ehiDGEN_10_Pers[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" Val: %g - %g = %g | ehiDGEN_10_Val = %g %s\n", - data$DGEN_past_10_Val[i], data$DGEN_fut_10_Val[i], - data$DGEN_past_10_Val[i] - data$DGEN_fut_10_Val[i], - data$ehiDGEN_10_Val[i], - ifelse(abs((data$DGEN_past_10_Val[i] - data$DGEN_fut_10_Val[i]) - data$ehiDGEN_10_Val[i]) < 1e-10, "✓", "✗"))) -} - -# Full QA check for all rows and all variables -cat("\n\n=== OVERALL QA CHECK (ALL ROWS, ALL VARIABLES) ===\n") - -qa_pairs <- list( - # 5-year DGEN pairs - list(npast = "DGEN_past_5_Pref", nfut = "DGEN_fut_5_Pref", target = 
"ehiDGEN_5_Pref"), - list(npast = "DGEN_past_5_Pers", nfut = "DGEN_fut_5_Pers", target = "ehiDGEN_5_Pers"), - list(npast = "DGEN_past_5_Val", nfut = "DGEN_fut_5_Val", target = "ehiDGEN_5_Val"), - - # 10-year DGEN pairs - list(npast = "DGEN_past_10_Pref", nfut = "DGEN_fut_10_Pref", target = "ehiDGEN_10_Pref"), - list(npast = "DGEN_past_10_Pers", nfut = "DGEN_fut_10_Pers", target = "ehiDGEN_10_Pers"), - list(npast = "DGEN_past_10_Val", nfut = "DGEN_fut_10_Val", target = "ehiDGEN_10_Val") -) - -all_checks_passed <- TRUE - -for (pair in qa_pairs) { - # Calculate expected difference - expected_diff <- data[[pair$npast]] - data[[pair$nfut]] - - # Get actual value in target variable - actual_value <- data[[pair$target]] - - # Compare (allowing for floating point precision issues) - discrepancies <- which(abs(expected_diff - actual_value) > 1e-10) - - if (length(discrepancies) > 0) { - cat(sprintf("FAIL: %s\n", pair$target)) - cat(sprintf(" Found %d discrepancies in rows: %s\n", - length(discrepancies), - paste(head(discrepancies, 10), collapse = ", "))) - - # Show first discrepancy details - row_num <- discrepancies[1] - cat(sprintf(" Example (row %d): %s (%g) - %s (%g) = %g, but %s = %g\n", - row_num, - pair$npast, data[[pair$npast]][row_num], - pair$nfut, data[[pair$nfut]][row_num], - expected_diff[row_num], - pair$target, actual_value[row_num])) - all_checks_passed <- FALSE - } else { - cat(sprintf("PASS: %s (n = %d)\n", pair$target, nrow(data))) - } -} - -cat("\n") -if (all_checks_passed) { - cat("*** ALL QA CHECKS PASSED ***\n") -} else { - cat("*** SOME QA CHECKS FAILED - REVIEW ABOVE ***\n") -} - -# Save updated dataset -write.csv(data, "eohi2.csv", row.names = FALSE) -cat("\nDataset saved to eohi2.csv\n") \ No newline at end of file diff --git a/.history/eohi2/datap 12 - CORRECT DGEN ehi vars_20251008164735.r b/.history/eohi2/datap 12 - CORRECT DGEN ehi vars_20251008164735.r deleted file mode 100644 index 5274cdd..0000000 --- a/.history/eohi2/datap 12 - CORRECT DGEN ehi vars_20251008164735.r +++ /dev/null @@ -1,118 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load data -data <- read.csv("eohi2.csv") - -# Create DGEN EHI difference variables (Past - Future) for different time intervals - -# === 5-YEAR DGEN PAST-FUTURE PAIRS === -data$ehiDGEN_5_Pref <- data$DGEN_past_5_Pref - data$DGEN_fut_5_Pref -data$ehiDGEN_5_Pers <- data$DGEN_past_5_Pers - data$DGEN_fut_5_Pers -data$ehiDGEN_5_Val <- data$DGEN_past_5_Val - data$DGEN_fut_5_Val - -# === 10-YEAR DGEN PAST-FUTURE PAIRS === -data$ehiDGEN_10_Pref <- data$DGEN_past_10_Pref - data$DGEN_fut_10_Pref -data$ehiDGEN_10_Pers <- data$DGEN_past_10_Pers - data$DGEN_fut_10_Pers -data$ehiDGEN_10_Val <- data$DGEN_past_10_Val - data$DGEN_fut_10_Val - -# QA: Verify calculations - FIRST 5 ROWS with detailed output -cat("\n=== QUALITY ASSURANCE CHECK - FIRST 5 ROWS ===\n\n") - -cat("--- 5-YEAR DGEN VARIABLES ---\n") -for (i in 1:5) { - cat(sprintf("\nRow %d:\n", i)) - cat(sprintf(" Pref: %g - %g = %g | ehiDGEN_5_Pref = %g %s\n", - data$DGEN_past_5_Pref[i], data$DGEN_fut_5_Pref[i], - data$DGEN_past_5_Pref[i] - data$DGEN_fut_5_Pref[i], - data$ehiDGEN_5_Pref[i], - ifelse(abs((data$DGEN_past_5_Pref[i] - data$DGEN_fut_5_Pref[i]) - data$ehiDGEN_5_Pref[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" Pers: %g - %g = %g | ehiDGEN_5_Pers = %g %s\n", - data$DGEN_past_5_Pers[i], data$DGEN_fut_5_Pers[i], - data$DGEN_past_5_Pers[i] - data$DGEN_fut_5_Pers[i], - data$ehiDGEN_5_Pers[i], - ifelse(abs((data$DGEN_past_5_Pers[i] - 
data$DGEN_fut_5_Pers[i]) - data$ehiDGEN_5_Pers[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" Val: %g - %g = %g | ehiDGEN_5_Val = %g %s\n", - data$DGEN_past_5_Val[i], data$DGEN_fut_5_Val[i], - data$DGEN_past_5_Val[i] - data$DGEN_fut_5_Val[i], - data$ehiDGEN_5_Val[i], - ifelse(abs((data$DGEN_past_5_Val[i] - data$DGEN_fut_5_Val[i]) - data$ehiDGEN_5_Val[i]) < 1e-10, "✓", "✗"))) -} - -cat("\n--- 10-YEAR DGEN VARIABLES ---\n") -for (i in 1:5) { - cat(sprintf("\nRow %d:\n", i)) - cat(sprintf(" Pref: %g - %g = %g | ehiDGEN_10_Pref = %g %s\n", - data$DGEN_past_10_Pref[i], data$DGEN_fut_10_Pref[i], - data$DGEN_past_10_Pref[i] - data$DGEN_fut_10_Pref[i], - data$ehiDGEN_10_Pref[i], - ifelse(abs((data$DGEN_past_10_Pref[i] - data$DGEN_fut_10_Pref[i]) - data$ehiDGEN_10_Pref[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" Pers: %g - %g = %g | ehiDGEN_10_Pers = %g %s\n", - data$DGEN_past_10_Pers[i], data$DGEN_fut_10_Pers[i], - data$DGEN_past_10_Pers[i] - data$DGEN_fut_10_Pers[i], - data$ehiDGEN_10_Pers[i], - ifelse(abs((data$DGEN_past_10_Pers[i] - data$DGEN_fut_10_Pers[i]) - data$ehiDGEN_10_Pers[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" Val: %g - %g = %g | ehiDGEN_10_Val = %g %s\n", - data$DGEN_past_10_Val[i], data$DGEN_fut_10_Val[i], - data$DGEN_past_10_Val[i] - data$DGEN_fut_10_Val[i], - data$ehiDGEN_10_Val[i], - ifelse(abs((data$DGEN_past_10_Val[i] - data$DGEN_fut_10_Val[i]) - data$ehiDGEN_10_Val[i]) < 1e-10, "✓", "✗"))) -} - -# Full QA check for all rows and all variables -cat("\n\n=== OVERALL QA CHECK (ALL ROWS, ALL VARIABLES) ===\n") - -qa_pairs <- list( - # 5-year DGEN pairs - list(npast = "DGEN_past_5_Pref", nfut = "DGEN_fut_5_Pref", target = "ehiDGEN_5_Pref"), - list(npast = "DGEN_past_5_Pers", nfut = "DGEN_fut_5_Pers", target = "ehiDGEN_5_Pers"), - list(npast = "DGEN_past_5_Val", nfut = "DGEN_fut_5_Val", target = "ehiDGEN_5_Val"), - - # 10-year DGEN pairs - list(npast = "DGEN_past_10_Pref", nfut = "DGEN_fut_10_Pref", target = "ehiDGEN_10_Pref"), - list(npast = "DGEN_past_10_Pers", nfut = "DGEN_fut_10_Pers", target = "ehiDGEN_10_Pers"), - list(npast = "DGEN_past_10_Val", nfut = "DGEN_fut_10_Val", target = "ehiDGEN_10_Val") -) - -all_checks_passed <- TRUE - -for (pair in qa_pairs) { - # Calculate expected difference - expected_diff <- data[[pair$npast]] - data[[pair$nfut]] - - # Get actual value in target variable - actual_value <- data[[pair$target]] - - # Compare (allowing for floating point precision issues) - discrepancies <- which(abs(expected_diff - actual_value) > 1e-10) - - if (length(discrepancies) > 0) { - cat(sprintf("FAIL: %s\n", pair$target)) - cat(sprintf(" Found %d discrepancies in rows: %s\n", - length(discrepancies), - paste(head(discrepancies, 10), collapse = ", "))) - - # Show first discrepancy details - row_num <- discrepancies[1] - cat(sprintf(" Example (row %d): %s (%g) - %s (%g) = %g, but %s = %g\n", - row_num, - pair$npast, data[[pair$npast]][row_num], - pair$nfut, data[[pair$nfut]][row_num], - expected_diff[row_num], - pair$target, actual_value[row_num])) - all_checks_passed <- FALSE - } else { - cat(sprintf("PASS: %s (n = %d)\n", pair$target, nrow(data))) - } -} - -cat("\n") -if (all_checks_passed) { - cat("*** ALL QA CHECKS PASSED ***\n") -} else { - cat("*** SOME QA CHECKS FAILED - REVIEW ABOVE ***\n") -} - -# Save updated dataset -write.csv(data, "eohi2.csv", row.names = FALSE) -cat("\nDataset saved to eohi2.csv\n") \ No newline at end of file diff --git a/.history/eohi2/datap 12 - CORRECT DGEN ehi vars_20251008164934.r b/.history/eohi2/datap 12 - CORRECT DGEN ehi 
vars_20251008164934.r deleted file mode 100644 index 5274cdd..0000000 --- a/.history/eohi2/datap 12 - CORRECT DGEN ehi vars_20251008164934.r +++ /dev/null @@ -1,118 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load data -data <- read.csv("eohi2.csv") - -# Create DGEN EHI difference variables (Past - Future) for different time intervals - -# === 5-YEAR DGEN PAST-FUTURE PAIRS === -data$ehiDGEN_5_Pref <- data$DGEN_past_5_Pref - data$DGEN_fut_5_Pref -data$ehiDGEN_5_Pers <- data$DGEN_past_5_Pers - data$DGEN_fut_5_Pers -data$ehiDGEN_5_Val <- data$DGEN_past_5_Val - data$DGEN_fut_5_Val - -# === 10-YEAR DGEN PAST-FUTURE PAIRS === -data$ehiDGEN_10_Pref <- data$DGEN_past_10_Pref - data$DGEN_fut_10_Pref -data$ehiDGEN_10_Pers <- data$DGEN_past_10_Pers - data$DGEN_fut_10_Pers -data$ehiDGEN_10_Val <- data$DGEN_past_10_Val - data$DGEN_fut_10_Val - -# QA: Verify calculations - FIRST 5 ROWS with detailed output -cat("\n=== QUALITY ASSURANCE CHECK - FIRST 5 ROWS ===\n\n") - -cat("--- 5-YEAR DGEN VARIABLES ---\n") -for (i in 1:5) { - cat(sprintf("\nRow %d:\n", i)) - cat(sprintf(" Pref: %g - %g = %g | ehiDGEN_5_Pref = %g %s\n", - data$DGEN_past_5_Pref[i], data$DGEN_fut_5_Pref[i], - data$DGEN_past_5_Pref[i] - data$DGEN_fut_5_Pref[i], - data$ehiDGEN_5_Pref[i], - ifelse(abs((data$DGEN_past_5_Pref[i] - data$DGEN_fut_5_Pref[i]) - data$ehiDGEN_5_Pref[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" Pers: %g - %g = %g | ehiDGEN_5_Pers = %g %s\n", - data$DGEN_past_5_Pers[i], data$DGEN_fut_5_Pers[i], - data$DGEN_past_5_Pers[i] - data$DGEN_fut_5_Pers[i], - data$ehiDGEN_5_Pers[i], - ifelse(abs((data$DGEN_past_5_Pers[i] - data$DGEN_fut_5_Pers[i]) - data$ehiDGEN_5_Pers[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" Val: %g - %g = %g | ehiDGEN_5_Val = %g %s\n", - data$DGEN_past_5_Val[i], data$DGEN_fut_5_Val[i], - data$DGEN_past_5_Val[i] - data$DGEN_fut_5_Val[i], - data$ehiDGEN_5_Val[i], - ifelse(abs((data$DGEN_past_5_Val[i] - data$DGEN_fut_5_Val[i]) - data$ehiDGEN_5_Val[i]) < 1e-10, "✓", "✗"))) -} - -cat("\n--- 10-YEAR DGEN VARIABLES ---\n") -for (i in 1:5) { - cat(sprintf("\nRow %d:\n", i)) - cat(sprintf(" Pref: %g - %g = %g | ehiDGEN_10_Pref = %g %s\n", - data$DGEN_past_10_Pref[i], data$DGEN_fut_10_Pref[i], - data$DGEN_past_10_Pref[i] - data$DGEN_fut_10_Pref[i], - data$ehiDGEN_10_Pref[i], - ifelse(abs((data$DGEN_past_10_Pref[i] - data$DGEN_fut_10_Pref[i]) - data$ehiDGEN_10_Pref[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" Pers: %g - %g = %g | ehiDGEN_10_Pers = %g %s\n", - data$DGEN_past_10_Pers[i], data$DGEN_fut_10_Pers[i], - data$DGEN_past_10_Pers[i] - data$DGEN_fut_10_Pers[i], - data$ehiDGEN_10_Pers[i], - ifelse(abs((data$DGEN_past_10_Pers[i] - data$DGEN_fut_10_Pers[i]) - data$ehiDGEN_10_Pers[i]) < 1e-10, "✓", "✗"))) - cat(sprintf(" Val: %g - %g = %g | ehiDGEN_10_Val = %g %s\n", - data$DGEN_past_10_Val[i], data$DGEN_fut_10_Val[i], - data$DGEN_past_10_Val[i] - data$DGEN_fut_10_Val[i], - data$ehiDGEN_10_Val[i], - ifelse(abs((data$DGEN_past_10_Val[i] - data$DGEN_fut_10_Val[i]) - data$ehiDGEN_10_Val[i]) < 1e-10, "✓", "✗"))) -} - -# Full QA check for all rows and all variables -cat("\n\n=== OVERALL QA CHECK (ALL ROWS, ALL VARIABLES) ===\n") - -qa_pairs <- list( - # 5-year DGEN pairs - list(npast = "DGEN_past_5_Pref", nfut = "DGEN_fut_5_Pref", target = "ehiDGEN_5_Pref"), - list(npast = "DGEN_past_5_Pers", nfut = "DGEN_fut_5_Pers", target = "ehiDGEN_5_Pers"), - list(npast = "DGEN_past_5_Val", nfut = "DGEN_fut_5_Val", target = "ehiDGEN_5_Val"), - - # 10-year DGEN pairs - list(npast = "DGEN_past_10_Pref", 
nfut = "DGEN_fut_10_Pref", target = "ehiDGEN_10_Pref"), - list(npast = "DGEN_past_10_Pers", nfut = "DGEN_fut_10_Pers", target = "ehiDGEN_10_Pers"), - list(npast = "DGEN_past_10_Val", nfut = "DGEN_fut_10_Val", target = "ehiDGEN_10_Val") -) - -all_checks_passed <- TRUE - -for (pair in qa_pairs) { - # Calculate expected difference - expected_diff <- data[[pair$npast]] - data[[pair$nfut]] - - # Get actual value in target variable - actual_value <- data[[pair$target]] - - # Compare (allowing for floating point precision issues) - discrepancies <- which(abs(expected_diff - actual_value) > 1e-10) - - if (length(discrepancies) > 0) { - cat(sprintf("FAIL: %s\n", pair$target)) - cat(sprintf(" Found %d discrepancies in rows: %s\n", - length(discrepancies), - paste(head(discrepancies, 10), collapse = ", "))) - - # Show first discrepancy details - row_num <- discrepancies[1] - cat(sprintf(" Example (row %d): %s (%g) - %s (%g) = %g, but %s = %g\n", - row_num, - pair$npast, data[[pair$npast]][row_num], - pair$nfut, data[[pair$nfut]][row_num], - expected_diff[row_num], - pair$target, actual_value[row_num])) - all_checks_passed <- FALSE - } else { - cat(sprintf("PASS: %s (n = %d)\n", pair$target, nrow(data))) - } -} - -cat("\n") -if (all_checks_passed) { - cat("*** ALL QA CHECKS PASSED ***\n") -} else { - cat("*** SOME QA CHECKS FAILED - REVIEW ABOVE ***\n") -} - -# Save updated dataset -write.csv(data, "eohi2.csv", row.names = FALSE) -cat("\nDataset saved to eohi2.csv\n") \ No newline at end of file diff --git a/.history/eohi2/datap 13 - ehi domain specific means_20251008165421.r b/.history/eohi2/datap 13 - ehi domain specific means_20251008165421.r deleted file mode 100644 index e655492..0000000 --- a/.history/eohi2/datap 13 - ehi domain specific means_20251008165421.r +++ /dev/null @@ -1,161 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load data -data <- read.csv("eohi2.csv") - -# Calculate domain-specific mean scores for EHI variables across time intervals - -# === 5-YEAR MEANS === -data$ehi5_pref_MEAN <- rowMeans(data[, c("ehi5_pref_read", "ehi5_pref_music", - "ehi5_pref_TV", "ehi5_pref_nap", - "ehi5_pref_travel")], na.rm = TRUE) - -data$ehi5_pers_MEAN <- rowMeans(data[, c("ehi5_pers_extravert", "ehi5_pers_critical", - "ehi5_pers_dependable", "ehi5_pers_anxious", - "ehi5_pers_complex")], na.rm = TRUE) - -data$ehi5_val_MEAN <- rowMeans(data[, c("ehi5_val_obey", "ehi5_val_trad", - "ehi5_val_opinion", "ehi5_val_performance", - "ehi5_val_justice")], na.rm = TRUE) - -# === 10-YEAR MEANS === -data$ehi10_pref_MEAN <- rowMeans(data[, c("ehi10_pref_read", "ehi10_pref_music", - "ehi10_pref_TV", "ehi10_pref_nap", - "ehi10_pref_travel")], na.rm = TRUE) - -data$ehi10_pers_MEAN <- rowMeans(data[, c("ehi10_pers_extravert", "ehi10_pers_critical", - "ehi10_pers_dependable", "ehi10_pers_anxious", - "ehi10_pers_complex")], na.rm = TRUE) - -data$ehi10_val_MEAN <- rowMeans(data[, c("ehi10_val_obey", "ehi10_val_trad", - "ehi10_val_opinion", "ehi10_val_performance", - "ehi10_val_justice")], na.rm = TRUE) - -# === 5-10 YEAR CHANGE MEANS === -data$ehi5.10_pref_MEAN <- rowMeans(data[, c("ehi5.10_pref_read", "ehi5.10_pref_music", - "ehi5.10_pref_TV", "ehi5.10_pref_nap", - "ehi5.10_pref_travel")], na.rm = TRUE) - -data$ehi5.10_pers_MEAN <- rowMeans(data[, c("ehi5.10_pers_extravert", "ehi5.10_pers_critical", - "ehi5.10_pers_dependable", "ehi5.10_pers_anxious", - "ehi5.10_pers_complex")], na.rm = TRUE) - -data$ehi5.10_val_MEAN <- rowMeans(data[, c("ehi5.10_val_obey", "ehi5.10_val_trad", - 
"ehi5.10_val_opinion", "ehi5.10_val_performance", - "ehi5.10_val_justice")], na.rm = TRUE) - -# QA: Verify mean calculations -cat("\n=== QUALITY ASSURANCE CHECK ===\n") -cat("Verifying EHI domain-specific mean calculations\n\n") - -cat("--- FIRST 5 ROWS: 5-YEAR PREFERENCES MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehi5_pref_read[i], data$ehi5_pref_music[i], - data$ehi5_pref_TV[i], data$ehi5_pref_nap[i], - data$ehi5_pref_travel[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehi5_pref_MEAN[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%g, %g, %g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], vals[4], vals[5], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -cat("\n--- FIRST 5 ROWS: 5-YEAR PERSONALITY MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehi5_pers_extravert[i], data$ehi5_pers_critical[i], - data$ehi5_pers_dependable[i], data$ehi5_pers_anxious[i], - data$ehi5_pers_complex[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehi5_pers_MEAN[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%g, %g, %g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], vals[4], vals[5], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -cat("\n--- FIRST 5 ROWS: 10-YEAR PREFERENCES MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehi10_pref_read[i], data$ehi10_pref_music[i], - data$ehi10_pref_TV[i], data$ehi10_pref_nap[i], - data$ehi10_pref_travel[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehi10_pref_MEAN[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%g, %g, %g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], vals[4], vals[5], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -cat("\n--- FIRST 5 ROWS: 5-10 YEAR CHANGE PREFERENCES MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehi5.10_pref_read[i], data$ehi5.10_pref_music[i], - data$ehi5.10_pref_TV[i], data$ehi5.10_pref_nap[i], - data$ehi5.10_pref_travel[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehi5.10_pref_MEAN[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%g, %g, %g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], vals[4], vals[5], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -# Overall QA check for all rows -cat("\n--- OVERALL QA CHECK (ALL ROWS) ---\n") - -qa_checks <- list( - # 5-year means - list(vars = c("ehi5_pref_read", "ehi5_pref_music", "ehi5_pref_TV", "ehi5_pref_nap", "ehi5_pref_travel"), - target = "ehi5_pref_MEAN", name = "5-Year Preferences"), - list(vars = c("ehi5_pers_extravert", "ehi5_pers_critical", "ehi5_pers_dependable", "ehi5_pers_anxious", "ehi5_pers_complex"), - target = "ehi5_pers_MEAN", name = "5-Year Personality"), - list(vars = c("ehi5_val_obey", "ehi5_val_trad", "ehi5_val_opinion", "ehi5_val_performance", "ehi5_val_justice"), - target = "ehi5_val_MEAN", name = "5-Year Values"), - - # 10-year means - list(vars = c("ehi10_pref_read", "ehi10_pref_music", "ehi10_pref_TV", "ehi10_pref_nap", "ehi10_pref_travel"), - target = "ehi10_pref_MEAN", name = "10-Year Preferences"), - list(vars = c("ehi10_pers_extravert", "ehi10_pers_critical", "ehi10_pers_dependable", "ehi10_pers_anxious", "ehi10_pers_complex"), - target = "ehi10_pers_MEAN", name = "10-Year Personality"), - list(vars = c("ehi10_val_obey", "ehi10_val_trad", "ehi10_val_opinion", "ehi10_val_performance", 
"ehi10_val_justice"), - target = "ehi10_val_MEAN", name = "10-Year Values"), - - # 5-10 year change means - list(vars = c("ehi5.10_pref_read", "ehi5.10_pref_music", "ehi5.10_pref_TV", "ehi5.10_pref_nap", "ehi5.10_pref_travel"), - target = "ehi5.10_pref_MEAN", name = "5-10 Year Change Preferences"), - list(vars = c("ehi5.10_pers_extravert", "ehi5.10_pers_critical", "ehi5.10_pers_dependable", "ehi5.10_pers_anxious", "ehi5.10_pers_complex"), - target = "ehi5.10_pers_MEAN", name = "5-10 Year Change Personality"), - list(vars = c("ehi5.10_val_obey", "ehi5.10_val_trad", "ehi5.10_val_opinion", "ehi5.10_val_performance", "ehi5.10_val_justice"), - target = "ehi5.10_val_MEAN", name = "5-10 Year Change Values") -) - -all_checks_passed <- TRUE - -for (check in qa_checks) { - calc_mean <- rowMeans(data[, check$vars], na.rm = TRUE) - actual_mean <- data[[check$target]] - discrepancies <- which(abs(calc_mean - actual_mean) > 1e-10) - - if (length(discrepancies) > 0) { - cat(sprintf("FAIL: %s mean (n_vars = %d)\n", check$name, length(check$vars))) - cat(sprintf(" Found %d discrepancies in rows: %s\n", - length(discrepancies), - paste(head(discrepancies, 10), collapse = ", "))) - all_checks_passed <- FALSE - } else { - cat(sprintf("PASS: %s mean (n_vars = %d, n_rows = %d)\n", - check$name, length(check$vars), nrow(data))) - } -} - -cat("\n") -if (all_checks_passed) { - cat("*** ALL QA CHECKS PASSED ***\n") -} else { - cat("*** SOME QA CHECKS FAILED - REVIEW ABOVE ***\n") -} - -# Save updated dataset -write.csv(data, "eohi2.csv", row.names = FALSE) -cat("\nDataset saved to eohi2.csv\n") \ No newline at end of file diff --git a/.history/eohi2/datap 13 - ehi domain specific means_20251008165443.r b/.history/eohi2/datap 13 - ehi domain specific means_20251008165443.r deleted file mode 100644 index e655492..0000000 --- a/.history/eohi2/datap 13 - ehi domain specific means_20251008165443.r +++ /dev/null @@ -1,161 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load data -data <- read.csv("eohi2.csv") - -# Calculate domain-specific mean scores for EHI variables across time intervals - -# === 5-YEAR MEANS === -data$ehi5_pref_MEAN <- rowMeans(data[, c("ehi5_pref_read", "ehi5_pref_music", - "ehi5_pref_TV", "ehi5_pref_nap", - "ehi5_pref_travel")], na.rm = TRUE) - -data$ehi5_pers_MEAN <- rowMeans(data[, c("ehi5_pers_extravert", "ehi5_pers_critical", - "ehi5_pers_dependable", "ehi5_pers_anxious", - "ehi5_pers_complex")], na.rm = TRUE) - -data$ehi5_val_MEAN <- rowMeans(data[, c("ehi5_val_obey", "ehi5_val_trad", - "ehi5_val_opinion", "ehi5_val_performance", - "ehi5_val_justice")], na.rm = TRUE) - -# === 10-YEAR MEANS === -data$ehi10_pref_MEAN <- rowMeans(data[, c("ehi10_pref_read", "ehi10_pref_music", - "ehi10_pref_TV", "ehi10_pref_nap", - "ehi10_pref_travel")], na.rm = TRUE) - -data$ehi10_pers_MEAN <- rowMeans(data[, c("ehi10_pers_extravert", "ehi10_pers_critical", - "ehi10_pers_dependable", "ehi10_pers_anxious", - "ehi10_pers_complex")], na.rm = TRUE) - -data$ehi10_val_MEAN <- rowMeans(data[, c("ehi10_val_obey", "ehi10_val_trad", - "ehi10_val_opinion", "ehi10_val_performance", - "ehi10_val_justice")], na.rm = TRUE) - -# === 5-10 YEAR CHANGE MEANS === -data$ehi5.10_pref_MEAN <- rowMeans(data[, c("ehi5.10_pref_read", "ehi5.10_pref_music", - "ehi5.10_pref_TV", "ehi5.10_pref_nap", - "ehi5.10_pref_travel")], na.rm = TRUE) - -data$ehi5.10_pers_MEAN <- rowMeans(data[, c("ehi5.10_pers_extravert", "ehi5.10_pers_critical", - "ehi5.10_pers_dependable", "ehi5.10_pers_anxious", - 
"ehi5.10_pers_complex")], na.rm = TRUE) - -data$ehi5.10_val_MEAN <- rowMeans(data[, c("ehi5.10_val_obey", "ehi5.10_val_trad", - "ehi5.10_val_opinion", "ehi5.10_val_performance", - "ehi5.10_val_justice")], na.rm = TRUE) - -# QA: Verify mean calculations -cat("\n=== QUALITY ASSURANCE CHECK ===\n") -cat("Verifying EHI domain-specific mean calculations\n\n") - -cat("--- FIRST 5 ROWS: 5-YEAR PREFERENCES MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehi5_pref_read[i], data$ehi5_pref_music[i], - data$ehi5_pref_TV[i], data$ehi5_pref_nap[i], - data$ehi5_pref_travel[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehi5_pref_MEAN[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%g, %g, %g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], vals[4], vals[5], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -cat("\n--- FIRST 5 ROWS: 5-YEAR PERSONALITY MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehi5_pers_extravert[i], data$ehi5_pers_critical[i], - data$ehi5_pers_dependable[i], data$ehi5_pers_anxious[i], - data$ehi5_pers_complex[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehi5_pers_MEAN[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%g, %g, %g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], vals[4], vals[5], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -cat("\n--- FIRST 5 ROWS: 10-YEAR PREFERENCES MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehi10_pref_read[i], data$ehi10_pref_music[i], - data$ehi10_pref_TV[i], data$ehi10_pref_nap[i], - data$ehi10_pref_travel[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehi10_pref_MEAN[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%g, %g, %g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], vals[4], vals[5], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -cat("\n--- FIRST 5 ROWS: 5-10 YEAR CHANGE PREFERENCES MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehi5.10_pref_read[i], data$ehi5.10_pref_music[i], - data$ehi5.10_pref_TV[i], data$ehi5.10_pref_nap[i], - data$ehi5.10_pref_travel[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehi5.10_pref_MEAN[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%g, %g, %g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], vals[4], vals[5], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -# Overall QA check for all rows -cat("\n--- OVERALL QA CHECK (ALL ROWS) ---\n") - -qa_checks <- list( - # 5-year means - list(vars = c("ehi5_pref_read", "ehi5_pref_music", "ehi5_pref_TV", "ehi5_pref_nap", "ehi5_pref_travel"), - target = "ehi5_pref_MEAN", name = "5-Year Preferences"), - list(vars = c("ehi5_pers_extravert", "ehi5_pers_critical", "ehi5_pers_dependable", "ehi5_pers_anxious", "ehi5_pers_complex"), - target = "ehi5_pers_MEAN", name = "5-Year Personality"), - list(vars = c("ehi5_val_obey", "ehi5_val_trad", "ehi5_val_opinion", "ehi5_val_performance", "ehi5_val_justice"), - target = "ehi5_val_MEAN", name = "5-Year Values"), - - # 10-year means - list(vars = c("ehi10_pref_read", "ehi10_pref_music", "ehi10_pref_TV", "ehi10_pref_nap", "ehi10_pref_travel"), - target = "ehi10_pref_MEAN", name = "10-Year Preferences"), - list(vars = c("ehi10_pers_extravert", "ehi10_pers_critical", "ehi10_pers_dependable", "ehi10_pers_anxious", "ehi10_pers_complex"), - target = "ehi10_pers_MEAN", name = "10-Year 
Personality"), - list(vars = c("ehi10_val_obey", "ehi10_val_trad", "ehi10_val_opinion", "ehi10_val_performance", "ehi10_val_justice"), - target = "ehi10_val_MEAN", name = "10-Year Values"), - - # 5-10 year change means - list(vars = c("ehi5.10_pref_read", "ehi5.10_pref_music", "ehi5.10_pref_TV", "ehi5.10_pref_nap", "ehi5.10_pref_travel"), - target = "ehi5.10_pref_MEAN", name = "5-10 Year Change Preferences"), - list(vars = c("ehi5.10_pers_extravert", "ehi5.10_pers_critical", "ehi5.10_pers_dependable", "ehi5.10_pers_anxious", "ehi5.10_pers_complex"), - target = "ehi5.10_pers_MEAN", name = "5-10 Year Change Personality"), - list(vars = c("ehi5.10_val_obey", "ehi5.10_val_trad", "ehi5.10_val_opinion", "ehi5.10_val_performance", "ehi5.10_val_justice"), - target = "ehi5.10_val_MEAN", name = "5-10 Year Change Values") -) - -all_checks_passed <- TRUE - -for (check in qa_checks) { - calc_mean <- rowMeans(data[, check$vars], na.rm = TRUE) - actual_mean <- data[[check$target]] - discrepancies <- which(abs(calc_mean - actual_mean) > 1e-10) - - if (length(discrepancies) > 0) { - cat(sprintf("FAIL: %s mean (n_vars = %d)\n", check$name, length(check$vars))) - cat(sprintf(" Found %d discrepancies in rows: %s\n", - length(discrepancies), - paste(head(discrepancies, 10), collapse = ", "))) - all_checks_passed <- FALSE - } else { - cat(sprintf("PASS: %s mean (n_vars = %d, n_rows = %d)\n", - check$name, length(check$vars), nrow(data))) - } -} - -cat("\n") -if (all_checks_passed) { - cat("*** ALL QA CHECKS PASSED ***\n") -} else { - cat("*** SOME QA CHECKS FAILED - REVIEW ABOVE ***\n") -} - -# Save updated dataset -write.csv(data, "eohi2.csv", row.names = FALSE) -cat("\nDataset saved to eohi2.csv\n") \ No newline at end of file diff --git a/.history/eohi2/datap 13 - ehi domain specific means_20251008165448.r b/.history/eohi2/datap 13 - ehi domain specific means_20251008165448.r deleted file mode 100644 index e655492..0000000 --- a/.history/eohi2/datap 13 - ehi domain specific means_20251008165448.r +++ /dev/null @@ -1,161 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load data -data <- read.csv("eohi2.csv") - -# Calculate domain-specific mean scores for EHI variables across time intervals - -# === 5-YEAR MEANS === -data$ehi5_pref_MEAN <- rowMeans(data[, c("ehi5_pref_read", "ehi5_pref_music", - "ehi5_pref_TV", "ehi5_pref_nap", - "ehi5_pref_travel")], na.rm = TRUE) - -data$ehi5_pers_MEAN <- rowMeans(data[, c("ehi5_pers_extravert", "ehi5_pers_critical", - "ehi5_pers_dependable", "ehi5_pers_anxious", - "ehi5_pers_complex")], na.rm = TRUE) - -data$ehi5_val_MEAN <- rowMeans(data[, c("ehi5_val_obey", "ehi5_val_trad", - "ehi5_val_opinion", "ehi5_val_performance", - "ehi5_val_justice")], na.rm = TRUE) - -# === 10-YEAR MEANS === -data$ehi10_pref_MEAN <- rowMeans(data[, c("ehi10_pref_read", "ehi10_pref_music", - "ehi10_pref_TV", "ehi10_pref_nap", - "ehi10_pref_travel")], na.rm = TRUE) - -data$ehi10_pers_MEAN <- rowMeans(data[, c("ehi10_pers_extravert", "ehi10_pers_critical", - "ehi10_pers_dependable", "ehi10_pers_anxious", - "ehi10_pers_complex")], na.rm = TRUE) - -data$ehi10_val_MEAN <- rowMeans(data[, c("ehi10_val_obey", "ehi10_val_trad", - "ehi10_val_opinion", "ehi10_val_performance", - "ehi10_val_justice")], na.rm = TRUE) - -# === 5-10 YEAR CHANGE MEANS === -data$ehi5.10_pref_MEAN <- rowMeans(data[, c("ehi5.10_pref_read", "ehi5.10_pref_music", - "ehi5.10_pref_TV", "ehi5.10_pref_nap", - "ehi5.10_pref_travel")], na.rm = TRUE) - -data$ehi5.10_pers_MEAN <- rowMeans(data[, 
c("ehi5.10_pers_extravert", "ehi5.10_pers_critical", - "ehi5.10_pers_dependable", "ehi5.10_pers_anxious", - "ehi5.10_pers_complex")], na.rm = TRUE) - -data$ehi5.10_val_MEAN <- rowMeans(data[, c("ehi5.10_val_obey", "ehi5.10_val_trad", - "ehi5.10_val_opinion", "ehi5.10_val_performance", - "ehi5.10_val_justice")], na.rm = TRUE) - -# QA: Verify mean calculations -cat("\n=== QUALITY ASSURANCE CHECK ===\n") -cat("Verifying EHI domain-specific mean calculations\n\n") - -cat("--- FIRST 5 ROWS: 5-YEAR PREFERENCES MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehi5_pref_read[i], data$ehi5_pref_music[i], - data$ehi5_pref_TV[i], data$ehi5_pref_nap[i], - data$ehi5_pref_travel[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehi5_pref_MEAN[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%g, %g, %g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], vals[4], vals[5], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -cat("\n--- FIRST 5 ROWS: 5-YEAR PERSONALITY MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehi5_pers_extravert[i], data$ehi5_pers_critical[i], - data$ehi5_pers_dependable[i], data$ehi5_pers_anxious[i], - data$ehi5_pers_complex[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehi5_pers_MEAN[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%g, %g, %g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], vals[4], vals[5], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -cat("\n--- FIRST 5 ROWS: 10-YEAR PREFERENCES MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehi10_pref_read[i], data$ehi10_pref_music[i], - data$ehi10_pref_TV[i], data$ehi10_pref_nap[i], - data$ehi10_pref_travel[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehi10_pref_MEAN[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%g, %g, %g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], vals[4], vals[5], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -cat("\n--- FIRST 5 ROWS: 5-10 YEAR CHANGE PREFERENCES MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehi5.10_pref_read[i], data$ehi5.10_pref_music[i], - data$ehi5.10_pref_TV[i], data$ehi5.10_pref_nap[i], - data$ehi5.10_pref_travel[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehi5.10_pref_MEAN[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%g, %g, %g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], vals[4], vals[5], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -# Overall QA check for all rows -cat("\n--- OVERALL QA CHECK (ALL ROWS) ---\n") - -qa_checks <- list( - # 5-year means - list(vars = c("ehi5_pref_read", "ehi5_pref_music", "ehi5_pref_TV", "ehi5_pref_nap", "ehi5_pref_travel"), - target = "ehi5_pref_MEAN", name = "5-Year Preferences"), - list(vars = c("ehi5_pers_extravert", "ehi5_pers_critical", "ehi5_pers_dependable", "ehi5_pers_anxious", "ehi5_pers_complex"), - target = "ehi5_pers_MEAN", name = "5-Year Personality"), - list(vars = c("ehi5_val_obey", "ehi5_val_trad", "ehi5_val_opinion", "ehi5_val_performance", "ehi5_val_justice"), - target = "ehi5_val_MEAN", name = "5-Year Values"), - - # 10-year means - list(vars = c("ehi10_pref_read", "ehi10_pref_music", "ehi10_pref_TV", "ehi10_pref_nap", "ehi10_pref_travel"), - target = "ehi10_pref_MEAN", name = "10-Year Preferences"), - list(vars = c("ehi10_pers_extravert", "ehi10_pers_critical", 
"ehi10_pers_dependable", "ehi10_pers_anxious", "ehi10_pers_complex"), - target = "ehi10_pers_MEAN", name = "10-Year Personality"), - list(vars = c("ehi10_val_obey", "ehi10_val_trad", "ehi10_val_opinion", "ehi10_val_performance", "ehi10_val_justice"), - target = "ehi10_val_MEAN", name = "10-Year Values"), - - # 5-10 year change means - list(vars = c("ehi5.10_pref_read", "ehi5.10_pref_music", "ehi5.10_pref_TV", "ehi5.10_pref_nap", "ehi5.10_pref_travel"), - target = "ehi5.10_pref_MEAN", name = "5-10 Year Change Preferences"), - list(vars = c("ehi5.10_pers_extravert", "ehi5.10_pers_critical", "ehi5.10_pers_dependable", "ehi5.10_pers_anxious", "ehi5.10_pers_complex"), - target = "ehi5.10_pers_MEAN", name = "5-10 Year Change Personality"), - list(vars = c("ehi5.10_val_obey", "ehi5.10_val_trad", "ehi5.10_val_opinion", "ehi5.10_val_performance", "ehi5.10_val_justice"), - target = "ehi5.10_val_MEAN", name = "5-10 Year Change Values") -) - -all_checks_passed <- TRUE - -for (check in qa_checks) { - calc_mean <- rowMeans(data[, check$vars], na.rm = TRUE) - actual_mean <- data[[check$target]] - discrepancies <- which(abs(calc_mean - actual_mean) > 1e-10) - - if (length(discrepancies) > 0) { - cat(sprintf("FAIL: %s mean (n_vars = %d)\n", check$name, length(check$vars))) - cat(sprintf(" Found %d discrepancies in rows: %s\n", - length(discrepancies), - paste(head(discrepancies, 10), collapse = ", "))) - all_checks_passed <- FALSE - } else { - cat(sprintf("PASS: %s mean (n_vars = %d, n_rows = %d)\n", - check$name, length(check$vars), nrow(data))) - } -} - -cat("\n") -if (all_checks_passed) { - cat("*** ALL QA CHECKS PASSED ***\n") -} else { - cat("*** SOME QA CHECKS FAILED - REVIEW ABOVE ***\n") -} - -# Save updated dataset -write.csv(data, "eohi2.csv", row.names = FALSE) -cat("\nDataset saved to eohi2.csv\n") \ No newline at end of file diff --git a/.history/eohi2/datap 13 - ehi domain specific means_20251008165551.r b/.history/eohi2/datap 13 - ehi domain specific means_20251008165551.r deleted file mode 100644 index e655492..0000000 --- a/.history/eohi2/datap 13 - ehi domain specific means_20251008165551.r +++ /dev/null @@ -1,161 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load data -data <- read.csv("eohi2.csv") - -# Calculate domain-specific mean scores for EHI variables across time intervals - -# === 5-YEAR MEANS === -data$ehi5_pref_MEAN <- rowMeans(data[, c("ehi5_pref_read", "ehi5_pref_music", - "ehi5_pref_TV", "ehi5_pref_nap", - "ehi5_pref_travel")], na.rm = TRUE) - -data$ehi5_pers_MEAN <- rowMeans(data[, c("ehi5_pers_extravert", "ehi5_pers_critical", - "ehi5_pers_dependable", "ehi5_pers_anxious", - "ehi5_pers_complex")], na.rm = TRUE) - -data$ehi5_val_MEAN <- rowMeans(data[, c("ehi5_val_obey", "ehi5_val_trad", - "ehi5_val_opinion", "ehi5_val_performance", - "ehi5_val_justice")], na.rm = TRUE) - -# === 10-YEAR MEANS === -data$ehi10_pref_MEAN <- rowMeans(data[, c("ehi10_pref_read", "ehi10_pref_music", - "ehi10_pref_TV", "ehi10_pref_nap", - "ehi10_pref_travel")], na.rm = TRUE) - -data$ehi10_pers_MEAN <- rowMeans(data[, c("ehi10_pers_extravert", "ehi10_pers_critical", - "ehi10_pers_dependable", "ehi10_pers_anxious", - "ehi10_pers_complex")], na.rm = TRUE) - -data$ehi10_val_MEAN <- rowMeans(data[, c("ehi10_val_obey", "ehi10_val_trad", - "ehi10_val_opinion", "ehi10_val_performance", - "ehi10_val_justice")], na.rm = TRUE) - -# === 5-10 YEAR CHANGE MEANS === -data$ehi5.10_pref_MEAN <- rowMeans(data[, c("ehi5.10_pref_read", "ehi5.10_pref_music", - "ehi5.10_pref_TV", 
"ehi5.10_pref_nap", - "ehi5.10_pref_travel")], na.rm = TRUE) - -data$ehi5.10_pers_MEAN <- rowMeans(data[, c("ehi5.10_pers_extravert", "ehi5.10_pers_critical", - "ehi5.10_pers_dependable", "ehi5.10_pers_anxious", - "ehi5.10_pers_complex")], na.rm = TRUE) - -data$ehi5.10_val_MEAN <- rowMeans(data[, c("ehi5.10_val_obey", "ehi5.10_val_trad", - "ehi5.10_val_opinion", "ehi5.10_val_performance", - "ehi5.10_val_justice")], na.rm = TRUE) - -# QA: Verify mean calculations -cat("\n=== QUALITY ASSURANCE CHECK ===\n") -cat("Verifying EHI domain-specific mean calculations\n\n") - -cat("--- FIRST 5 ROWS: 5-YEAR PREFERENCES MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehi5_pref_read[i], data$ehi5_pref_music[i], - data$ehi5_pref_TV[i], data$ehi5_pref_nap[i], - data$ehi5_pref_travel[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehi5_pref_MEAN[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%g, %g, %g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], vals[4], vals[5], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -cat("\n--- FIRST 5 ROWS: 5-YEAR PERSONALITY MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehi5_pers_extravert[i], data$ehi5_pers_critical[i], - data$ehi5_pers_dependable[i], data$ehi5_pers_anxious[i], - data$ehi5_pers_complex[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehi5_pers_MEAN[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%g, %g, %g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], vals[4], vals[5], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -cat("\n--- FIRST 5 ROWS: 10-YEAR PREFERENCES MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehi10_pref_read[i], data$ehi10_pref_music[i], - data$ehi10_pref_TV[i], data$ehi10_pref_nap[i], - data$ehi10_pref_travel[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehi10_pref_MEAN[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%g, %g, %g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], vals[4], vals[5], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -cat("\n--- FIRST 5 ROWS: 5-10 YEAR CHANGE PREFERENCES MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehi5.10_pref_read[i], data$ehi5.10_pref_music[i], - data$ehi5.10_pref_TV[i], data$ehi5.10_pref_nap[i], - data$ehi5.10_pref_travel[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehi5.10_pref_MEAN[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%g, %g, %g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], vals[4], vals[5], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -# Overall QA check for all rows -cat("\n--- OVERALL QA CHECK (ALL ROWS) ---\n") - -qa_checks <- list( - # 5-year means - list(vars = c("ehi5_pref_read", "ehi5_pref_music", "ehi5_pref_TV", "ehi5_pref_nap", "ehi5_pref_travel"), - target = "ehi5_pref_MEAN", name = "5-Year Preferences"), - list(vars = c("ehi5_pers_extravert", "ehi5_pers_critical", "ehi5_pers_dependable", "ehi5_pers_anxious", "ehi5_pers_complex"), - target = "ehi5_pers_MEAN", name = "5-Year Personality"), - list(vars = c("ehi5_val_obey", "ehi5_val_trad", "ehi5_val_opinion", "ehi5_val_performance", "ehi5_val_justice"), - target = "ehi5_val_MEAN", name = "5-Year Values"), - - # 10-year means - list(vars = c("ehi10_pref_read", "ehi10_pref_music", "ehi10_pref_TV", "ehi10_pref_nap", "ehi10_pref_travel"), - target = 
"ehi10_pref_MEAN", name = "10-Year Preferences"), - list(vars = c("ehi10_pers_extravert", "ehi10_pers_critical", "ehi10_pers_dependable", "ehi10_pers_anxious", "ehi10_pers_complex"), - target = "ehi10_pers_MEAN", name = "10-Year Personality"), - list(vars = c("ehi10_val_obey", "ehi10_val_trad", "ehi10_val_opinion", "ehi10_val_performance", "ehi10_val_justice"), - target = "ehi10_val_MEAN", name = "10-Year Values"), - - # 5-10 year change means - list(vars = c("ehi5.10_pref_read", "ehi5.10_pref_music", "ehi5.10_pref_TV", "ehi5.10_pref_nap", "ehi5.10_pref_travel"), - target = "ehi5.10_pref_MEAN", name = "5-10 Year Change Preferences"), - list(vars = c("ehi5.10_pers_extravert", "ehi5.10_pers_critical", "ehi5.10_pers_dependable", "ehi5.10_pers_anxious", "ehi5.10_pers_complex"), - target = "ehi5.10_pers_MEAN", name = "5-10 Year Change Personality"), - list(vars = c("ehi5.10_val_obey", "ehi5.10_val_trad", "ehi5.10_val_opinion", "ehi5.10_val_performance", "ehi5.10_val_justice"), - target = "ehi5.10_val_MEAN", name = "5-10 Year Change Values") -) - -all_checks_passed <- TRUE - -for (check in qa_checks) { - calc_mean <- rowMeans(data[, check$vars], na.rm = TRUE) - actual_mean <- data[[check$target]] - discrepancies <- which(abs(calc_mean - actual_mean) > 1e-10) - - if (length(discrepancies) > 0) { - cat(sprintf("FAIL: %s mean (n_vars = %d)\n", check$name, length(check$vars))) - cat(sprintf(" Found %d discrepancies in rows: %s\n", - length(discrepancies), - paste(head(discrepancies, 10), collapse = ", "))) - all_checks_passed <- FALSE - } else { - cat(sprintf("PASS: %s mean (n_vars = %d, n_rows = %d)\n", - check$name, length(check$vars), nrow(data))) - } -} - -cat("\n") -if (all_checks_passed) { - cat("*** ALL QA CHECKS PASSED ***\n") -} else { - cat("*** SOME QA CHECKS FAILED - REVIEW ABOVE ***\n") -} - -# Save updated dataset -write.csv(data, "eohi2.csv", row.names = FALSE) -cat("\nDataset saved to eohi2.csv\n") \ No newline at end of file diff --git a/.history/eohi2/datap 13 - ehi domain specific means_20251008170838.r b/.history/eohi2/datap 13 - ehi domain specific means_20251008170838.r deleted file mode 100644 index e655492..0000000 --- a/.history/eohi2/datap 13 - ehi domain specific means_20251008170838.r +++ /dev/null @@ -1,161 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load data -data <- read.csv("eohi2.csv") - -# Calculate domain-specific mean scores for EHI variables across time intervals - -# === 5-YEAR MEANS === -data$ehi5_pref_MEAN <- rowMeans(data[, c("ehi5_pref_read", "ehi5_pref_music", - "ehi5_pref_TV", "ehi5_pref_nap", - "ehi5_pref_travel")], na.rm = TRUE) - -data$ehi5_pers_MEAN <- rowMeans(data[, c("ehi5_pers_extravert", "ehi5_pers_critical", - "ehi5_pers_dependable", "ehi5_pers_anxious", - "ehi5_pers_complex")], na.rm = TRUE) - -data$ehi5_val_MEAN <- rowMeans(data[, c("ehi5_val_obey", "ehi5_val_trad", - "ehi5_val_opinion", "ehi5_val_performance", - "ehi5_val_justice")], na.rm = TRUE) - -# === 10-YEAR MEANS === -data$ehi10_pref_MEAN <- rowMeans(data[, c("ehi10_pref_read", "ehi10_pref_music", - "ehi10_pref_TV", "ehi10_pref_nap", - "ehi10_pref_travel")], na.rm = TRUE) - -data$ehi10_pers_MEAN <- rowMeans(data[, c("ehi10_pers_extravert", "ehi10_pers_critical", - "ehi10_pers_dependable", "ehi10_pers_anxious", - "ehi10_pers_complex")], na.rm = TRUE) - -data$ehi10_val_MEAN <- rowMeans(data[, c("ehi10_val_obey", "ehi10_val_trad", - "ehi10_val_opinion", "ehi10_val_performance", - "ehi10_val_justice")], na.rm = TRUE) - -# === 5-10 YEAR CHANGE MEANS 
c("ehi10_pref_read", "ehi10_pref_music", "ehi10_pref_TV", "ehi10_pref_nap", "ehi10_pref_travel"), - target = "ehi10_pref_MEAN", name = "10-Year Preferences"), - list(vars = c("ehi10_pers_extravert", "ehi10_pers_critical", "ehi10_pers_dependable", "ehi10_pers_anxious", "ehi10_pers_complex"), - target = "ehi10_pers_MEAN", name = "10-Year Personality"), - list(vars = c("ehi10_val_obey", "ehi10_val_trad", "ehi10_val_opinion", "ehi10_val_performance", "ehi10_val_justice"), - target = "ehi10_val_MEAN", name = "10-Year Values"), - - # 5-10 year change means - list(vars = c("ehi5.10_pref_read", "ehi5.10_pref_music", "ehi5.10_pref_TV", "ehi5.10_pref_nap", "ehi5.10_pref_travel"), - target = "ehi5.10_pref_MEAN", name = "5-10 Year Change Preferences"), - list(vars = c("ehi5.10_pers_extravert", "ehi5.10_pers_critical", "ehi5.10_pers_dependable", "ehi5.10_pers_anxious", "ehi5.10_pers_complex"), - target = "ehi5.10_pers_MEAN", name = "5-10 Year Change Personality"), - list(vars = c("ehi5.10_val_obey", "ehi5.10_val_trad", "ehi5.10_val_opinion", "ehi5.10_val_performance", "ehi5.10_val_justice"), - target = "ehi5.10_val_MEAN", name = "5-10 Year Change Values") -) - -all_checks_passed <- TRUE - -for (check in qa_checks) { - calc_mean <- rowMeans(data[, check$vars], na.rm = TRUE) - actual_mean <- data[[check$target]] - discrepancies <- which(abs(calc_mean - actual_mean) > 1e-10) - - if (length(discrepancies) > 0) { - cat(sprintf("FAIL: %s mean (n_vars = %d)\n", check$name, length(check$vars))) - cat(sprintf(" Found %d discrepancies in rows: %s\n", - length(discrepancies), - paste(head(discrepancies, 10), collapse = ", "))) - all_checks_passed <- FALSE - } else { - cat(sprintf("PASS: %s mean (n_vars = %d, n_rows = %d)\n", - check$name, length(check$vars), nrow(data))) - } -} - -cat("\n") -if (all_checks_passed) { - cat("*** ALL QA CHECKS PASSED ***\n") -} else { - cat("*** SOME QA CHECKS FAILED - REVIEW ABOVE ***\n") -} - -# Save updated dataset -write.csv(data, "eohi2.csv", row.names = FALSE) -cat("\nDataset saved to eohi2.csv\n") \ No newline at end of file diff --git a/.history/eohi2/datap 14 - all ehi global means_20251008171057.r b/.history/eohi2/datap 14 - all ehi global means_20251008171057.r deleted file mode 100644 index 4dae188..0000000 --- a/.history/eohi2/datap 14 - all ehi global means_20251008171057.r +++ /dev/null @@ -1,142 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load data -data <- read.csv("eohi2.csv") - -# Calculate global mean scores for EHI variables across time intervals - -# === DGEN 5-YEAR GLOBAL MEAN === -data$ehiDGEN_5_mean <- rowMeans(data[, c("ehiDGEN_5_Pref", "ehiDGEN_5_Pers", - "ehiDGEN_5_Val")], na.rm = TRUE) - -# === DGEN 10-YEAR GLOBAL MEAN === -data$ehiDGEN_10_mean <- rowMeans(data[, c("ehiDGEN_10_Pref", "ehiDGEN_10_Pers", - "ehiDGEN_10_Val")], na.rm = TRUE) - -# === 5-YEAR GLOBAL MEAN === -data$ehi5_global_mean <- rowMeans(data[, c("ehi5_pref_MEAN", "ehi5_pers_MEAN", - "ehi5_val_MEAN")], na.rm = TRUE) - -# === 10-YEAR GLOBAL MEAN === -data$ehi10_global_mean <- rowMeans(data[, c("ehi10_pref_MEAN", "ehi10_pers_MEAN", - "ehi10_val_MEAN")], na.rm = TRUE) - -# === 5-10 YEAR CHANGE GLOBAL MEAN === -data$ehi5.10_global_mean <- rowMeans(data[, c("ehi5.10_pref_MEAN", "ehi5.10_pers_MEAN", - "ehi5.10_val_MEAN")], na.rm = TRUE) - -# QA: Verify mean calculations -cat("\n=== QUALITY ASSURANCE CHECK ===\n") -cat("Verifying EHI domain-specific mean calculations\n\n") - -cat("--- FIRST 5 ROWS: 5-YEAR PREFERENCES MEAN ---\n") -for (i in 1:5) { - vals 
<- c(data$ehi5_pref_read[i], data$ehi5_pref_music[i], - data$ehi5_pref_TV[i], data$ehi5_pref_nap[i], - data$ehi5_pref_travel[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehi5_pref_MEAN[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%g, %g, %g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], vals[4], vals[5], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -cat("\n--- FIRST 5 ROWS: 5-YEAR PERSONALITY MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehi5_pers_extravert[i], data$ehi5_pers_critical[i], - data$ehi5_pers_dependable[i], data$ehi5_pers_anxious[i], - data$ehi5_pers_complex[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehi5_pers_MEAN[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%g, %g, %g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], vals[4], vals[5], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -cat("\n--- FIRST 5 ROWS: 10-YEAR PREFERENCES MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehi10_pref_read[i], data$ehi10_pref_music[i], - data$ehi10_pref_TV[i], data$ehi10_pref_nap[i], - data$ehi10_pref_travel[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehi10_pref_MEAN[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%g, %g, %g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], vals[4], vals[5], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -cat("\n--- FIRST 5 ROWS: 5-10 YEAR CHANGE PREFERENCES MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehi5.10_pref_read[i], data$ehi5.10_pref_music[i], - data$ehi5.10_pref_TV[i], data$ehi5.10_pref_nap[i], - data$ehi5.10_pref_travel[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehi5.10_pref_MEAN[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%g, %g, %g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], vals[4], vals[5], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -# Overall QA check for all rows -cat("\n--- OVERALL QA CHECK (ALL ROWS) ---\n") - -qa_checks <- list( - # 5-year means - list(vars = c("ehi5_pref_read", "ehi5_pref_music", "ehi5_pref_TV", "ehi5_pref_nap", "ehi5_pref_travel"), - target = "ehi5_pref_MEAN", name = "5-Year Preferences"), - list(vars = c("ehi5_pers_extravert", "ehi5_pers_critical", "ehi5_pers_dependable", "ehi5_pers_anxious", "ehi5_pers_complex"), - target = "ehi5_pers_MEAN", name = "5-Year Personality"), - list(vars = c("ehi5_val_obey", "ehi5_val_trad", "ehi5_val_opinion", "ehi5_val_performance", "ehi5_val_justice"), - target = "ehi5_val_MEAN", name = "5-Year Values"), - - # 10-year means - list(vars = c("ehi10_pref_read", "ehi10_pref_music", "ehi10_pref_TV", "ehi10_pref_nap", "ehi10_pref_travel"), - target = "ehi10_pref_MEAN", name = "10-Year Preferences"), - list(vars = c("ehi10_pers_extravert", "ehi10_pers_critical", "ehi10_pers_dependable", "ehi10_pers_anxious", "ehi10_pers_complex"), - target = "ehi10_pers_MEAN", name = "10-Year Personality"), - list(vars = c("ehi10_val_obey", "ehi10_val_trad", "ehi10_val_opinion", "ehi10_val_performance", "ehi10_val_justice"), - target = "ehi10_val_MEAN", name = "10-Year Values"), - - # 5-10 year change means - list(vars = c("ehi5.10_pref_read", "ehi5.10_pref_music", "ehi5.10_pref_TV", "ehi5.10_pref_nap", "ehi5.10_pref_travel"), - target = "ehi5.10_pref_MEAN", name = "5-10 Year Change Preferences"), - list(vars = 
c("ehi5.10_pers_extravert", "ehi5.10_pers_critical", "ehi5.10_pers_dependable", "ehi5.10_pers_anxious", "ehi5.10_pers_complex"), - target = "ehi5.10_pers_MEAN", name = "5-10 Year Change Personality"), - list(vars = c("ehi5.10_val_obey", "ehi5.10_val_trad", "ehi5.10_val_opinion", "ehi5.10_val_performance", "ehi5.10_val_justice"), - target = "ehi5.10_val_MEAN", name = "5-10 Year Change Values") -) - -all_checks_passed <- TRUE - -for (check in qa_checks) { - calc_mean <- rowMeans(data[, check$vars], na.rm = TRUE) - actual_mean <- data[[check$target]] - discrepancies <- which(abs(calc_mean - actual_mean) > 1e-10) - - if (length(discrepancies) > 0) { - cat(sprintf("FAIL: %s mean (n_vars = %d)\n", check$name, length(check$vars))) - cat(sprintf(" Found %d discrepancies in rows: %s\n", - length(discrepancies), - paste(head(discrepancies, 10), collapse = ", "))) - all_checks_passed <- FALSE - } else { - cat(sprintf("PASS: %s mean (n_vars = %d, n_rows = %d)\n", - check$name, length(check$vars), nrow(data))) - } -} - -cat("\n") -if (all_checks_passed) { - cat("*** ALL QA CHECKS PASSED ***\n") -} else { - cat("*** SOME QA CHECKS FAILED - REVIEW ABOVE ***\n") -} - -# Save updated dataset -write.csv(data, "eohi2.csv", row.names = FALSE) -cat("\nDataset saved to eohi2.csv\n") \ No newline at end of file diff --git a/.history/eohi2/datap 14 - all ehi global means_20251008171120.r b/.history/eohi2/datap 14 - all ehi global means_20251008171120.r deleted file mode 100644 index ee10ed8..0000000 --- a/.history/eohi2/datap 14 - all ehi global means_20251008171120.r +++ /dev/null @@ -1,150 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load data -data <- read.csv("eohi2.csv") - -# Calculate global mean scores for EHI variables across time intervals - -# === DGEN 5-YEAR GLOBAL MEAN === -data$ehiDGEN_5_mean <- rowMeans(data[, c("ehiDGEN_5_Pref", "ehiDGEN_5_Pers", - "ehiDGEN_5_Val")], na.rm = TRUE) - -# === DGEN 10-YEAR GLOBAL MEAN === -data$ehiDGEN_10_mean <- rowMeans(data[, c("ehiDGEN_10_Pref", "ehiDGEN_10_Pers", - "ehiDGEN_10_Val")], na.rm = TRUE) - -# === 5-YEAR GLOBAL MEAN === -data$ehi5_global_mean <- rowMeans(data[, c("ehi5_pref_MEAN", "ehi5_pers_MEAN", - "ehi5_val_MEAN")], na.rm = TRUE) - -# === 10-YEAR GLOBAL MEAN === -data$ehi10_global_mean <- rowMeans(data[, c("ehi10_pref_MEAN", "ehi10_pers_MEAN", - "ehi10_val_MEAN")], na.rm = TRUE) - -# === 5-10 YEAR CHANGE GLOBAL MEAN === -data$ehi5.10_global_mean <- rowMeans(data[, c("ehi5.10_pref_MEAN", "ehi5.10_pers_MEAN", - "ehi5.10_val_MEAN")], na.rm = TRUE) - -# QA: Verify mean calculations -cat("\n=== QUALITY ASSURANCE CHECK ===\n") -cat("Verifying EHI global mean calculations\n\n") - -cat("--- FIRST 5 ROWS: DGEN 5-YEAR GLOBAL MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehiDGEN_5_Pref[i], data$ehiDGEN_5_Pers[i], - data$ehiDGEN_5_Val[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehiDGEN_5_mean[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -cat("\n--- FIRST 5 ROWS: DGEN 10-YEAR GLOBAL MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehiDGEN_10_Pref[i], data$ehiDGEN_10_Pers[i], - data$ehiDGEN_10_Val[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehiDGEN_10_mean[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], 
vals[3], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -cat("\n--- FIRST 5 ROWS: 5-YEAR GLOBAL MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehi5_pref_MEAN[i], data$ehi5_pers_MEAN[i], - data$ehi5_val_MEAN[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehi5_global_mean[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%.5f, %.5f, %.5f] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -cat("\n--- FIRST 5 ROWS: 10-YEAR GLOBAL MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehi10_pref_MEAN[i], data$ehi10_pers_MEAN[i], - data$ehi10_val_MEAN[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehi10_global_mean[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%.5f, %.5f, %.5f] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -cat("\n--- FIRST 5 ROWS: 5-10 YEAR CHANGE GLOBAL MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehi5.10_pref_MEAN[i], data$ehi5.10_pers_MEAN[i], - data$ehi5.10_val_MEAN[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehi5.10_global_mean[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%.5f, %.5f, %.5f] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -# Overall QA check for all rows -cat("\n--- OVERALL QA CHECK (ALL ROWS) ---\n") - -qa_checks <- list( - # 5-year means - list(vars = c("ehi5_pref_read", "ehi5_pref_music", "ehi5_pref_TV", "ehi5_pref_nap", "ehi5_pref_travel"), - target = "ehi5_pref_MEAN", name = "5-Year Preferences"), - list(vars = c("ehi5_pers_extravert", "ehi5_pers_critical", "ehi5_pers_dependable", "ehi5_pers_anxious", "ehi5_pers_complex"), - target = "ehi5_pers_MEAN", name = "5-Year Personality"), - list(vars = c("ehi5_val_obey", "ehi5_val_trad", "ehi5_val_opinion", "ehi5_val_performance", "ehi5_val_justice"), - target = "ehi5_val_MEAN", name = "5-Year Values"), - - # 10-year means - list(vars = c("ehi10_pref_read", "ehi10_pref_music", "ehi10_pref_TV", "ehi10_pref_nap", "ehi10_pref_travel"), - target = "ehi10_pref_MEAN", name = "10-Year Preferences"), - list(vars = c("ehi10_pers_extravert", "ehi10_pers_critical", "ehi10_pers_dependable", "ehi10_pers_anxious", "ehi10_pers_complex"), - target = "ehi10_pers_MEAN", name = "10-Year Personality"), - list(vars = c("ehi10_val_obey", "ehi10_val_trad", "ehi10_val_opinion", "ehi10_val_performance", "ehi10_val_justice"), - target = "ehi10_val_MEAN", name = "10-Year Values"), - - # 5-10 year change means - list(vars = c("ehi5.10_pref_read", "ehi5.10_pref_music", "ehi5.10_pref_TV", "ehi5.10_pref_nap", "ehi5.10_pref_travel"), - target = "ehi5.10_pref_MEAN", name = "5-10 Year Change Preferences"), - list(vars = c("ehi5.10_pers_extravert", "ehi5.10_pers_critical", "ehi5.10_pers_dependable", "ehi5.10_pers_anxious", "ehi5.10_pers_complex"), - target = "ehi5.10_pers_MEAN", name = "5-10 Year Change Personality"), - list(vars = c("ehi5.10_val_obey", "ehi5.10_val_trad", "ehi5.10_val_opinion", "ehi5.10_val_performance", "ehi5.10_val_justice"), - target = "ehi5.10_val_MEAN", name = "5-10 Year Change Values") -) - -all_checks_passed <- TRUE - -for (check in qa_checks) { - calc_mean <- rowMeans(data[, check$vars], na.rm = TRUE) - actual_mean <- data[[check$target]] - discrepancies <- which(abs(calc_mean - actual_mean) > 1e-10) - - if (length(discrepancies) 
> 0) { - cat(sprintf("FAIL: %s mean (n_vars = %d)\n", check$name, length(check$vars))) - cat(sprintf(" Found %d discrepancies in rows: %s\n", - length(discrepancies), - paste(head(discrepancies, 10), collapse = ", "))) - all_checks_passed <- FALSE - } else { - cat(sprintf("PASS: %s mean (n_vars = %d, n_rows = %d)\n", - check$name, length(check$vars), nrow(data))) - } -} - -cat("\n") -if (all_checks_passed) { - cat("*** ALL QA CHECKS PASSED ***\n") -} else { - cat("*** SOME QA CHECKS FAILED - REVIEW ABOVE ***\n") -} - -# Save updated dataset -write.csv(data, "eohi2.csv", row.names = FALSE) -cat("\nDataset saved to eohi2.csv\n") \ No newline at end of file diff --git a/.history/eohi2/datap 14 - all ehi global means_20251008171136.r b/.history/eohi2/datap 14 - all ehi global means_20251008171136.r deleted file mode 100644 index d1fa875..0000000 --- a/.history/eohi2/datap 14 - all ehi global means_20251008171136.r +++ /dev/null @@ -1,140 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load data -data <- read.csv("eohi2.csv") - -# Calculate global mean scores for EHI variables across time intervals - -# === DGEN 5-YEAR GLOBAL MEAN === -data$ehiDGEN_5_mean <- rowMeans(data[, c("ehiDGEN_5_Pref", "ehiDGEN_5_Pers", - "ehiDGEN_5_Val")], na.rm = TRUE) - -# === DGEN 10-YEAR GLOBAL MEAN === -data$ehiDGEN_10_mean <- rowMeans(data[, c("ehiDGEN_10_Pref", "ehiDGEN_10_Pers", - "ehiDGEN_10_Val")], na.rm = TRUE) - -# === 5-YEAR GLOBAL MEAN === -data$ehi5_global_mean <- rowMeans(data[, c("ehi5_pref_MEAN", "ehi5_pers_MEAN", - "ehi5_val_MEAN")], na.rm = TRUE) - -# === 10-YEAR GLOBAL MEAN === -data$ehi10_global_mean <- rowMeans(data[, c("ehi10_pref_MEAN", "ehi10_pers_MEAN", - "ehi10_val_MEAN")], na.rm = TRUE) - -# === 5-10 YEAR CHANGE GLOBAL MEAN === -data$ehi5.10_global_mean <- rowMeans(data[, c("ehi5.10_pref_MEAN", "ehi5.10_pers_MEAN", - "ehi5.10_val_MEAN")], na.rm = TRUE) - -# QA: Verify mean calculations -cat("\n=== QUALITY ASSURANCE CHECK ===\n") -cat("Verifying EHI global mean calculations\n\n") - -cat("--- FIRST 5 ROWS: DGEN 5-YEAR GLOBAL MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehiDGEN_5_Pref[i], data$ehiDGEN_5_Pers[i], - data$ehiDGEN_5_Val[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehiDGEN_5_mean[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -cat("\n--- FIRST 5 ROWS: DGEN 10-YEAR GLOBAL MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehiDGEN_10_Pref[i], data$ehiDGEN_10_Pers[i], - data$ehiDGEN_10_Val[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehiDGEN_10_mean[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -cat("\n--- FIRST 5 ROWS: 5-YEAR GLOBAL MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehi5_pref_MEAN[i], data$ehi5_pers_MEAN[i], - data$ehi5_val_MEAN[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehi5_global_mean[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%.5f, %.5f, %.5f] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -cat("\n--- FIRST 5 ROWS: 10-YEAR GLOBAL MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehi10_pref_MEAN[i], 
data$ehi10_pers_MEAN[i], - data$ehi10_val_MEAN[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehi10_global_mean[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%.5f, %.5f, %.5f] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -cat("\n--- FIRST 5 ROWS: 5-10 YEAR CHANGE GLOBAL MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehi5.10_pref_MEAN[i], data$ehi5.10_pers_MEAN[i], - data$ehi5.10_val_MEAN[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehi5.10_global_mean[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%.5f, %.5f, %.5f] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -# Overall QA check for all rows -cat("\n--- OVERALL QA CHECK (ALL ROWS) ---\n") - -qa_checks <- list( - # DGEN global means - list(vars = c("ehiDGEN_5_Pref", "ehiDGEN_5_Pers", "ehiDGEN_5_Val"), - target = "ehiDGEN_5_mean", name = "DGEN 5-Year Global"), - list(vars = c("ehiDGEN_10_Pref", "ehiDGEN_10_Pers", "ehiDGEN_10_Val"), - target = "ehiDGEN_10_mean", name = "DGEN 10-Year Global"), - - # Domain-specific global means - list(vars = c("ehi5_pref_MEAN", "ehi5_pers_MEAN", "ehi5_val_MEAN"), - target = "ehi5_global_mean", name = "5-Year Global"), - list(vars = c("ehi10_pref_MEAN", "ehi10_pers_MEAN", "ehi10_val_MEAN"), - target = "ehi10_global_mean", name = "10-Year Global"), - list(vars = c("ehi5.10_pref_MEAN", "ehi5.10_pers_MEAN", "ehi5.10_val_MEAN"), - target = "ehi5.10_global_mean", name = "5-10 Year Change Global") -) - -all_checks_passed <- TRUE - -for (check in qa_checks) { - calc_mean <- rowMeans(data[, check$vars], na.rm = TRUE) - actual_mean <- data[[check$target]] - discrepancies <- which(abs(calc_mean - actual_mean) > 1e-10) - - if (length(discrepancies) > 0) { - cat(sprintf("FAIL: %s mean (n_vars = %d)\n", check$name, length(check$vars))) - cat(sprintf(" Found %d discrepancies in rows: %s\n", - length(discrepancies), - paste(head(discrepancies, 10), collapse = ", "))) - all_checks_passed <- FALSE - } else { - cat(sprintf("PASS: %s mean (n_vars = %d, n_rows = %d)\n", - check$name, length(check$vars), nrow(data))) - } -} - -cat("\n") -if (all_checks_passed) { - cat("*** ALL QA CHECKS PASSED ***\n") -} else { - cat("*** SOME QA CHECKS FAILED - REVIEW ABOVE ***\n") -} - -# Save updated dataset -write.csv(data, "eohi2.csv", row.names = FALSE) -cat("\nDataset saved to eohi2.csv\n") \ No newline at end of file diff --git a/.history/eohi2/datap 14 - all ehi global means_20251008171157.r b/.history/eohi2/datap 14 - all ehi global means_20251008171157.r deleted file mode 100644 index d1fa875..0000000
Global") -) - -all_checks_passed <- TRUE - -for (check in qa_checks) { - calc_mean <- rowMeans(data[, check$vars], na.rm = TRUE) - actual_mean <- data[[check$target]] - discrepancies <- which(abs(calc_mean - actual_mean) > 1e-10) - - if (length(discrepancies) > 0) { - cat(sprintf("FAIL: %s mean (n_vars = %d)\n", check$name, length(check$vars))) - cat(sprintf(" Found %d discrepancies in rows: %s\n", - length(discrepancies), - paste(head(discrepancies, 10), collapse = ", "))) - all_checks_passed <- FALSE - } else { - cat(sprintf("PASS: %s mean (n_vars = %d, n_rows = %d)\n", - check$name, length(check$vars), nrow(data))) - } -} - -cat("\n") -if (all_checks_passed) { - cat("*** ALL QA CHECKS PASSED ***\n") -} else { - cat("*** SOME QA CHECKS FAILED - REVIEW ABOVE ***\n") -} - -# Save updated dataset -write.csv(data, "eohi2.csv", row.names = FALSE) -cat("\nDataset saved to eohi2.csv\n") \ No newline at end of file diff --git a/.history/eohi2/datap 14 - all ehi global means_20251008171250.r b/.history/eohi2/datap 14 - all ehi global means_20251008171250.r deleted file mode 100644 index d1fa875..0000000 --- a/.history/eohi2/datap 14 - all ehi global means_20251008171250.r +++ /dev/null @@ -1,140 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Load data -data <- read.csv("eohi2.csv") - -# Calculate global mean scores for EHI variables across time intervals - -# === DGEN 5-YEAR GLOBAL MEAN === -data$ehiDGEN_5_mean <- rowMeans(data[, c("ehiDGEN_5_Pref", "ehiDGEN_5_Pers", - "ehiDGEN_5_Val")], na.rm = TRUE) - -# === DGEN 10-YEAR GLOBAL MEAN === -data$ehiDGEN_10_mean <- rowMeans(data[, c("ehiDGEN_10_Pref", "ehiDGEN_10_Pers", - "ehiDGEN_10_Val")], na.rm = TRUE) - -# === 5-YEAR GLOBAL MEAN === -data$ehi5_global_mean <- rowMeans(data[, c("ehi5_pref_MEAN", "ehi5_pers_MEAN", - "ehi5_val_MEAN")], na.rm = TRUE) - -# === 10-YEAR GLOBAL MEAN === -data$ehi10_global_mean <- rowMeans(data[, c("ehi10_pref_MEAN", "ehi10_pers_MEAN", - "ehi10_val_MEAN")], na.rm = TRUE) - -# === 5-10 YEAR CHANGE GLOBAL MEAN === -data$ehi5.10_global_mean <- rowMeans(data[, c("ehi5.10_pref_MEAN", "ehi5.10_pers_MEAN", - "ehi5.10_val_MEAN")], na.rm = TRUE) - -# QA: Verify mean calculations -cat("\n=== QUALITY ASSURANCE CHECK ===\n") -cat("Verifying EHI global mean calculations\n\n") - -cat("--- FIRST 5 ROWS: DGEN 5-YEAR GLOBAL MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehiDGEN_5_Pref[i], data$ehiDGEN_5_Pers[i], - data$ehiDGEN_5_Val[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehiDGEN_5_mean[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -cat("\n--- FIRST 5 ROWS: DGEN 10-YEAR GLOBAL MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehiDGEN_10_Pref[i], data$ehiDGEN_10_Pers[i], - data$ehiDGEN_10_Val[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehiDGEN_10_mean[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%g, %g, %g] → Calculated: %.5f | Actual: %.5f %s\n", - i, vals[1], vals[2], vals[3], - calc_mean, actual_mean, ifelse(match, "✓", "✗"))) -} - -cat("\n--- FIRST 5 ROWS: 5-YEAR GLOBAL MEAN ---\n") -for (i in 1:5) { - vals <- c(data$ehi5_pref_MEAN[i], data$ehi5_pers_MEAN[i], - data$ehi5_val_MEAN[i]) - calc_mean <- mean(vals, na.rm = TRUE) - actual_mean <- data$ehi5_global_mean[i] - match <- abs(calc_mean - actual_mean) < 1e-10 - cat(sprintf("Row %d: [%.5f, %.5f, %.5f] 
diff --git a/.history/eohi2/datap 15 - education recoded ordinal 3_20251027135156.r b/.history/eohi2/datap 15 - education recoded ordinal 3_20251027135156.r deleted file mode 100644 index e69de29..0000000 diff --git a/.history/eohi2/datap 15 - education recoded ordinal 3_20251027135157.r b/.history/eohi2/datap 15 - education recoded ordinal 3_20251027135157.r deleted file mode 100644 index 7d647e3..0000000 --- a/.history/eohi2/datap 15 - education recoded ordinal 3_20251027135157.r +++ /dev/null @@ -1,38 +0,0 @@ -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -data <- read.csv("ehi1.csv") - -# Check the levels of the demo_edu variable -print(levels(factor(data$demo_edu))) - -# Also show the unique
values and their frequencies -print("\nUnique values and frequencies:") -print(table(data$demo_edu, useNA = "ifany")) - -# Recode demo_edu into 3 ordinal levels -data$edu3 <- NA - -# HS_TS: High School and Trade School -data$edu3[data$demo_edu %in% c("High School (or equivalent)", "Trade School (non-military)")] <- "HS_TS" - -# C_Ug: College and University - Undergraduate -data$edu3[data$demo_edu %in% c("College Diploma/Certificate", "University - Undergraduate")] <- "C_Ug" - -# grad_prof: University - Graduate, University - PhD, and Professional Degree -data$edu3[data$demo_edu %in% c("University - Graduate (Masters)", "University - PhD", "Professional Degree (ex. JD/MD)")] <- "grad_prof" - -# Convert to ordered factor -data$edu3 <- factor(data$edu3, - levels = c("HS_TS", "C_Ug", "grad_prof"), - ordered = TRUE) - -# Check the recoded variable -print(table(data$edu3, useNA = "ifany")) - -# Verify the recoding -print(table(data$demo_edu, data$edu3, useNA = "ifany")) - -# Save the updated dataset with the new edu3 variable -write.csv(data, "ehi1.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi2/datap 15 - education recoded ordinal 3_20251027141418.r b/.history/eohi2/datap 15 - education recoded ordinal 3_20251027141418.r deleted file mode 100644 index 3b820ba..0000000 --- a/.history/eohi2/datap 15 - education recoded ordinal 3_20251027141418.r +++ /dev/null @@ -1,38 +0,0 @@ -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -data <- read.csv("eohi2.csv") - -# Check the levels of the demo_edu variable -print(levels(factor(data$demo_edu))) - -# Also show the unique values and their frequencies -print("\nUnique values and frequencies:") -print(table(data$demo_edu, useNA = "ifany")) - -# Recode demo_edu into 3 ordinal levels -data$edu3 <- NA - -# HS_TS: High School and Trade School -data$edu3[data$demo_edu %in% c("High School (or equivalent)", "Trade School (non-military)")] <- "HS_TS" - -# C_Ug: College and University - Undergraduate -data$edu3[data$demo_edu %in% c("College Diploma/Certificate", "University - Undergraduate")] <- "C_Ug" - -# grad_prof: University - Graduate, University - PhD, and Professional Degree -data$edu3[data$demo_edu %in% c("University - Graduate (Masters)", "University - PhD", "Professional Degree (ex. 
JD/MD)")] <- "grad_prof" - -# Convert to ordered factor -data$edu3 <- factor(data$edu3, - levels = c("HS_TS", "C_Ug", "grad_prof"), - ordered = TRUE) - -# Check the recoded variable -print(table(data$edu3, useNA = "ifany")) - -# Verify the recoding -print(table(data$demo_edu, data$edu3, useNA = "ifany")) - -# Save the updated dataset with the new edu3 variable -# write.csv(data, "eohi2.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi2/datap 15 - education recoded ordinal 3_20251027143845.r b/.history/eohi2/datap 15 - education recoded ordinal 3_20251027143845.r deleted file mode 100644 index 33ae8e7..0000000 --- a/.history/eohi2/datap 15 - education recoded ordinal 3_20251027143845.r +++ /dev/null @@ -1,38 +0,0 @@ -options(scipen = 999) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -data <- read.csv("eohi2.csv") - -# Check the levels of the demo_edu variable -print(levels(factor(data$demo_edu))) - -# Also show the unique values and their frequencies -print("\nUnique values and frequencies:") -print(table(data$demo_edu, useNA = "ifany")) - -# Recode demo_edu into 3 ordinal levels -data$edu3 <- NA - -# HS_TS: High School and Trade School -data$edu3[data$demo_edu %in% c("High School (or equivalent)", "Trade School (non-military)")] <- "HS_TS" - -# C_Ug: College and University - Undergraduate -data$edu3[data$demo_edu %in% c("College Diploma/Certificate", "University - Undergraduate")] <- "C_Ug" - -# grad_prof: University - Graduate, University - PhD, and Professional Degree -data$edu3[data$demo_edu %in% c("University - Graduate (Masters)", "University - PhD", "Professional Degree (ex. JD/MD)")] <- "grad_prof" - -# Convert to ordered factor -data$edu3 <- factor(data$edu3, - levels = c("HS_TS", "C_Ug", "grad_prof"), - ordered = TRUE) - -# Check the recoded variable -print(table(data$edu3, useNA = "ifany")) - -# Verify the recoding -print(table(data$demo_edu, data$edu3, useNA = "ifany")) - -# Save the updated dataset with the new edu3 variable -write.csv(data, "eohi2.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi2/datap 16 - ehi vars standardized _20251029120227.r b/.history/eohi2/datap 16 - ehi vars standardized _20251029120227.r deleted file mode 100644 index e69de29..0000000 diff --git a/.history/eohi2/datap 16 - ehi vars standardized _20251029120228.r b/.history/eohi2/datap 16 - ehi vars standardized _20251029120228.r deleted file mode 100644 index 51562a9..0000000 --- a/.history/eohi2/datap 16 - ehi vars standardized _20251029120228.r +++ /dev/null @@ -1,7 +0,0 @@ -options(scipen = 999) - -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -df <- read.csv("eohi2.csv") \ No newline at end of file diff --git a/.history/eohi2/datap 16 - ehi vars standardized _20251029120234.r b/.history/eohi2/datap 16 - ehi vars standardized _20251029120234.r deleted file mode 100644 index 51562a9..0000000 --- a/.history/eohi2/datap 16 - ehi vars standardized _20251029120234.r +++ /dev/null @@ -1,7 +0,0 @@ -options(scipen = 999) - -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -df <- read.csv("eohi2.csv") \ No newline at end of file diff --git a/.history/eohi2/datap 16 - ehi vars standardized _20251029121728.r b/.history/eohi2/datap 16 - ehi vars standardized _20251029121728.r deleted file mode 100644 index b4ba92f..0000000 --- a/.history/eohi2/datap 16 - ehi vars standardized _20251029121728.r +++ /dev/null @@ -1,49 +0,0 @@ -options(scipen = 999) - -library(dplyr) - 
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -df <- read.csv("eohi2.csv") - -# Display means and standard deviations of non-standardized variables for manual checking -print(round(mean(df$ehiDGEN_5_mean, na.rm = TRUE), 5)) -print(round(sd(df$ehiDGEN_5_mean, na.rm = TRUE), 5)) -print(round(mean(df$ehiDGEN_10_mean, na.rm = TRUE), 5)) -print(round(sd(df$ehiDGEN_10_mean, na.rm = TRUE), 5)) -print(round(mean(df$ehi5_global_mean, na.rm = TRUE), 5)) -print(round(sd(df$ehi5_global_mean, na.rm = TRUE), 5)) -print(round(mean(df$ehi10_global_mean, na.rm = TRUE), 5)) -print(round(sd(df$ehi10_global_mean, na.rm = TRUE), 5)) - -# Calculate means and standard deviations for standardization -mean_DGEN_5 <- mean(df$ehiDGEN_5_mean, na.rm = TRUE) -sd_DGEN_5 <- sd(df$ehiDGEN_5_mean, na.rm = TRUE) - -mean_DGEN_10 <- mean(df$ehiDGEN_10_mean, na.rm = TRUE) -sd_DGEN_10 <- sd(df$ehiDGEN_10_mean, na.rm = TRUE) - -mean_DS_5 <- mean(df$ehi5_global_mean, na.rm = TRUE) -sd_DS_5 <- sd(df$ehi5_global_mean, na.rm = TRUE) - -mean_DS_10 <- mean(df$ehi10_global_mean, na.rm = TRUE) -sd_DS_10 <- sd(df$ehi10_global_mean, na.rm = TRUE) - -# Create standardized variables -df$stdDGEN_5 <- (df$ehiDGEN_5_mean - mean_DGEN_5) / sd_DGEN_5 -df$stdDGEN_10 <- (df$ehiDGEN_10_mean - mean_DGEN_10) / sd_DGEN_10 -df$stdDS_5 <- (df$ehi5_global_mean - mean_DS_5) / sd_DS_5 -df$stdDS_10 <- (df$ehi10_global_mean - mean_DS_10) / sd_DS_10 - -# Check that variables have been standardized -print(round(mean(df$stdDGEN_5, na.rm = TRUE), 5)) -print(round(sd(df$stdDGEN_5, na.rm = TRUE), 5)) -print(round(mean(df$stdDGEN_10, na.rm = TRUE), 5)) -print(round(sd(df$stdDGEN_10, na.rm = TRUE), 5)) -print(round(mean(df$stdDS_5, na.rm = TRUE), 5)) -print(round(sd(df$stdDS_5, na.rm = TRUE), 5)) -print(round(mean(df$stdDS_10, na.rm = TRUE), 5)) -print(round(sd(df$stdDS_10, na.rm = TRUE), 5)) - -# Calculate mean of standardized variables -df$stdEHI_mean <- rowMeans(df[, c("stdDGEN_5", "stdDGEN_10", "stdDS_5", "stdDS_10")], na.rm = TRUE) \ No newline at end of file diff --git a/.history/eohi2/datap 16 - ehi vars standardized _20251029122336.r b/.history/eohi2/datap 16 - ehi vars standardized _20251029122336.r deleted file mode 100644 index 2cb7082..0000000 --- a/.history/eohi2/datap 16 - ehi vars standardized _20251029122336.r +++ /dev/null @@ -1,99 +0,0 @@ -options(scipen = 999) - -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -df <- read.csv("eohi2.csv") - -# Display means and standard deviations of non-standardized variables for manual checking -print(round(mean(df$ehiDGEN_5_mean, na.rm = TRUE), 5)) -print(round(sd(df$ehiDGEN_5_mean, na.rm = TRUE), 5)) -print(round(mean(df$ehiDGEN_10_mean, na.rm = TRUE), 5)) -print(round(sd(df$ehiDGEN_10_mean, na.rm = TRUE), 5)) -print(round(mean(df$ehi5_global_mean, na.rm = TRUE), 5)) -print(round(sd(df$ehi5_global_mean, na.rm = TRUE), 5)) -print(round(mean(df$ehi10_global_mean, na.rm = TRUE), 5)) -print(round(sd(df$ehi10_global_mean, na.rm = TRUE), 5)) - -# Calculate means and standard deviations for standardization -mean_DGEN_5 <- mean(df$ehiDGEN_5_mean, na.rm = TRUE) -sd_DGEN_5 <- sd(df$ehiDGEN_5_mean, na.rm = TRUE) - -mean_DGEN_10 <- mean(df$ehiDGEN_10_mean, na.rm = TRUE) -sd_DGEN_10 <- sd(df$ehiDGEN_10_mean, na.rm = TRUE) - -mean_DS_5 <- mean(df$ehi5_global_mean, na.rm = TRUE) -sd_DS_5 <- sd(df$ehi5_global_mean, na.rm = TRUE) - -mean_DS_10 <- mean(df$ehi10_global_mean, na.rm = TRUE) -sd_DS_10 <- sd(df$ehi10_global_mean, na.rm = TRUE) - -# Create standardized variables -df$stdDGEN_5 <- 
(df$ehiDGEN_5_mean - mean_DGEN_5) / sd_DGEN_5 -df$stdDGEN_10 <- (df$ehiDGEN_10_mean - mean_DGEN_10) / sd_DGEN_10 -df$stdDS_5 <- (df$ehi5_global_mean - mean_DS_5) / sd_DS_5 -df$stdDS_10 <- (df$ehi10_global_mean - mean_DS_10) / sd_DS_10 - -# Check that variables have been standardized -print(round(mean(df$stdDGEN_5, na.rm = TRUE), 5)) -print(round(sd(df$stdDGEN_5, na.rm = TRUE), 5)) -print(round(mean(df$stdDGEN_10, na.rm = TRUE), 5)) -print(round(sd(df$stdDGEN_10, na.rm = TRUE), 5)) -print(round(mean(df$stdDS_5, na.rm = TRUE), 5)) -print(round(sd(df$stdDS_5, na.rm = TRUE), 5)) -print(round(mean(df$stdDS_10, na.rm = TRUE), 5)) -print(round(sd(df$stdDS_10, na.rm = TRUE), 5)) - -# Calculate mean of standardized variables -df$stdEHI_mean <- rowMeans(df[, c("stdDGEN_5", "stdDGEN_10", "stdDS_5", "stdDS_10")], na.rm = TRUE) - -#### check random 10 rows - -# Check 10 random rows to verify calculations -set.seed(123) # For reproducible random selection -random_rows <- sample(nrow(df), 10) - -cat("Checking 10 random rows:\n") -cat("Row | ehiDGEN_5_mean | stdDGEN_5 | Calculation | ehiDGEN_10_mean | stdDGEN_10 | Calculation\n") -cat("----|----------------|-----------|-------------|-----------------|------------|------------\n") - -for(i in random_rows) { - orig_5 <- df$ehiDGEN_5_mean[i] - std_5 <- df$stdDGEN_5[i] - calc_5 <- (orig_5 - mean_DGEN_5) / sd_DGEN_5 - - orig_10 <- df$ehiDGEN_10_mean[i] - std_10 <- df$stdDGEN_10[i] - calc_10 <- (orig_10 - mean_DGEN_10) / sd_DGEN_10 - - cat(sprintf("%3d | %13.5f | %9.5f | %11.5f | %15.5f | %10.5f | %11.5f\n", - i, orig_5, std_5, calc_5, orig_10, std_10, calc_10)) -} - -cat("\nRow | ehi5_global_mean | stdDS_5 | Calculation | ehi10_global_mean | stdDS_10 | Calculation\n") -cat("----|------------------|---------|-------------|-------------------|----------|------------\n") - -for(i in random_rows) { - orig_5 <- df$ehi5_global_mean[i] - std_5 <- df$stdDS_5[i] - calc_5 <- (orig_5 - mean_DS_5) / sd_DS_5 - - orig_10 <- df$ehi10_global_mean[i] - std_10 <- df$stdDS_10[i] - calc_10 <- (orig_10 - mean_DS_10) / sd_DS_10 - - cat(sprintf("%3d | %16.5f | %8.5f | %11.5f | %17.5f | %9.5f | %11.5f\n", - i, orig_5, std_5, calc_5, orig_10, std_10, calc_10)) -} - -# Show the final stdEHI_mean for these rows -cat("\nRow | stdEHI_mean | Manual calc\n") -cat("----|-------------|------------\n") -for(i in random_rows) { - manual_mean <- mean(c(df$stdDGEN_5[i], df$stdDGEN_10[i], df$stdDS_5[i], df$stdDS_10[i]), na.rm = TRUE) - cat(sprintf("%3d | %11.5f | %11.5f\n", i, df$stdEHI_mean[i], manual_mean)) -} - -# Write to CSV -write.csv(df, "eohi2.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi2/datap 16 - ehi vars standardized _20251029124145.r b/.history/eohi2/datap 16 - ehi vars standardized _20251029124145.r deleted file mode 100644 index 01e0592..0000000 --- a/.history/eohi2/datap 16 - ehi vars standardized _20251029124145.r +++ /dev/null @@ -1,100 +0,0 @@ -options(scipen = 999) - -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -df <- read.csv("eohi2.csv") - -# Display means and standard deviations of non-standardized variables for manual checking -print(round(mean(df$ehiDGEN_5_mean, na.rm = TRUE), 5)) -print(round(sd(df$ehiDGEN_5_mean, na.rm = TRUE), 5)) -print(round(mean(df$ehiDGEN_10_mean, na.rm = TRUE), 5)) -print(round(sd(df$ehiDGEN_10_mean, na.rm = TRUE), 5)) -print(round(mean(df$ehi5_global_mean, na.rm = TRUE), 5)) -print(round(sd(df$ehi5_global_mean, na.rm = TRUE), 5)) -print(round(mean(df$ehi10_global_mean, na.rm = TRUE), 5)) 
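# Note: the manual z-scoring in this script is equivalent to base R's
# scale(), which centers on the column mean and divides by the sample SD,
# ignoring NAs the same way. A minimal sketch for one variable (same
# column names as above; scale() returns a 1-column matrix, hence the
# as.numeric()):
# df$stdDGEN_5 <- as.numeric(scale(df$ehiDGEN_5_mean))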
-print(round(sd(df$ehi10_global_mean, na.rm = TRUE), 5)) - -# Calculate means and standard deviations for standardization -mean_DGEN_5 <- mean(df$ehiDGEN_5_mean, na.rm = TRUE) -sd_DGEN_5 <- sd(df$ehiDGEN_5_mean, na.rm = TRUE) - -mean_DGEN_10 <- mean(df$ehiDGEN_10_mean, na.rm = TRUE) -sd_DGEN_10 <- sd(df$ehiDGEN_10_mean, na.rm = TRUE) - -mean_DS_5 <- mean(df$ehi5_global_mean, na.rm = TRUE) -sd_DS_5 <- sd(df$ehi5_global_mean, na.rm = TRUE) - -mean_DS_10 <- mean(df$ehi10_global_mean, na.rm = TRUE) -sd_DS_10 <- sd(df$ehi10_global_mean, na.rm = TRUE) - -# Create standardized variables -df$stdDGEN_5 <- (df$ehiDGEN_5_mean - mean_DGEN_5) / sd_DGEN_5 -df$stdDGEN_10 <- (df$ehiDGEN_10_mean - mean_DGEN_10) / sd_DGEN_10 -df$stdDS_5 <- (df$ehi5_global_mean - mean_DS_5) / sd_DS_5 -df$stdDS_10 <- (df$ehi10_global_mean - mean_DS_10) / sd_DS_10 - -# Check that variables have been standardized -print(round(mean(df$stdDGEN_5, na.rm = TRUE), 5)) -print(round(sd(df$stdDGEN_5, na.rm = TRUE), 5)) -print(round(mean(df$stdDGEN_10, na.rm = TRUE), 5)) -print(round(sd(df$stdDGEN_10, na.rm = TRUE), 5)) -print(round(mean(df$stdDS_5, na.rm = TRUE), 5)) -print(round(sd(df$stdDS_5, na.rm = TRUE), 5)) -print(round(mean(df$stdDS_10, na.rm = TRUE), 5)) -print(round(sd(df$stdDS_10, na.rm = TRUE), 5)) - -# Calculate mean of standardized variables -df$stdEHI_mean <- rowMeans(df[, c("stdDGEN_5", "stdDGEN_10", "stdDS_5", "stdDS_10")], na.rm = TRUE) - -#### check random 10 rows - -# Check 10 random rows to verify calculations -set.seed(123) # For reproducible random selection -random_rows <- sample(nrow(df), 10) - -cat("Checking 10 random rows:\n") -cat("Row | ehiDGEN_5_mean | stdDGEN_5 | Calculation | ehiDGEN_10_mean | stdDGEN_10 | Calculation\n") -cat("----|----------------|-----------|-------------|-----------------|------------|------------\n") - -for(i in random_rows) { - orig_5 <- df$ehiDGEN_5_mean[i] - std_5 <- df$stdDGEN_5[i] - calc_5 <- (orig_5 - mean_DGEN_5) / sd_DGEN_5 - - orig_10 <- df$ehiDGEN_10_mean[i] - std_10 <- df$stdDGEN_10[i] - calc_10 <- (orig_10 - mean_DGEN_10) / sd_DGEN_10 - - cat(sprintf("%3d | %13.5f | %9.5f | %11.5f | %15.5f | %10.5f | %11.5f\n", - i, orig_5, std_5, calc_5, orig_10, std_10, calc_10)) -} - -cat("\nRow | ehi5_global_mean | stdDS_5 | Calculation | ehi10_global_mean | stdDS_10 | Calculation\n") -cat("----|------------------|---------|-------------|-------------------|----------|------------\n") - -for(i in random_rows) { - orig_5 <- df$ehi5_global_mean[i] - std_5 <- df$stdDS_5[i] - calc_5 <- (orig_5 - mean_DS_5) / sd_DS_5 - - orig_10 <- df$ehi10_global_mean[i] - std_10 <- df$stdDS_10[i] - calc_10 <- (orig_10 - mean_DS_10) / sd_DS_10 - - cat(sprintf("%3d | %16.5f | %8.5f | %11.5f | %17.5f | %9.5f | %11.5f\n", - i, orig_5, std_5, calc_5, orig_10, std_10, calc_10)) -} - -# Show the final stdEHI_mean for these rows -cat("\nRow | stdEHI_mean | Manual calc\n") -cat("----|-------------|------------\n") -for(i in random_rows) { - manual_mean <- mean(c(df$stdDGEN_5[i], df$stdDGEN_10[i], df$stdDS_5[i], df$stdDS_10[i]), na.rm = TRUE) - cat(sprintf("%3d | %11.5f | %11.5f\n", i, df$stdEHI_mean[i], manual_mean)) -} - -# Write to CSV -write.csv(df, "eohi2.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi2/e2 - correlation matrix_20251027143921.r b/.history/eohi2/e2 - correlation matrix_20251027143921.r deleted file mode 100644 index e69de29..0000000 diff --git a/.history/eohi2/e2 - correlation 
matrix_20251027143922.r b/.history/eohi2/e2 - correlation matrix_20251027143922.r deleted file mode 100644 index 4b1a8a7..0000000 --- a/.history/eohi2/e2 - correlation matrix_20251027143922.r +++ /dev/null @@ -1,105 +0,0 @@ -options(scipen = 999) - -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi1") - -df <- read.csv("ehi1.csv") - -data <- df %>% - select(eohiDGEN_mean, ehi_global_mean, demo_sex, demo_age_1, edu3, AOT_total, CRT_correct, CRT_int, bs_28, bs_easy, bs_hard, cal_selfActual, cal_global) %>% - filter(demo_sex != "Prefer not to say") - -print(colSums(is.na(data))) -print(sapply(data, class)) - -# Create dummy variable for sex (0 = Male, 1 = Female) -data$sex_dummy <- ifelse(data$demo_sex == "Female", 1, 0) - -# Verify the dummy coding -print(table(data$demo_sex, data$sex_dummy)) - -#descriptives - -# Descriptives for age -print(summary(data$demo_age_1)) -print(sd(data$demo_age_1, na.rm = TRUE)) - -# Center demo_age_1 (subtract the mean) -data$age_centered <- data$demo_age_1 - mean(data$demo_age_1, na.rm = TRUE) - -# Verify the centering -print(summary(data$age_centered)) - -# Descriptives for sex (frequency table) -print(table(data$demo_sex)) -print(prop.table(table(data$demo_sex))) - -# Descriptives for sex dummy variable -print(table(data$sex_dummy)) - -# Convert edu3 to numeric factor for correlations (1, 2, 3) -# First ensure edu3 is a factor, then convert to numeric -data$edu3 <- factor(data$edu3, levels = c("HS_TS", "C_Ug", "grad_prof"), ordered = TRUE) -data$edu_num <- as.numeric(data$edu3) - -# Check the numeric conversion -print(table(data$edu_num, useNA = "ifany")) - -# Verify the conversion -print(table(data$edu3, data$edu_num, useNA = "ifany")) - -####correlation matrix #### - -# Select numeric variables for correlation matrix -numeric_vars <- data %>% - select(eohiDGEN_mean, ehi_global_mean, sex_dummy, demo_age_1, edu_num, AOT_total, CRT_correct, CRT_int, bs_28, bs_easy, bs_hard, cal_selfActual, cal_global) - -# Create Spearman correlation matrix -cor_matrix <- cor(numeric_vars, use = "complete.obs", method = "spearman") - -# Print correlation matrix -print(round(cor_matrix, 3)) - -# Get significance tests for correlations using psych package -library(psych) - -# Create correlation matrix with significance tests -cor_test <- corr.test(numeric_vars, method = "spearman", adjust = "none") - -# Print correlation matrix -print(round(cor_test$r, 3)) - -# Print p-values -print(round(cor_test$p, 3)) - -# Print all correlations with r and p values (for reporting) -for(i in 1:nrow(cor_test$r)) { - for(j in 1:ncol(cor_test$r)) { - if(i != j) { # Skip diagonal - cat(colnames(numeric_vars)[i], "vs", colnames(numeric_vars)[j], - ": r =", round(cor_test$r[i, j], 3), - ", p =", round(cor_test$p[i, j], 3), "\n") - } - } -} - -# Also print significant correlations summary -sig_cors <- which(cor_test$p < 0.05 & cor_test$p != 0, arr.ind = TRUE) -if(nrow(sig_cors) > 0) { - for(i in 1:nrow(sig_cors)) { - row_idx <- sig_cors[i, 1] - col_idx <- sig_cors[i, 2] - if(row_idx != col_idx) { # Skip diagonal - cat(colnames(numeric_vars)[row_idx], "vs", colnames(numeric_vars)[col_idx], - ": r =", round(cor_test$r[row_idx, col_idx], 3), - ", p =", round(cor_test$p[row_idx, col_idx], 3), "\n") - } - } -} - -# Save correlation matrix and p-values to CSV files -write.csv(cor_test$r, "correlation_matrix.csv", row.names = TRUE) -write.csv(cor_test$p, "correlation_pvalues.csv", row.names = TRUE) -print("Correlation matrix saved to correlation_matrix.csv") -print("P-values saved to 
correlation_pvalues.csv") \ No newline at end of file diff --git a/.history/eohi2/e2 - correlation matrix_20251027144718.r b/.history/eohi2/e2 - correlation matrix_20251027144718.r deleted file mode 100644 index a9995fe..0000000 --- a/.history/eohi2/e2 - correlation matrix_20251027144718.r +++ /dev/null @@ -1,105 +0,0 @@ -options(scipen = 999) - -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -df <- read.csv("eohi2.csv") - -data <- df %>% - select(ehiDGEN_5_mean, ehiDGEN_10_mean, ehi5_global_mean, ehi10_global_mean, demo_sex, demo_age_1, edu3, aot_total, crt_correct, crt_int) %>% - filter(demo_sex != "Prefer not to say") - -print(colSums(is.na(data))) -print(sapply(data, class)) - -# Create dummy variable for sex (0 = Male, 1 = Female) -data$sex_dummy <- ifelse(data$demo_sex == "Female", 1, 0) - -# Verify the dummy coding -print(table(data$demo_sex, data$sex_dummy)) - -#descriptives - -# Descriptives for age -print(summary(data$demo_age_1)) -print(sd(data$demo_age_1, na.rm = TRUE)) - -# Center demo_age_1 (subtract the mean) -data$age_centered <- data$demo_age_1 - mean(data$demo_age_1, na.rm = TRUE) - -# Verify the centering -print(summary(data$age_centered)) - -# Descriptives for sex (frequency table) -print(table(data$demo_sex)) -print(prop.table(table(data$demo_sex))) - -# Descriptives for sex dummy variable -print(table(data$sex_dummy)) - -# Convert edu3 to numeric factor for correlations (1, 2, 3) -# First ensure edu3 is a factor, then convert to numeric -data$edu3 <- factor(data$edu3, levels = c("HS_TS", "C_Ug", "grad_prof"), ordered = TRUE) -data$edu_num <- as.numeric(data$edu3) - -# Check the numeric conversion -print(table(data$edu_num, useNA = "ifany")) - -# Verify the conversion -print(table(data$edu3, data$edu_num, useNA = "ifany")) - -####correlation matrix #### - -# Select numeric variables for correlation matrix -numeric_vars <- data %>% - select(eohiDGEN_mean, ehi_global_mean, sex_dummy, demo_age_1, edu_num, AOT_total, CRT_correct, CRT_int, bs_28, bs_easy, bs_hard, cal_selfActual, cal_global) - -# Create Spearman correlation matrix -cor_matrix <- cor(numeric_vars, use = "complete.obs", method = "spearman") - -# Print correlation matrix -print(round(cor_matrix, 3)) - -# Get significance tests for correlations using psych package -library(psych) - -# Create correlation matrix with significance tests -cor_test <- corr.test(numeric_vars, method = "spearman", adjust = "none") - -# Print correlation matrix -print(round(cor_test$r, 3)) - -# Print p-values -print(round(cor_test$p, 3)) - -# Print all correlations with r and p values (for reporting) -for(i in 1:nrow(cor_test$r)) { - for(j in 1:ncol(cor_test$r)) { - if(i != j) { # Skip diagonal - cat(colnames(numeric_vars)[i], "vs", colnames(numeric_vars)[j], - ": r =", round(cor_test$r[i, j], 3), - ", p =", round(cor_test$p[i, j], 3), "\n") - } - } -} - -# Also print significant correlations summary -sig_cors <- which(cor_test$p < 0.05 & cor_test$p != 0, arr.ind = TRUE) -if(nrow(sig_cors) > 0) { - for(i in 1:nrow(sig_cors)) { - row_idx <- sig_cors[i, 1] - col_idx <- sig_cors[i, 2] - if(row_idx != col_idx) { # Skip diagonal - cat(colnames(numeric_vars)[row_idx], "vs", colnames(numeric_vars)[col_idx], - ": r =", round(cor_test$r[row_idx, col_idx], 3), - ", p =", round(cor_test$p[row_idx, col_idx], 3), "\n") - } - } -} - -# Save correlation matrix and p-values to CSV files -write.csv(cor_test$r, "correlation_matrix.csv", row.names = TRUE) -write.csv(cor_test$p, "correlation_pvalues.csv", row.names = TRUE) 
-print("Correlation matrix saved to correlation_matrix.csv") -print("P-values saved to correlation_pvalues.csv") \ No newline at end of file diff --git a/.history/eohi2/e2 - correlation matrix_20251027145122.r b/.history/eohi2/e2 - correlation matrix_20251027145122.r deleted file mode 100644 index 755cd29..0000000 --- a/.history/eohi2/e2 - correlation matrix_20251027145122.r +++ /dev/null @@ -1,100 +0,0 @@ -options(scipen = 999) - -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -df <- read.csv("eohi2.csv") - -data <- df %>% - select(ehiDGEN_5_mean, ehiDGEN_10_mean, ehi5_global_mean, ehi10_global_mean, demo_sex, demo_age_1, edu3, aot_total, crt_correct, crt_int) %>% - filter(demo_sex != "Prefer not to say") - -print(colSums(is.na(data))) -print(sapply(data, class)) - -# Create dummy variable for sex (0 = Male, 1 = Female) -data$sex_dummy <- ifelse(data$demo_sex == "Female", 1, 0) - -# Verify the dummy coding -print(table(data$demo_sex, data$sex_dummy)) - -#descriptives - -# Descriptives for age -print(summary(data$demo_age_1)) -print(sd(data$demo_age_1, na.rm = TRUE)) - - -# Descriptives for sex (frequency table) -print(table(data$demo_sex)) -print(prop.table(table(data$demo_sex))) - -# Descriptives for sex dummy variable -print(table(data$sex_dummy)) - -# Convert edu3 to numeric factor for correlations (1, 2, 3) -# First ensure edu3 is a factor, then convert to numeric -data$edu3 <- factor(data$edu3, levels = c("HS_TS", "C_Ug", "grad_prof"), ordered = TRUE) -data$edu_num <- as.numeric(data$edu3) - -# Check the numeric conversion -print(table(data$edu_num, useNA = "ifany")) - -# Verify the conversion -print(table(data$edu3, data$edu_num, useNA = "ifany")) - -####correlation matrix #### - -# Select numeric variables for correlation matrix -numeric_vars <- data %>% - select(ehiDGEN_5_mean, ehiDGEN_10_mean, ehi5_global_mean, ehi10_global_mean, sex_dummy, demo_age_1, edu_num, aot_total, crt_correct, crt_int) - -# Create Spearman correlation matrix -cor_matrix <- cor(numeric_vars, use = "complete.obs", method = "spearman") - -# Print correlation matrix -print(round(cor_matrix, 3)) - -# Get significance tests for correlations using psych package -library(psych) - -# Create correlation matrix with significance tests -cor_test <- corr.test(numeric_vars, method = "spearman", adjust = "none") - -# Print correlation matrix -print(round(cor_test$r, 3)) - -# Print p-values -print(round(cor_test$p, 3)) - -# Print all correlations with r and p values (for reporting) -for(i in 1:nrow(cor_test$r)) { - for(j in 1:ncol(cor_test$r)) { - if(i != j) { # Skip diagonal - cat(colnames(numeric_vars)[i], "vs", colnames(numeric_vars)[j], - ": r =", round(cor_test$r[i, j], 3), - ", p =", round(cor_test$p[i, j], 3), "\n") - } - } -} - -# Also print significant correlations summary -sig_cors <- which(cor_test$p < 0.05 & cor_test$p != 0, arr.ind = TRUE) -if(nrow(sig_cors) > 0) { - for(i in 1:nrow(sig_cors)) { - row_idx <- sig_cors[i, 1] - col_idx <- sig_cors[i, 2] - if(row_idx != col_idx) { # Skip diagonal - cat(colnames(numeric_vars)[row_idx], "vs", colnames(numeric_vars)[col_idx], - ": r =", round(cor_test$r[row_idx, col_idx], 3), - ", p =", round(cor_test$p[row_idx, col_idx], 3), "\n") - } - } -} - -# Save correlation matrix and p-values to CSV files -write.csv(cor_test$r, "correlation_matrix.csv", row.names = TRUE) -write.csv(cor_test$p, "correlation_pvalues.csv", row.names = TRUE) -print("Correlation matrix saved to correlation_matrix.csv") -print("P-values saved to correlation_pvalues.csv") \ 
No newline at end of file diff --git a/.history/eohi2/e2 - correlation matrix_20251027145125.r b/.history/eohi2/e2 - correlation matrix_20251027145125.r deleted file mode 100644 index 755cd29..0000000 --- a/.history/eohi2/e2 - correlation matrix_20251027145125.r +++ /dev/null @@ -1,100 +0,0 @@ -options(scipen = 999) - -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -df <- read.csv("eohi2.csv") - -data <- df %>% - select(ehiDGEN_5_mean, ehiDGEN_10_mean, ehi5_global_mean, ehi10_global_mean, demo_sex, demo_age_1, edu3, aot_total, crt_correct, crt_int) %>% - filter(demo_sex != "Prefer not to say") - -print(colSums(is.na(data))) -print(sapply(data, class)) - -# Create dummy variable for sex (0 = Male, 1 = Female) -data$sex_dummy <- ifelse(data$demo_sex == "Female", 1, 0) - -# Verify the dummy coding -print(table(data$demo_sex, data$sex_dummy)) - -#descriptives - -# Descriptives for age -print(summary(data$demo_age_1)) -print(sd(data$demo_age_1, na.rm = TRUE)) - - -# Descriptives for sex (frequency table) -print(table(data$demo_sex)) -print(prop.table(table(data$demo_sex))) - -# Descriptives for sex dummy variable -print(table(data$sex_dummy)) - -# Convert edu3 to numeric factor for correlations (1, 2, 3) -# First ensure edu3 is a factor, then convert to numeric -data$edu3 <- factor(data$edu3, levels = c("HS_TS", "C_Ug", "grad_prof"), ordered = TRUE) -data$edu_num <- as.numeric(data$edu3) - -# Check the numeric conversion -print(table(data$edu_num, useNA = "ifany")) - -# Verify the conversion -print(table(data$edu3, data$edu_num, useNA = "ifany")) - -####correlation matrix #### - -# Select numeric variables for correlation matrix -numeric_vars <- data %>% - select(ehiDGEN_5_mean, ehiDGEN_10_mean, ehi5_global_mean, ehi10_global_mean, sex_dummy, demo_age_1, edu_num, aot_total, crt_correct, crt_int) - -# Create Spearman correlation matrix -cor_matrix <- cor(numeric_vars, use = "complete.obs", method = "spearman") - -# Print correlation matrix -print(round(cor_matrix, 3)) - -# Get significance tests for correlations using psych package -library(psych) - -# Create correlation matrix with significance tests -cor_test <- corr.test(numeric_vars, method = "spearman", adjust = "none") - -# Print correlation matrix -print(round(cor_test$r, 3)) - -# Print p-values -print(round(cor_test$p, 3)) - -# Print all correlations with r and p values (for reporting) -for(i in 1:nrow(cor_test$r)) { - for(j in 1:ncol(cor_test$r)) { - if(i != j) { # Skip diagonal - cat(colnames(numeric_vars)[i], "vs", colnames(numeric_vars)[j], - ": r =", round(cor_test$r[i, j], 3), - ", p =", round(cor_test$p[i, j], 3), "\n") - } - } -} - -# Also print significant correlations summary -sig_cors <- which(cor_test$p < 0.05 & cor_test$p != 0, arr.ind = TRUE) -if(nrow(sig_cors) > 0) { - for(i in 1:nrow(sig_cors)) { - row_idx <- sig_cors[i, 1] - col_idx <- sig_cors[i, 2] - if(row_idx != col_idx) { # Skip diagonal - cat(colnames(numeric_vars)[row_idx], "vs", colnames(numeric_vars)[col_idx], - ": r =", round(cor_test$r[row_idx, col_idx], 3), - ", p =", round(cor_test$p[row_idx, col_idx], 3), "\n") - } - } -} - -# Save correlation matrix and p-values to CSV files -write.csv(cor_test$r, "correlation_matrix.csv", row.names = TRUE) -write.csv(cor_test$p, "correlation_pvalues.csv", row.names = TRUE) -print("Correlation matrix saved to correlation_matrix.csv") -print("P-values saved to correlation_pvalues.csv") \ No newline at end of file diff --git a/.history/eohi2/e2 - correlation matrix_20251027145133.r b/.history/eohi2/e2 - 
correlation matrix_20251027145133.r deleted file mode 100644 index 755cd29..0000000 --- a/.history/eohi2/e2 - correlation matrix_20251027145133.r +++ /dev/null @@ -1,100 +0,0 @@ -options(scipen = 999) - -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -df <- read.csv("eohi2.csv") - -data <- df %>% - select(ehiDGEN_5_mean, ehiDGEN_10_mean, ehi5_global_mean, ehi10_global_mean, demo_sex, demo_age_1, edu3, aot_total, crt_correct, crt_int) %>% - filter(demo_sex != "Prefer not to say") - -print(colSums(is.na(data))) -print(sapply(data, class)) - -# Create dummy variable for sex (0 = Male, 1 = Female) -data$sex_dummy <- ifelse(data$demo_sex == "Female", 1, 0) - -# Verify the dummy coding -print(table(data$demo_sex, data$sex_dummy)) - -#descriptives - -# Descriptives for age -print(summary(data$demo_age_1)) -print(sd(data$demo_age_1, na.rm = TRUE)) - - -# Descriptives for sex (frequency table) -print(table(data$demo_sex)) -print(prop.table(table(data$demo_sex))) - -# Descriptives for sex dummy variable -print(table(data$sex_dummy)) - -# Convert edu3 to numeric factor for correlations (1, 2, 3) -# First ensure edu3 is a factor, then convert to numeric -data$edu3 <- factor(data$edu3, levels = c("HS_TS", "C_Ug", "grad_prof"), ordered = TRUE) -data$edu_num <- as.numeric(data$edu3) - -# Check the numeric conversion -print(table(data$edu_num, useNA = "ifany")) - -# Verify the conversion -print(table(data$edu3, data$edu_num, useNA = "ifany")) - -####correlation matrix #### - -# Select numeric variables for correlation matrix -numeric_vars <- data %>% - select(ehiDGEN_5_mean, ehiDGEN_10_mean, ehi5_global_mean, ehi10_global_mean, sex_dummy, demo_age_1, edu_num, aot_total, crt_correct, crt_int) - -# Create Spearman correlation matrix -cor_matrix <- cor(numeric_vars, use = "complete.obs", method = "spearman") - -# Print correlation matrix -print(round(cor_matrix, 3)) - -# Get significance tests for correlations using psych package -library(psych) - -# Create correlation matrix with significance tests -cor_test <- corr.test(numeric_vars, method = "spearman", adjust = "none") - -# Print correlation matrix -print(round(cor_test$r, 3)) - -# Print p-values -print(round(cor_test$p, 3)) - -# Print all correlations with r and p values (for reporting) -for(i in 1:nrow(cor_test$r)) { - for(j in 1:ncol(cor_test$r)) { - if(i != j) { # Skip diagonal - cat(colnames(numeric_vars)[i], "vs", colnames(numeric_vars)[j], - ": r =", round(cor_test$r[i, j], 3), - ", p =", round(cor_test$p[i, j], 3), "\n") - } - } -} - -# Also print significant correlations summary -sig_cors <- which(cor_test$p < 0.05 & cor_test$p != 0, arr.ind = TRUE) -if(nrow(sig_cors) > 0) { - for(i in 1:nrow(sig_cors)) { - row_idx <- sig_cors[i, 1] - col_idx <- sig_cors[i, 2] - if(row_idx != col_idx) { # Skip diagonal - cat(colnames(numeric_vars)[row_idx], "vs", colnames(numeric_vars)[col_idx], - ": r =", round(cor_test$r[row_idx, col_idx], 3), - ", p =", round(cor_test$p[row_idx, col_idx], 3), "\n") - } - } -} - -# Save correlation matrix and p-values to CSV files -write.csv(cor_test$r, "correlation_matrix.csv", row.names = TRUE) -write.csv(cor_test$p, "correlation_pvalues.csv", row.names = TRUE) -print("Correlation matrix saved to correlation_matrix.csv") -print("P-values saved to correlation_pvalues.csv") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251003144019.r b/.history/eohi2/mixed anova - DGEN_20251003144019.r deleted file mode 100644 index e69de29..0000000 diff --git a/.history/eohi2/mixed anova - 
DGEN_20251003144020.r b/.history/eohi2/mixed anova - DGEN_20251003144020.r deleted file mode 100644 index cf12456..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251003144020.r +++ /dev/null @@ -1,21 +0,0 @@ -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251003150009.r b/.history/eohi2/mixed anova - DGEN_20251003150009.r deleted file mode 100644 index c8fa38b..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251003150009.r +++ /dev/null @@ -1,169 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required DGEN variables found!") -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 6), rep("Future", 6)), - DOMAIN = rep(c("Preferences", "Personality", "Values", "Preferences", "Personality", "Values"), 2), - INTERVAL = rep(c("5", "5", "5", "10", "10", "10"), 2), - stringsAsFactors = FALSE -) - -# Variable mapping created -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, temporal_DO, interval_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - 
names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(pID), - temporal_DO = as.factor(temporal_DO), - interval_DO = as.factor(interval_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) -print("Factor levels:") -print(paste("TIME:", paste(levels(long_data$TIME), collapse = ", "))) -print(paste("DOMAIN:", paste(levels(long_data$DOMAIN), collapse = ", "))) -print(paste("INTERVAL:", paste(levels(long_data$INTERVAL), collapse = ", "))) -print(paste("temporal_DO:", paste(levels(long_data$temporal_DO), collapse = ", "))) -print(paste("interval_DO:", paste(levels(long_data$interval_DO), collapse = ", "))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - 
difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251003150038.r b/.history/eohi2/mixed anova - DGEN_20251003150038.r deleted file mode 100644 index 396cca4..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251003150038.r +++ /dev/null @@ -1,360 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required DGEN variables found!") -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 6), rep("Future", 6)), - DOMAIN = rep(c("Preferences", "Personality", "Values", "Preferences", "Personality", "Values"), 2), - INTERVAL = rep(c("5", "5", "5", "10", "10", "10"), 2), - stringsAsFactors = FALSE -) - -# Variable mapping created -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, temporal_DO, interval_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(pID), - temporal_DO = as.factor(temporal_DO), - interval_DO = as.factor(interval_DO) - ) %>% 
- # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) -print("Factor levels:") -print(paste("TIME:", paste(levels(long_data$TIME), collapse = ", "))) -print(paste("DOMAIN:", paste(levels(long_data$DOMAIN), collapse = ", "))) -print(paste("INTERVAL:", paste(levels(long_data$INTERVAL), collapse = ", "))) -print(paste("temporal_DO:", paste(levels(long_data$temporal_DO), collapse = ", "))) -print(paste("interval_DO:", paste(levels(long_data$interval_DO), collapse = ", "))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# 
============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - 
dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251003150047.r b/.history/eohi2/mixed anova - DGEN_20251003150047.r deleted file mode 100644 index 396cca4..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251003150047.r +++ /dev/null @@ -1,360 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, 
DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required DGEN variables found!") -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 6), rep("Future", 6)), - DOMAIN = rep(c("Preferences", "Personality", "Values", "Preferences", "Personality", "Values"), 2), - INTERVAL = rep(c("5", "5", "5", "10", "10", "10"), 2), - stringsAsFactors = FALSE -) - -# Variable mapping created -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, temporal_DO, interval_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(pID), - temporal_DO = as.factor(temporal_DO), - interval_DO = as.factor(interval_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) -print("Factor levels:") -print(paste("TIME:", paste(levels(long_data$TIME), collapse = ", "))) -print(paste("DOMAIN:", paste(levels(long_data$DOMAIN), collapse = ", "))) -print(paste("INTERVAL:", paste(levels(long_data$INTERVAL), collapse = ", "))) -print(paste("temporal_DO:", paste(levels(long_data$temporal_DO), collapse = ", "))) -print(paste("interval_DO:", paste(levels(long_data$interval_DO), collapse = ", "))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# 
============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently 
- sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251003150106.r b/.history/eohi2/mixed anova - DGEN_20251003150106.r deleted file mode 100644 index 472bcca..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251003150106.r +++ /dev/null @@ -1,466 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - 
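# Why sum-to-zero contrasts matter here (illustrative sketch; the toy factor
# `g` below is hypothetical and not part of this analysis): Type-III sums of
# squares are only interpretable when factor codings sum to zero. With R's
# default treatment coding, a factor's "main effect" is evaluated at the
# reference level of the other factors instead of at their average.
g <- factor(c("a", "a", "b", "b"))
contrasts(g)    # under contr.sum (set above): a coded +1, b coded -1 (sums to zero)
options(contrasts = c("contr.treatment", "contr.poly"))
contrasts(g)    # default dummy coding: 0/1 relative to the reference level "a"
options(contrasts = c("contr.sum", "contr.poly"))    # restore before the ANOVA below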
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required DGEN variables found!") -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 6), rep("Future", 6)), - DOMAIN = rep(c("Preferences", "Personality", "Values", "Preferences", "Personality", "Values"), 2), - INTERVAL = rep(c("5", "5", "5", "10", "10", "10"), 2), - stringsAsFactors = FALSE -) - -# Variable mapping created -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, temporal_DO, interval_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(pID), - temporal_DO = as.factor(temporal_DO), - interval_DO = as.factor(interval_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) -print("Factor levels:") -print(paste("TIME:", paste(levels(long_data$TIME), collapse = ", "))) -print(paste("DOMAIN:", paste(levels(long_data$DOMAIN), collapse = ", "))) -print(paste("INTERVAL:", paste(levels(long_data$INTERVAL), collapse = ", "))) -print(paste("temporal_DO:", paste(levels(long_data$temporal_DO), collapse = ", "))) -print(paste("interval_DO:", paste(levels(long_data$interval_DO), collapse = ", "))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and 
INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 
<- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), temporal_DO (", - length(levels(long_data_clean$temporal_DO)), "), interval_DO (", - length(levels(long_data_clean$interval_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, 
Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (2 levels: 5, 10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251003150128.r b/.history/eohi2/mixed anova - DGEN_20251003150128.r deleted file mode 100644 index 30f04f6..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251003150128.r +++ /dev/null @@ -1,466 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# 
DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required DGEN variables found!") -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 6), rep("Future", 6)), - DOMAIN = rep(c("Preferences", "Personality", "Values", "Preferences", "Personality", "Values"), 2), - INTERVAL = rep(c("5", "5", "5", "10", "10", "10"), 2), - stringsAsFactors = FALSE -) - -# Variable mapping created -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, temporal_DO, interval_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(pID), - temporal_DO = as.factor(temporal_DO), - interval_DO = as.factor(interval_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) -print("Factor levels:") -print(paste("TIME:", paste(levels(long_data$TIME), collapse = ", "))) -print(paste("DOMAIN:", paste(levels(long_data$DOMAIN), collapse = ", "))) -print(paste("INTERVAL:", paste(levels(long_data$INTERVAL), collapse = ", "))) -print(paste("temporal_DO:", paste(levels(long_data$temporal_DO), collapse = ", "))) -print(paste("interval_DO:", paste(levels(long_data$interval_DO), collapse = ", "))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# 
Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently 
- sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), temporal_DO (", - length(levels(long_data_clean$temporal_DO)), "), interval_DO (", - length(levels(long_data_clean$interval_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - 
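# Each cell of design_balance counts the rows one participant contributes to a
# single TIME x DOMAIN x INTERVAL combination, so this summary of the cell
# counts shows how far the design deviates from the expected one observation
# per cell (values above 1 indicate duplicated observations, 0 a missing cell).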
print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (2 levels: 5, 10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) \ No newline at end of file diff --git 
a/.history/eohi2/mixed anova - DGEN_20251003150130.r b/.history/eohi2/mixed anova - DGEN_20251003150130.r deleted file mode 100644 index 8eab899..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251003150130.r +++ /dev/null @@ -1,466 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required DGEN variables found!") -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 6), rep("Future", 6)), - DOMAIN = rep(c("Preferences", "Personality", "Values", "Preferences", "Personality", "Values"), 2), - INTERVAL = rep(c("5", "5", "5", "10", "10", "10"), 2), - stringsAsFactors = FALSE -) - -# Variable mapping created -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, temporal_DO, interval_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(pID), - temporal_DO = as.factor(temporal_DO), - interval_DO = as.factor(interval_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) -print("Factor levels:") -print(paste("TIME:", paste(levels(long_data$TIME), collapse = ", "))) 
-print(paste("DOMAIN:", paste(levels(long_data$DOMAIN), collapse = ", "))) -print(paste("INTERVAL:", paste(levels(long_data$INTERVAL), collapse = ", "))) -print(paste("temporal_DO:", paste(levels(long_data$temporal_DO), collapse = ", "))) -print(paste("interval_DO:", paste(levels(long_data$interval_DO), collapse = ", "))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. 
Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for 
bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), temporal_DO (", - length(levels(long_data_clean$temporal_DO)), "), interval_DO (", - length(levels(long_data_clean$interval_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is 
balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (2 levels: 5, 10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== 
EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251003150137.r b/.history/eohi2/mixed anova - DGEN_20251003150137.r deleted file mode 100644 index 9d3b58d..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251003150137.r +++ /dev/null @@ -1,466 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required DGEN variables found!") -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 6), rep("Future", 6)), - DOMAIN = rep(c("Preferences", "Personality", "Values", "Preferences", "Personality", "Values"), 2), - INTERVAL = rep(c("5", "5", "5", "10", "10", "10"), 2), - stringsAsFactors = FALSE -) - -# Variable mapping created -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, temporal_DO, interval_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(pID), - temporal_DO = as.factor(temporal_DO), - interval_DO = as.factor(interval_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, 
DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) -print("Factor levels:") -print(paste("TIME:", paste(levels(long_data$TIME), collapse = ", "))) -print(paste("DOMAIN:", paste(levels(long_data$DOMAIN), collapse = ", "))) -print(paste("INTERVAL:", paste(levels(long_data$INTERVAL), collapse = ", "))) -print(paste("temporal_DO:", paste(levels(long_data$temporal_DO), collapse = ", "))) -print(paste("interval_DO:", paste(levels(long_data$interval_DO), collapse = ", "))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# 
============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - 
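-    # Note: Hartley's F-max is the ratio of the largest to the smallest group
-    # variance; under homogeneity of variance it should be close to 1.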
dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), temporal_DO (", - 
length(levels(long_data_clean$temporal_DO)), "), interval_DO (", - length(levels(long_data_clean$interval_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (2 levels: 5, 10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } 
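-  # Illustrative check: each corrected p-value should match the F tail
-  # probability at the epsilon-scaled degrees of freedom, e.g. for GG:
-  #   pf(f_value, corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], lower.tail = FALSE)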
-} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251003150143.r b/.history/eohi2/mixed anova - DGEN_20251003150143.r deleted file mode 100644 index 9d34a95..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251003150143.r +++ /dev/null @@ -1,735 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required DGEN variables found!") -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 6), rep("Future", 6)), - DOMAIN = rep(c("Preferences", "Personality", "Values", "Preferences", "Personality", "Values"), 2), - INTERVAL = rep(c("5", "5", "5", "10", "10", "10"), 2), - stringsAsFactors = FALSE -) - -# Variable mapping created -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, temporal_DO, interval_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", 
"Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(pID), - temporal_DO = as.factor(temporal_DO), - interval_DO = as.factor(interval_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) -print("Factor levels:") -print(paste("TIME:", paste(levels(long_data$TIME), collapse = ", "))) -print(paste("DOMAIN:", paste(levels(long_data$DOMAIN), collapse = ", "))) -print(paste("INTERVAL:", paste(levels(long_data$INTERVAL), collapse = ", "))) -print(paste("temporal_DO:", paste(levels(long_data$temporal_DO), collapse = ", "))) -print(paste("interval_DO:", paste(levels(long_data$interval_DO), collapse = ", "))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = 
round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), temporal_DO (", - length(levels(long_data_clean$temporal_DO)), "), interval_DO (", - length(levels(long_data_clean$interval_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (2 levels: 5, 10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity 
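-# (Mauchly's W tests whether the covariance matrix of the within-subject
-# differences is spherical; a significant result means the uncorrected df are
-# too liberal, which is why the epsilon corrections below are reported.)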
-print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") 
-print(domain_contrasts) - -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("\nMain Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("\nMain Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("\n=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print("Estimated Marginal Means:") -print(time_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each TIME:") -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) - -print("\nSimple Effects of TIME within each INTERVAL:") -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("\n=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(domain_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each DOMAIN:") -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) - -print("\nSimple Effects of DOMAIN within each INTERVAL:") -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("\n=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print("Estimated Marginal Means:") -print(temporal_interval_do_emmeans) - -print("\nSimple Effects of interval_DO within each temporal_DO:") -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) - -print("\nSimple Effects of temporal_DO within each interval_DO:") -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction 
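-# (probed below as the Past vs Future simple effect of TIME within each of
-# the 3 × 2 = 6 DOMAIN × INTERVAL cells)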
-print("\n=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each DOMAIN × INTERVAL combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("\n=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each temporal_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of temporal_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -print("Cohen's d for significant main effects:") -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- 
as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -print("\nCohen's d for significant two-way interactions:") -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -print("\nCohen's d for significant three-way interaction effects:") -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("DOMAIN = %s, INTERVAL = %s:\n", domain_level, interval_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -print("\n=== ANALYSIS COMPLETE ===") -print("All significant and marginal effects have been analyzed with Cohen's d calculations.") -print("The analysis includes:") -print("- Descriptive statistics by all factor combinations") -print("- Comprehensive assumption testing (normality, homogeneity of variance, Hartley's F-max)") -print("- Mixed ANOVA with sphericity corrections") -print("- Effect sizes (generalized eta squared)") -print("- Post-hoc comparisons with Bonferroni correction") -print("- Cohen's d calculations for significant effects") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251003150144.r b/.history/eohi2/mixed anova - DGEN_20251003150144.r deleted file mode 100644 index 9d34a95..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251003150144.r +++ /dev/null @@ -1,735 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) 
-library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required DGEN variables found!") -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 6), rep("Future", 6)), - DOMAIN = rep(c("Preferences", "Personality", "Values", "Preferences", "Personality", "Values"), 2), - INTERVAL = rep(c("5", "5", "5", "10", "10", "10"), 2), - stringsAsFactors = FALSE -) - -# Variable mapping created -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, temporal_DO, interval_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(pID), - temporal_DO = as.factor(temporal_DO), - interval_DO = as.factor(interval_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) -print("Factor levels:") -print(paste("TIME:", paste(levels(long_data$TIME), collapse = ", "))) -print(paste("DOMAIN:", paste(levels(long_data$DOMAIN), collapse = ", "))) -print(paste("INTERVAL:", paste(levels(long_data$INTERVAL), collapse = ", "))) -print(paste("temporal_DO:", paste(levels(long_data$temporal_DO), collapse = ", "))) -print(paste("interval_DO:", paste(levels(long_data$interval_DO), collapse = ", "))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - 
mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently 
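-    # (resampling within each group keeps group sizes and per-group
-    # distributions fixed, so the 95th percentile of the resampled F-max
-    # ratios serves as an empirical critical value)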
- sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), temporal_DO (", - length(levels(long_data_clean$temporal_DO)), "), interval_DO (", - length(levels(long_data_clean$interval_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - 
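-  # Note: with unbalanced cells, Type III sums of squares (type = 3 below)
-  # test each effect adjusted for all others; this is also why sum-to-zero
-  # contrasts were set at the top of the script.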
print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (2 levels: 5, 10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# 
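# Note: the corrected p-values above can be reproduced by hand: scale both
# degrees of freedom by the epsilon estimate and re-evaluate the F
# distribution. A sketch for the first corrected effect, assuming the
# corrected_df / anova_table / sphericity_corr objects built above (they
# exist only when corrections were produced):
#   f_1  <- anova_table$F[match(corrected_df$Effect[1], anova_table$Effect)]
#   p_gg <- pf(f_1, corrected_df$GG_DFn[1], corrected_df$GG_DFd[1],
#              lower.tail = FALSE)
#   p_gg                     # should agree with sphericity_corr$`p[GG]`[1]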
============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("\nMain Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("\nMain Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("\n=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print("Estimated Marginal Means:") -print(time_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each TIME:") -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) - -print("\nSimple Effects of TIME within each INTERVAL:") -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("\n=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(domain_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each DOMAIN:") -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) - -print("\nSimple Effects of DOMAIN within each INTERVAL:") -domain_interval_simple2 <- 
pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("\n=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print("Estimated Marginal Means:") -print(temporal_interval_do_emmeans) - -print("\nSimple Effects of interval_DO within each temporal_DO:") -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) - -print("\nSimple Effects of temporal_DO within each interval_DO:") -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("\n=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each DOMAIN × INTERVAL combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("\n=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each temporal_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of temporal_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && 
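# Note: cohen.d(x, y) from effsize, as used below, computes the classic
# pooled-SD d = (mean(x) - mean(y)) / s_pooled and labels its magnitude
# (negligible/small/medium/large). For the within-subject factors here
# (TIME, DOMAIN, INTERVAL) the two vectors come from the same participants,
# so the independent-samples form ignores that pairing;
# cohen.d(x, y, paired = TRUE) is the paired alternative once the vectors
# are aligned by pID. A toy illustration (illustrative names only):
#   library(effsize)
#   x <- rnorm(40, mean = 0.4); y <- rnorm(40, mean = 0)
#   cohen.d(x, y)$estimate   # ~0.4 in expectation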
length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -print("Cohen's d for significant main effects:") -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -print("\nCohen's d for significant two-way interactions:") -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -print("\nCohen's d for significant three-way interaction effects:") -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("DOMAIN = %s, INTERVAL = %s:\n", domain_level, interval_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -print("\n=== ANALYSIS COMPLETE ===") -print("All significant and marginal effects have been analyzed with Cohen's d calculations.") -print("The analysis includes:") -print("- Descriptive statistics by all factor 
combinations") -print("- Comprehensive assumption testing (normality, homogeneity of variance, Hartley's F-max)") -print("- Mixed ANOVA with sphericity corrections") -print("- Effect sizes (generalized eta squared)") -print("- Post-hoc comparisons with Bonferroni correction") -print("- Cohen's d calculations for significant effects") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251003150214.r b/.history/eohi2/mixed anova - DGEN_20251003150214.r deleted file mode 100644 index 9d34a95..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251003150214.r +++ /dev/null @@ -1,735 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required DGEN variables found!") -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 6), rep("Future", 6)), - DOMAIN = rep(c("Preferences", "Personality", "Values", "Preferences", "Personality", "Values"), 2), - INTERVAL = rep(c("5", "5", "5", "10", "10", "10"), 2), - stringsAsFactors = FALSE -) - -# Variable mapping created -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, temporal_DO, interval_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(pID), - temporal_DO = as.factor(temporal_DO), - interval_DO = as.factor(interval_DO) - ) %>% - # Select final columns and remove any rows with missing values - 
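# Note: because the column names themselves encode the design
# ("DGEN_<time>_<interval>_<domain>"), the lookup-table join above could be
# replaced by parsing the names directly. A sketch of the equivalent call,
# assuming the same wide data frame:
#   pivot_longer(data, cols = all_of(required_vars),
#                names_pattern = "DGEN_(past|fut)_(5|10)_(Pref|Pers|Val)",
#                names_to = c("TIME", "INTERVAL", "DOMAIN"),
#                values_to = "DGEN_SCORE")
# followed by recoding past/fut and Pref/Pers/Val to the factor labels used
# in this script.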
select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) -print("Factor levels:") -print(paste("TIME:", paste(levels(long_data$TIME), collapse = ", "))) -print(paste("DOMAIN:", paste(levels(long_data$DOMAIN), collapse = ", "))) -print(paste("INTERVAL:", paste(levels(long_data$INTERVAL), collapse = ", "))) -print(paste("temporal_DO:", paste(levels(long_data$temporal_DO), collapse = ", "))) -print(paste("interval_DO:", paste(levels(long_data$interval_DO), collapse = ", "))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# 
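# Note: the paired mean differences computed above can be cross-checked
# against a paired t-test. A sketch for one cell, assuming the long_data
# columns defined in this script and complete paired data:
#   past5 <- long_data %>%
#     filter(TIME == "Past", DOMAIN == "Preferences", INTERVAL == "5") %>%
#     arrange(pID)
#   fut5 <- long_data %>%
#     filter(TIME == "Future", DOMAIN == "Preferences", INTERVAL == "5") %>%
#     arrange(pID)
#   t.test(past5$DGEN_SCORE, fut5$DGEN_SCORE, paired = TRUE)
# The mean of the differences should match the mean_diff reported for that
# DOMAIN × INTERVAL cell.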
============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - 
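# Note on the leveneTest() calls earlier in this section: they are written
# with a bare formula inside summarise(), which works because dplyr's data
# mask supplies DGEN_SCORE and the grouping factor to the formula's
# environment. An equivalent, more explicit form passes the data in, e.g.:
#   leveneTest(DGEN_SCORE ~ TIME, data = pick(DGEN_SCORE, TIME))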
dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), temporal_DO (", - 
length(levels(long_data_clean$temporal_DO)), "), interval_DO (", - length(levels(long_data_clean$interval_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (2 levels: 5, 10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } 
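# Note: the header loads afex "for aov_ez" but the script never calls it.
# For reference, a sketch of the equivalent afex specification (same data
# and factors as the ezANOVA call above); afex applies a Greenhouse-Geisser
# correction to within-subject tests by default:
#   aov_ez(id = "pID", dv = "DGEN_SCORE", data = long_data_clean,
#          between = c("temporal_DO", "interval_DO"),
#          within = c("TIME", "DOMAIN", "INTERVAL"), type = 3)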
-} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("\nMain Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("\nMain Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("\n=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print("Estimated Marginal Means:") -print(time_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each TIME:") -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) - -print("\nSimple Effects of TIME within each INTERVAL:") -time_interval_simple2 <- 
pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("\n=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(domain_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each DOMAIN:") -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) - -print("\nSimple Effects of DOMAIN within each INTERVAL:") -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("\n=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print("Estimated Marginal Means:") -print(temporal_interval_do_emmeans) - -print("\nSimple Effects of interval_DO within each temporal_DO:") -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) - -print("\nSimple Effects of temporal_DO within each interval_DO:") -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("\n=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each DOMAIN × INTERVAL combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("\n=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each temporal_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of temporal_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 
<- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -print("Cohen's d for significant main effects:") -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -print("\nCohen's d for significant two-way interactions:") -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -print("\nCohen's d for significant three-way interaction effects:") -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("DOMAIN = %s, INTERVAL = %s:\n", domain_level, interval_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" 
Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -print("\n=== ANALYSIS COMPLETE ===") -print("All significant and marginal effects have been analyzed with Cohen's d calculations.") -print("The analysis includes:") -print("- Descriptive statistics by all factor combinations") -print("- Comprehensive assumption testing (normality, homogeneity of variance, Hartley's F-max)") -print("- Mixed ANOVA with sphericity corrections") -print("- Effect sizes (generalized eta squared)") -print("- Post-hoc comparisons with Bonferroni correction") -print("- Cohen's d calculations for significant effects") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251003150251.r b/.history/eohi2/mixed anova - DGEN_20251003150251.r deleted file mode 100644 index b56557e..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251003150251.r +++ /dev/null @@ -1,736 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required DGEN variables found!") -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 6), rep("Future", 6)), - DOMAIN = rep(c("Preferences", "Personality", "Values", "Preferences", "Personality", "Values"), 2), - INTERVAL = rep(c("5", "5", "5", "10", "10", "10"), 2), - stringsAsFactors = FALSE -) - -# Variable mapping created -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, temporal_DO, interval_DO, all_of(required_vars)) 
%>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(pID), - temporal_DO = as.factor(temporal_DO), - interval_DO = as.factor(interval_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) -print("Factor levels:") -print(paste("TIME:", paste(levels(long_data$TIME), collapse = ", "))) -print(paste("DOMAIN:", paste(levels(long_data$DOMAIN), collapse = ", "))) -print(paste("INTERVAL:", paste(levels(long_data$INTERVAL), collapse = ", "))) -print(paste("temporal_DO:", paste(levels(long_data$temporal_DO), collapse = ", "))) -print(paste("interval_DO:", paste(levels(long_data$interval_DO), collapse = ", "))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - 
interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), temporal_DO (", - length(levels(long_data_clean$temporal_DO)), "), interval_DO (", - length(levels(long_data_clean$interval_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (2 levels: 5, 10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity 
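# Note, two small points here. First, `$Mauchly` below works only via R's
# partial matching of list names; the element ezANOVA actually returns is
# named in full, so the exact-name form is:
#   mixed_anova_model[["Mauchly's Test for Sphericity"]]
# Second, sphericity is only testable for within-subject effects with three
# or more levels, so only effects involving DOMAIN (3 levels) appear in the
# table; TIME and INTERVAL (2 levels each) satisfy sphericity trivially.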
-print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") 
-print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("\nMain Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("\nMain Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("\n=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print("Estimated Marginal Means:") -print(time_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each TIME:") -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) - -print("\nSimple Effects of TIME within each INTERVAL:") -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("\n=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(domain_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each DOMAIN:") -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) - -print("\nSimple Effects of DOMAIN within each INTERVAL:") -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("\n=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print("Estimated Marginal Means:") -print(temporal_interval_do_emmeans) - -print("\nSimple Effects of interval_DO within each temporal_DO:") -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) - -print("\nSimple Effects of temporal_DO within each interval_DO:") -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("\n=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print("Estimated Marginal 
Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each DOMAIN × INTERVAL combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("\n=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each temporal_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of temporal_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - cat("Significant pairwise comparisons (p < 0.05):\n") - print(significant_pairs) - - cat("\nCohen's d calculated from raw data:\n") - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - cat(sprintf("Comparison: %s", contrast_name)) - if(group2_var %in% colnames(comparison)) { - cat(sprintf(" | %s", group2_level)) - } - cat(sprintf("\n n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat("\n") - } - } - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -print("Cohen's d for significant main effects:") -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate 
Cohen's d for two-way interactions -print("\nCohen's d for significant two-way interactions:") -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -print("\nCohen's d for significant three-way interaction effects:") -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("DOMAIN = %s, INTERVAL = %s:\n", domain_level, interval_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -print("\n=== ANALYSIS COMPLETE ===") -print("All significant and marginal effects have been analyzed with Cohen's d calculations.") -print("The analysis includes:") -print("- Descriptive statistics by all factor combinations") -print("- Comprehensive assumption testing (normality, homogeneity of variance, Hartley's F-max)") -print("- Mixed ANOVA with sphericity corrections") -print("- Effect sizes (generalized eta squared)") -print("- Post-hoc comparisons with Bonferroni correction") -print("- Cohen's d calculations for significant effects") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251003150313.r b/.history/eohi2/mixed anova - DGEN_20251003150313.r deleted file mode 100644 index 883b33b..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251003150313.r +++ /dev/null @@ -1,747 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc 
comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required DGEN variables found!") -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 6), rep("Future", 6)), - DOMAIN = rep(c("Preferences", "Personality", "Values", "Preferences", "Personality", "Values"), 2), - INTERVAL = rep(c("5", "5", "5", "10", "10", "10"), 2), - stringsAsFactors = FALSE -) - -# Variable mapping created -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, temporal_DO, interval_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(pID), - temporal_DO = as.factor(temporal_DO), - interval_DO = as.factor(interval_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) -print("Factor levels:") -print(paste("TIME:", paste(levels(long_data$TIME), collapse = ", "))) -print(paste("DOMAIN:", paste(levels(long_data$DOMAIN), collapse = ", "))) -print(paste("INTERVAL:", paste(levels(long_data$INTERVAL), collapse = ", "))) -print(paste("temporal_DO:", paste(levels(long_data$temporal_DO), collapse = ", "))) -print(paste("interval_DO:", paste(levels(long_data$interval_DO), collapse = ", "))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), 
- median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 
<- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), temporal_DO (", - length(levels(long_data_clean$temporal_DO)), "), interval_DO (", - length(levels(long_data_clean$interval_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, 
Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (2 levels: 5, 10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") 
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("\nMain Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("\nMain Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("\n=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print("Estimated Marginal Means:") -print(time_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each TIME:") -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) - -print("\nSimple Effects of TIME within each INTERVAL:") -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("\n=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(domain_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each DOMAIN:") -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) - -print("\nSimple Effects of DOMAIN within each INTERVAL:") -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("\n=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print("Estimated Marginal Means:") -print(temporal_interval_do_emmeans) - -print("\nSimple Effects of interval_DO within each temporal_DO:") -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) - -print("\nSimple Effects of temporal_DO within each interval_DO:") -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, 
by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("\n=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each DOMAIN × INTERVAL combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("\n=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each temporal_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of temporal_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print("Significant pairwise comparisons (p < 0.05):") - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print("Cohen's d results:") - print(cohens_d_results) - } - } else { - print("No significant pairwise comparisons found.") - 
} -} - -# Calculate Cohen's d for main effects -print("Cohen's d for significant main effects:") -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -print("\nCohen's d for significant two-way interactions:") -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -print("\nCohen's d for significant three-way interaction effects:") -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("DOMAIN = %s, INTERVAL = %s:\n", domain_level, interval_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -print("\n=== ANALYSIS COMPLETE ===") -print("All significant and marginal effects have been analyzed with Cohen's d calculations.") -print("The analysis includes:") -print("- Descriptive statistics by all factor combinations") -print("- Comprehensive assumption testing (normality, homogeneity of variance, Hartley's F-max)") -print("- Mixed ANOVA with sphericity corrections") -print("- Effect sizes (generalized eta squared)") -print("- Post-hoc comparisons with Bonferroni correction") -print("- Cohen's d calculations for significant effects") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251003150330.r b/.history/eohi2/mixed anova - DGEN_20251003150330.r deleted file mode 100644 index f117c81..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251003150330.r +++ /dev/null @@ 
-1,759 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required DGEN variables found!") -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 6), rep("Future", 6)), - DOMAIN = rep(c("Preferences", "Personality", "Values", "Preferences", "Personality", "Values"), 2), - INTERVAL = rep(c("5", "5", "5", "10", "10", "10"), 2), - stringsAsFactors = FALSE -) - -# Variable mapping created -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, temporal_DO, interval_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(pID), - temporal_DO = as.factor(temporal_DO), - interval_DO = as.factor(interval_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) -print("Factor levels:") -print(paste("TIME:", paste(levels(long_data$TIME), collapse = ", "))) -print(paste("DOMAIN:", paste(levels(long_data$DOMAIN), collapse = ", "))) -print(paste("INTERVAL:", paste(levels(long_data$INTERVAL), collapse = ", "))) -print(paste("temporal_DO:", paste(levels(long_data$temporal_DO), collapse = ", "))) 
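# A minimal sketch (two fake columns, fake scores) of the reshape pattern used
# above to build long_data: pivot wide score columns into long format, then
# join a lookup table that decodes each column name into design-factor levels.
library(tidyverse)
toy_wide <- tibble(pID = 1:2,
                   DGEN_past_5_Pref = c(3, 4),
                   DGEN_fut_5_Pref  = c(2, 5))
toy_map <- tibble(variable = c("DGEN_past_5_Pref", "DGEN_fut_5_Pref"),
                  TIME     = c("Past", "Future"))
toy_wide %>%
  pivot_longer(-pID, names_to = "variable", values_to = "DGEN_SCORE") %>%
  left_join(toy_map, by = "variable")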
-print(paste("interval_DO:", paste(levels(long_data$interval_DO), collapse = ", "))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. 
Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for 
bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), temporal_DO (", - length(levels(long_data_clean$temporal_DO)), "), interval_DO (", - length(levels(long_data_clean$interval_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is 
balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (2 levels: 5, 10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- 
mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("\nMain Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("\nMain Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("\n=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print("Estimated Marginal Means:") -print(time_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each TIME:") -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) - -print("\nSimple Effects of TIME within each INTERVAL:") -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("\n=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(domain_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each DOMAIN:") -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = 
"bonferroni") -print(domain_interval_simple) - -print("\nSimple Effects of DOMAIN within each INTERVAL:") -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("\n=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print("Estimated Marginal Means:") -print(temporal_interval_do_emmeans) - -print("\nSimple Effects of interval_DO within each temporal_DO:") -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) - -print("\nSimple Effects of temporal_DO within each interval_DO:") -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("\n=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each DOMAIN × INTERVAL combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("\n=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each temporal_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of temporal_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print("Significant pairwise comparisons (p < 0.05):") - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else 
{ - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print("Cohen's d results:") - print(cohens_d_results) - } - } else { - print("No significant pairwise comparisons found.") - } -} - -# Calculate Cohen's d for main effects -print("Cohen's d for significant main effects:") -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -print("\nCohen's d for significant two-way interactions:") -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -print("\nCohen's d for significant three-way interaction effects:") -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print("Three-way interaction Cohen's d results:") - 
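# NOTE (editorial sketch): cohen.d() above compares the Past and Future score
# vectors as if they were independent groups, but TIME is a within-subjects
# factor, so a paired standardized difference may be the more appropriate
# summary. effsize::cohen.d() supports paired = TRUE; the helper below is a
# hypothetical illustration (aligning observations by sorting on pID is an
# assumption, not part of the original script).
paired_d_sketch <- function(df, domain_level, interval_level) {
  # Subset one DOMAIN × INTERVAL cell, split by TIME
  past <- df[df$DOMAIN == domain_level & df$INTERVAL == interval_level & df$TIME == "Past", ]
  fut  <- df[df$DOMAIN == domain_level & df$INTERVAL == interval_level & df$TIME == "Future", ]
  # Align the two vectors participant-by-participant before pairing
  past <- past[order(past$pID), ]
  fut  <- fut[order(fut$pID), ]
  cohen.d(past$DGEN_SCORE, fut$DGEN_SCORE, paired = TRUE)
}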
print(three_way_cohens_d) - } -} else { - print("No significant TIME effects found within any DOMAIN × INTERVAL combination.") -} - -print("\n=== ANALYSIS COMPLETE ===") -print("All significant and marginal effects have been analyzed with Cohen's d calculations.") -print("The analysis includes:") -print("- Descriptive statistics by all factor combinations") -print("- Comprehensive assumption testing (normality, homogeneity of variance, Hartley's F-max)") -print("- Mixed ANOVA with sphericity corrections") -print("- Effect sizes (generalized eta squared)") -print("- Post-hoc comparisons with Bonferroni correction") -print("- Cohen's d calculations for significant effects") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251003150337.r b/.history/eohi2/mixed anova - DGEN_20251003150337.r deleted file mode 100644 index 6b53f74..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251003150337.r +++ /dev/null @@ -1,751 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required DGEN variables found!") -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 6), rep("Future", 6)), - DOMAIN = rep(c("Preferences", "Personality", "Values", "Preferences", "Personality", "Values"), 2), - INTERVAL = rep(c("5", "5", "5", "10", "10", "10"), 2), - stringsAsFactors = FALSE -) - -# Variable mapping created -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, temporal_DO, interval_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, 
levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(pID), - temporal_DO = as.factor(temporal_DO), - interval_DO = as.factor(interval_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) -print("Factor levels:") -print(paste("TIME:", paste(levels(long_data$TIME), collapse = ", "))) -print(paste("DOMAIN:", paste(levels(long_data$DOMAIN), collapse = ", "))) -print(paste("INTERVAL:", paste(levels(long_data$INTERVAL), collapse = ", "))) -print(paste("temporal_DO:", paste(levels(long_data$temporal_DO), collapse = ", "))) -print(paste("interval_DO:", paste(levels(long_data$interval_DO), collapse = ", "))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 
5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), temporal_DO (", - length(levels(long_data_clean$temporal_DO)), "), interval_DO (", - length(levels(long_data_clean$interval_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (2 levels: 5, 10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity 
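# NOTE (editorial): a significant Mauchly W below means the sphericity
# assumption is violated for that within-subjects effect; a common rule of
# thumb is to report the Greenhouse-Geisser p-value when the GG epsilon is
# under ~0.75 and the Huynh-Feldt p-value otherwise. Since afex is already
# loaded above, the same design can be cross-checked with aov_ez(), which
# applies the GG correction by default. A minimal sketch under the same
# column names (an addition, not part of the original script):
afex_check <- aov_ez(id = "pID", dv = "DGEN_SCORE", data = long_data_clean,
                     between = c("temporal_DO", "interval_DO"),
                     within = c("TIME", "DOMAIN", "INTERVAL"), type = 3)
print(afex_check)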
-print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") 
-print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("\nMain Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("\nMain Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("\n=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print("Estimated Marginal Means:") -print(time_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each TIME:") -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) - -print("\nSimple Effects of TIME within each INTERVAL:") -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("\n=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(domain_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each DOMAIN:") -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) - -print("\nSimple Effects of DOMAIN within each INTERVAL:") -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("\n=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print("Estimated Marginal Means:") -print(temporal_interval_do_emmeans) - -print("\nSimple Effects of interval_DO within each temporal_DO:") -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) - -print("\nSimple Effects of temporal_DO within each interval_DO:") -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("\n=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print("Estimated Marginal 
Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each DOMAIN × INTERVAL combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("\n=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each temporal_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of temporal_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print("Significant pairwise comparisons (p < 0.05):") - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print("Cohen's d results:") - print(cohens_d_results) - } - } else { - print("No significant pairwise comparisons found.") - } -} - -# Calculate Cohen's d for main effects -print("Cohen's d for significant main effects:") -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) 
-calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -print("\nCohen's d for significant two-way interactions:") -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -print("\nCohen's d for significant three-way interaction effects:") -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print("Three-way interaction Cohen's d results:") - print(three_way_cohens_d) - } -} else { - print("No significant TIME effects found within any DOMAIN × INTERVAL combination.") -} - -print("=== ANALYSIS COMPLETE ===") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251003150346.r b/.history/eohi2/mixed anova - DGEN_20251003150346.r deleted file mode 100644 index 6b53f74..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251003150346.r +++ /dev/null @@ -1,751 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper 
interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required DGEN variables found!") -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 6), rep("Future", 6)), - DOMAIN = rep(c("Preferences", "Personality", "Values", "Preferences", "Personality", "Values"), 2), - INTERVAL = rep(c("5", "5", "5", "10", "10", "10"), 2), - stringsAsFactors = FALSE -) - -# Variable mapping created -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, temporal_DO, interval_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(pID), - temporal_DO = as.factor(temporal_DO), - interval_DO = as.factor(interval_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) -print("Factor levels:") -print(paste("TIME:", paste(levels(long_data$TIME), collapse = ", "))) -print(paste("DOMAIN:", paste(levels(long_data$DOMAIN), collapse = ", "))) -print(paste("INTERVAL:", paste(levels(long_data$INTERVAL), collapse = ", "))) -print(paste("temporal_DO:", paste(levels(long_data$temporal_DO), collapse = ", "))) -print(paste("interval_DO:", paste(levels(long_data$interval_DO), collapse = ", "))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 
'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 
<- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), temporal_DO (", - length(levels(long_data_clean$temporal_DO)), "), interval_DO (", - length(levels(long_data_clean$interval_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, 
Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (2 levels: 5, 10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") 
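# NOTE (editorial): pairs() prints t-ratios and adjusted p-values but no
# interval estimates; wrapping the contrast in confint() adds adjusted
# confidence intervals. For a two-level factor such as TIME the Bonferroni
# adjustment is a no-op (there is only one comparison). A one-line sketch
# (an addition, not part of the original script):
print(confint(pairs(time_emmeans, adjust = "bonferroni")))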
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("\nMain Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("\nMain Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("\n=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print("Estimated Marginal Means:") -print(time_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each TIME:") -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) - -print("\nSimple Effects of TIME within each INTERVAL:") -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("\n=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(domain_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each DOMAIN:") -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) - -print("\nSimple Effects of DOMAIN within each INTERVAL:") -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("\n=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print("Estimated Marginal Means:") -print(temporal_interval_do_emmeans) - -print("\nSimple Effects of interval_DO within each temporal_DO:") -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) - -print("\nSimple Effects of temporal_DO within each interval_DO:") -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, 
by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("\n=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each DOMAIN × INTERVAL combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("\n=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each temporal_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of temporal_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print("Significant pairwise comparisons (p < 0.05):") - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print("Cohen's d results:") - print(cohens_d_results) - } - } else { - print("No significant pairwise comparisons found.") - 
} -} - -# Calculate Cohen's d for main effects -print("Cohen's d for significant main effects:") -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -print("\nCohen's d for significant two-way interactions:") -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -print("\nCohen's d for significant three-way interaction effects:") -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print("Three-way interaction Cohen's d results:") - print(three_way_cohens_d) - } -} else { - print("No significant TIME effects found within any DOMAIN × INTERVAL combination.") -} - -print("=== ANALYSIS COMPLETE ===") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251003152314.r b/.history/eohi2/mixed anova - DGEN_20251003152314.r deleted file mode 100644 index 6b53f74..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251003152314.r +++ /dev/null @@ -1,751 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required 
libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required DGEN variables found!") -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 6), rep("Future", 6)), - DOMAIN = rep(c("Preferences", "Personality", "Values", "Preferences", "Personality", "Values"), 2), - INTERVAL = rep(c("5", "5", "5", "10", "10", "10"), 2), - stringsAsFactors = FALSE -) - -# Variable mapping created -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(pID, ResponseId, temporal_DO, interval_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(pID), - temporal_DO = as.factor(temporal_DO), - interval_DO = as.factor(interval_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) -print("Factor levels:") -print(paste("TIME:", paste(levels(long_data$TIME), collapse = ", "))) -print(paste("DOMAIN:", paste(levels(long_data$DOMAIN), collapse = ", "))) -print(paste("INTERVAL:", paste(levels(long_data$INTERVAL), collapse = ", "))) -print(paste("temporal_DO:", paste(levels(long_data$temporal_DO), collapse = ", "))) -print(paste("interval_DO:", paste(levels(long_data$interval_DO), collapse = ", "))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) 
%>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently 
- sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), temporal_DO (", - length(levels(long_data_clean$temporal_DO)), "), interval_DO (", - length(levels(long_data_clean$interval_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - 
print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (2 levels: 5, 10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - 
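The afex package is attached at the top of this script "For aov_ez (cleaner ANOVA output)" but never actually called. As a hedged alternative to the aov() refit below (a sketch, assuming long_data_clean and the factor coding are exactly as constructed earlier in this script), aov_ez() fits the same 2 (temporal_DO) × 2 (interval_DO) between-subjects by 2 (TIME) × 3 (DOMAIN) × 2 (INTERVAL) within-subjects design, applies the sphericity corrections automatically, and returns an afex_aov object that emmeans() accepts directly:

library(afex)
library(emmeans)

# Sketch: one call replaces ezANOVA plus the manual GG/HF bookkeeping above
afex_model <- aov_ez(
  id      = "pID",
  dv      = "DGEN_SCORE",
  data    = long_data_clean,
  between = c("temporal_DO", "interval_DO"),
  within  = c("TIME", "DOMAIN", "INTERVAL"),
  type    = 3
)

print(afex_model)            # ANOVA table with sphericity-corrected p-values
emmeans(afex_model, ~ TIME)  # same marginal-means pathway as the aov() route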
-# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("\nMain Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("\nMain Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("\n=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print("Estimated Marginal Means:") -print(time_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each TIME:") -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) - -print("\nSimple Effects of TIME within each INTERVAL:") -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("\n=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(domain_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each DOMAIN:") -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) - -print("\nSimple Effects of DOMAIN within each INTERVAL:") -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("\n=== temporal_DO × interval_DO INTERACTION ===") 
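Before reading the contrast tables for this interaction, a plot of the cell means is often quicker to interpret. A minimal sketch using emmeans' emmip() (assumes the aov_model fitted above, and that ggplot2 is installed, which emmip() uses for drawing):

library(emmeans)

# Estimated marginal means of DGEN_SCORE for each temporal_DO × interval_DO
# cell, with 95% confidence intervals; one trace line per temporal_DO group
emmip(aov_model, temporal_DO ~ interval_DO, CIs = TRUE)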
-temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print("Estimated Marginal Means:") -print(temporal_interval_do_emmeans) - -print("\nSimple Effects of interval_DO within each temporal_DO:") -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) - -print("\nSimple Effects of temporal_DO within each interval_DO:") -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("\n=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each DOMAIN × INTERVAL combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("\n=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each temporal_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of temporal_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print("Significant pairwise comparisons (p < 0.05):") - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - # (is.null() guard: the main-effect calls pass group2_var = NULL, and - # NULL %in% x is logical(0), which makes if() error out) - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, -
n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print("Cohen's d results:") - print(cohens_d_results) - } - } else { - print("No significant pairwise comparisons found.") - } -} - -# Calculate Cohen's d for main effects -print("Cohen's d for significant main effects:") -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -print("\nCohen's d for significant two-way interactions:") -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -print("\nCohen's d for significant three-way interaction effects:") -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print("Three-way interaction Cohen's d results:") - print(three_way_cohens_d) - } -} else { - print("No significant TIME effects found within any DOMAIN × INTERVAL combination.") -} - -print("=== ANALYSIS COMPLETE ===") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251003170643.r b/.history/eohi2/mixed anova - DGEN_20251003170643.r deleted file mode 100644
index 86b9512..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251003170643.r +++ /dev/null @@ -1,751 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$pID)))) - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required DGEN variables found!") -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 6), rep("Future", 6)), - DOMAIN = rep(c("Preferences", "Personality", "Values", "Preferences", "Personality", "Values"), 2), - INTERVAL = rep(c("5", "5", "5", "10", "10", "10"), 2), - stringsAsFactors = FALSE -) - -# Variable mapping created -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) -print("Factor levels:") -print(paste("TIME:", paste(levels(long_data$TIME), collapse = ", "))) -print(paste("DOMAIN:", paste(levels(long_data$DOMAIN), collapse = ", "))) -print(paste("INTERVAL:", 
paste(levels(long_data$INTERVAL), collapse = ", "))) -print(paste("temporal_DO:", paste(levels(long_data$temporal_DO), collapse = ", "))) -print(paste("interval_DO:", paste(levels(long_data$interval_DO), collapse = ", "))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. 
Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for 
bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), temporal_DO (", - length(levels(long_data_clean$temporal_DO)), "), interval_DO (", - length(levels(long_data_clean$interval_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is 
balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (2 levels: 5, 10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- 
mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("\nMain Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("\nMain Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("\n=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print("Estimated Marginal Means:") -print(time_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each TIME:") -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) - -print("\nSimple Effects of TIME within each INTERVAL:") -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("\n=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(domain_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each DOMAIN:") -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = 
"bonferroni") -print(domain_interval_simple) - -print("\nSimple Effects of DOMAIN within each INTERVAL:") -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("\n=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print("Estimated Marginal Means:") -print(temporal_interval_do_emmeans) - -print("\nSimple Effects of interval_DO within each temporal_DO:") -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) - -print("\nSimple Effects of temporal_DO within each interval_DO:") -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("\n=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each DOMAIN × INTERVAL combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("\n=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each temporal_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of temporal_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print("Significant pairwise comparisons (p < 0.05):") - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else 
{ - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print("Cohen's d results:") - print(cohens_d_results) - } - } else { - print("No significant pairwise comparisons found.") - } -} - -# Calculate Cohen's d for main effects -print("Cohen's d for significant main effects:") -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -print("\nCohen's d for significant two-way interactions:") -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -print("\nCohen's d for significant three-way interaction effects:") -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print("Three-way interaction Cohen's d results:") - 
print(three_way_cohens_d) - } -} else { - print("No significant TIME effects found within any DOMAIN × INTERVAL combination.") -} - -print("=== ANALYSIS COMPLETE ===") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251003170646.r b/.history/eohi2/mixed anova - DGEN_20251003170646.r deleted file mode 100644 index 7ea184c..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251003170646.r +++ /dev/null @@ -1,751 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required DGEN variables found!") -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 6), rep("Future", 6)), - DOMAIN = rep(c("Preferences", "Personality", "Values", "Preferences", "Personality", "Values"), 2), - INTERVAL = rep(c("5", "5", "5", "10", "10", "10"), 2), - stringsAsFactors = FALSE -) - -# Variable mapping created -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - 
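# --- Illustrative sketch (not part of the original analysis script): verify the reshape ---
# After the pivot above, each participant should contribute exactly one row per
# TIME × DOMAIN × INTERVAL cell, i.e. 2 × 3 × 2 = 12 rows. The quick count below
# flags anyone with missing or duplicated cells before the formal balance check
# later in the script; `n_expected` and `cell_counts` are hypothetical names.
n_expected <- 2 * 3 * 2
cell_counts <- long_data %>%
  dplyr::count(pID, name = "n_rows") %>%
  dplyr::filter(n_rows != n_expected)
if (nrow(cell_counts) > 0) {
  print("Participants with an incomplete within-subjects design:")
  print(cell_counts)
} else {
  print("All participants contribute one observation per design cell.")
}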
-print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) -print("Factor levels:") -print(paste("TIME:", paste(levels(long_data$TIME), collapse = ", "))) -print(paste("DOMAIN:", paste(levels(long_data$DOMAIN), collapse = ", "))) -print(paste("INTERVAL:", paste(levels(long_data$INTERVAL), collapse = ", "))) -print(paste("temporal_DO:", paste(levels(long_data$temporal_DO), collapse = ", "))) -print(paste("interval_DO:", paste(levels(long_data$interval_DO), collapse = ", "))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing 
-long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 
| is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), temporal_DO (", - length(levels(long_data_clean$temporal_DO)), "), interval_DO (", - length(levels(long_data_clean$interval_DO)), ")", sep = "")) - -# Check for complete cases 
-complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (2 levels: 5, 10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - 
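# --- Illustrative cross-check (not part of the original analysis script) ---
# Generalized eta squared (Olejnik & Algina, 2003) includes variance from
# non-manipulated sources in its denominator, which makes it comparable across
# between- and within-subjects effects. The values reported below come straight
# from ezANOVA; since afex and effectsize are already loaded, ges can also be
# recomputed independently. `afex_model` is a hypothetical name, and the result
# should match the ges column below up to rounding.
afex_model <- afex::aov_ez(
  id      = "pID",
  dv      = "DGEN_SCORE",
  data    = long_data_clean,
  between = c("temporal_DO", "interval_DO"),
  within  = c("TIME", "DOMAIN", "INTERVAL"),
  type    = 3
)
print(effectsize::eta_squared(afex_model, generalized = TRUE))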
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("\nMain Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("\nMain Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("\n=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print("Estimated Marginal Means:") -print(time_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each TIME:") -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) - -print("\nSimple Effects of TIME within each INTERVAL:") -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("\n=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(domain_interval_emmeans) 
- -print("\nSimple Effects of INTERVAL within each DOMAIN:") -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) - -print("\nSimple Effects of DOMAIN within each INTERVAL:") -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("\n=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print("Estimated Marginal Means:") -print(temporal_interval_do_emmeans) - -print("\nSimple Effects of interval_DO within each temporal_DO:") -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) - -print("\nSimple Effects of temporal_DO within each interval_DO:") -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("\n=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each DOMAIN × INTERVAL combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("\n=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each temporal_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of temporal_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print("Significant pairwise comparisons (p < 0.05):") - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - 
data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print("Cohen's d results:") - print(cohens_d_results) - } - } else { - print("No significant pairwise comparisons found.") - } -} - -# Calculate Cohen's d for main effects -print("Cohen's d for significant main effects:") -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -print("\nCohen's d for significant two-way interactions:") -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -print("\nCohen's d for significant three-way interaction effects:") -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d 
<- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print("Three-way interaction Cohen's d results:") - print(three_way_cohens_d) - } -} else { - print("No significant TIME effects found within any DOMAIN × INTERVAL combination.") -} - -print("=== ANALYSIS COMPLETE ===") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251003170651.r b/.history/eohi2/mixed anova - DGEN_20251003170651.r deleted file mode 100644 index 7ea184c..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251003170651.r +++ /dev/null @@ -1,751 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required DGEN variables found!") -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 6), rep("Future", 6)), - DOMAIN = rep(c("Preferences", "Personality", "Values", "Preferences", "Personality", "Values"), 2), - INTERVAL = rep(c("5", "5", "5", "10", "10", "10"), 2), - stringsAsFactors = FALSE -) - -# Variable mapping created -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with 
missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) -print("Factor levels:") -print(paste("TIME:", paste(levels(long_data$TIME), collapse = ", "))) -print(paste("DOMAIN:", paste(levels(long_data$DOMAIN), collapse = ", "))) -print(paste("INTERVAL:", paste(levels(long_data$INTERVAL), collapse = ", "))) -print(paste("temporal_DO:", paste(levels(long_data$temporal_DO), collapse = ", "))) -print(paste("interval_DO:", paste(levels(long_data$interval_DO), collapse = ", "))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION 
TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - 
dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), temporal_DO (", - 
length(levels(long_data_clean$temporal_DO)), "), interval_DO (", - length(levels(long_data_clean$interval_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (2 levels: 5, 10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# 
============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("\nMain Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("\nMain Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("\n=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print("Estimated Marginal Means:") -print(time_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each TIME:") -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) - -print("\nSimple Effects of TIME within each INTERVAL:") -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × 
INTERVAL Interaction -print("\n=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(domain_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each DOMAIN:") -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) - -print("\nSimple Effects of DOMAIN within each INTERVAL:") -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("\n=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print("Estimated Marginal Means:") -print(temporal_interval_do_emmeans) - -print("\nSimple Effects of interval_DO within each temporal_DO:") -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) - -print("\nSimple Effects of temporal_DO within each interval_DO:") -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("\n=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each DOMAIN × INTERVAL combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("\n=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each temporal_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of temporal_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print("Significant pairwise comparisons (p < 0.05):") - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get 
raw data for both conditions
-        # Guard with is.null(): the main-effect calls below pass group2_var = NULL,
-        # and NULL %in% colnames(...) yields logical(0), which errors inside if()
-        if(!is.null(group2_var) && group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][
-            data[[group1_var]] == level1 &
-            data[[group2_var]] == group2_level]
-
-          data2 <- data[[response_var]][
-            data[[group1_var]] == level2 &
-            data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if(length(data1) > 0 && length(data2) > 0) {
-          # Calculate Cohen's d using effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          result_row <- data.frame(
-            Comparison = contrast_name,
-            n1 = length(data1),
-            n2 = length(data2),
-            Cohens_d = round(cohens_d_result$estimate, 5),
-            Effect_size = cohens_d_result$magnitude,
-            p_value = round(comparison$p.value, 5)
-          )
-
-          # Same is.null() guard as above
-          if(!is.null(group2_var) && group2_var %in% colnames(comparison)) {
-            result_row$Group_level <- group2_level
-          }
-
-          cohens_d_results <- rbind(cohens_d_results, result_row)
-        }
-      }
-    }
-
-    if(nrow(cohens_d_results) > 0) {
-      print("Cohen's d results:")
-      print(cohens_d_results)
-    }
-  } else {
-    print("No significant pairwise comparisons found.")
-  }
-}
-
-# Calculate Cohen's d for main effects
-print("Cohen's d for significant main effects:")
-time_contrasts_df <- as.data.frame(time_contrasts)
-calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE")
-
-domain_contrasts_df <- as.data.frame(domain_contrasts)
-calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE")
-
-interval_contrasts_df <- as.data.frame(interval_contrasts)
-calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE")
-
-# Calculate Cohen's d for two-way interactions
-print("\nCohen's d for significant two-way interactions:")
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE")
-
-time_domain_simple2_df <- as.data.frame(time_domain_simple2)
-calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE")
-
-# Calculate Cohen's d for three-way interaction
-print("\nCohen's d for significant three-way interaction effects:")
-three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ]
-
-if(nrow(significant_three_way) > 0) {
-  three_way_cohens_d <- data.frame()
-
-  for(i in seq_len(nrow(significant_three_way))) {
-    comparison <- significant_three_way[i, ]
-
-    # Extract the grouping variables
-    domain_level <- as.character(comparison$DOMAIN)
-    interval_level <- as.character(comparison$INTERVAL)
-
-    # Get data for Past and Future within this DOMAIN × INTERVAL combination
-    past_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$INTERVAL == interval_level &
-      long_data_clean$TIME == "Past"
-    ]
-
-    future_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$INTERVAL == interval_level &
-      long_data_clean$TIME == "Future"
-    ]
-
-    if(length(past_data) > 0 && length(future_data) > 0) {
-      # Calculate Cohen's d using effsize package
-      cohens_d_result <- cohen.d(past_data, future_data)
-
-      result_row <- data.frame(
-        DOMAIN = domain_level,
-        INTERVAL = interval_level,
-        Comparison = "Past vs Future",
-        n_Past = length(past_data),
-        n_Future = length(future_data),
-        Cohens_d =
round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print("Three-way interaction Cohen's d results:") - print(three_way_cohens_d) - } -} else { - print("No significant TIME effects found within any DOMAIN × INTERVAL combination.") -} - -print("=== ANALYSIS COMPLETE ===") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251003170811.r b/.history/eohi2/mixed anova - DGEN_20251003170811.r deleted file mode 100644 index 7ea184c..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251003170811.r +++ /dev/null @@ -1,751 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required DGEN variables found!") -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 6), rep("Future", 6)), - DOMAIN = rep(c("Preferences", "Personality", "Values", "Preferences", "Personality", "Values"), 2), - INTERVAL = rep(c("5", "5", "5", "10", "10", "10"), 2), - stringsAsFactors = FALSE -) - -# Variable mapping created -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID 
= as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) -print("Factor levels:") -print(paste("TIME:", paste(levels(long_data$TIME), collapse = ", "))) -print(paste("DOMAIN:", paste(levels(long_data$DOMAIN), collapse = ", "))) -print(paste("INTERVAL:", paste(levels(long_data$INTERVAL), collapse = ", "))) -print(paste("temporal_DO:", paste(levels(long_data$temporal_DO), collapse = ", "))) -print(paste("interval_DO:", paste(levels(long_data$interval_DO), collapse = ", "))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups 
= 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), temporal_DO (", - length(levels(long_data_clean$temporal_DO)), "), interval_DO (", - length(levels(long_data_clean$interval_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (2 levels: 5, 10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity 
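# (Mauchly's test applies only to within-subject effects with three or more
# levels, i.e., DOMAIN and its interactions here; the 2-level TIME and
# INTERVAL factors have a single difference score, so epsilon = 1 for them.)
# As a cross-check sketch only, not part of the original script: afex is
# attached above ("For aov_ez") but never called, and aov_ez() fits the same
# design while applying the Greenhouse-Geisser correction automatically.
afex_check <- aov_ez(id = "pID", dv = "DGEN_SCORE", data = long_data_clean,
                     between = c("temporal_DO", "interval_DO"),
                     within = c("TIME", "DOMAIN", "INTERVAL"), type = 3)
print(afex_check)  # ANOVA table with GG-corrected p-values and ges
# Mauchly's output from ezANOVA: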
-print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") 
-print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("\nMain Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("\nMain Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("\n=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print("Estimated Marginal Means:") -print(time_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each TIME:") -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) - -print("\nSimple Effects of TIME within each INTERVAL:") -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("\n=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(domain_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each DOMAIN:") -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) - -print("\nSimple Effects of DOMAIN within each INTERVAL:") -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("\n=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print("Estimated Marginal Means:") -print(temporal_interval_do_emmeans) - -print("\nSimple Effects of interval_DO within each temporal_DO:") -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) - -print("\nSimple Effects of temporal_DO within each interval_DO:") -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("\n=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print("Estimated Marginal 
Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each DOMAIN × INTERVAL combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("\n=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each temporal_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of temporal_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print("Significant pairwise comparisons (p < 0.05):") - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print("Cohen's d results:") - print(cohens_d_results) - } - } else { - print("No significant pairwise comparisons found.") - } -} - -# Calculate Cohen's d for main effects -print("Cohen's d for significant main effects:") -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) 
-calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -print("\nCohen's d for significant two-way interactions:") -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -print("\nCohen's d for significant three-way interaction effects:") -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print("Three-way interaction Cohen's d results:") - print(three_way_cohens_d) - } -} else { - print("No significant TIME effects found within any DOMAIN × INTERVAL combination.") -} - -print("=== ANALYSIS COMPLETE ===") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251003171114.r b/.history/eohi2/mixed anova - DGEN_20251003171114.r deleted file mode 100644 index 04b5eb1..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251003171114.r +++ /dev/null @@ -1,751 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper 
interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required DGEN variables found!") -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 6), rep("Future", 6)), - DOMAIN = rep(c("Preferences", "Personality", "Values", "Preferences", "Personality", "Values"), 2), - INTERVAL = rep(c("5", "5", "5", "10", "10", "10"), 2), - stringsAsFactors = FALSE -) - -# Variable mapping created -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) -print("Factor levels:") -print(paste("TIME:", paste(levels(long_data$TIME), collapse = ", "))) -print(paste("DOMAIN:", paste(levels(long_data$DOMAIN), collapse = ", "))) -print(paste("INTERVAL:", paste(levels(long_data$INTERVAL), collapse = ", "))) -print(paste("temporal_DO:", paste(levels(long_data$temporal_DO), collapse = ", "))) -print(paste("interval_DO:", paste(levels(long_data$interval_DO), collapse = ", "))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = 
round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 
<- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), temporal_DO (", - length(levels(long_data_clean$temporal_DO)), "), interval_DO (", - length(levels(long_data_clean$interval_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, 
Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (2 levels: 5, 10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") 
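# Note: with two TIME levels there is a single pairwise contrast, so the
# Bonferroni adjustment requested below is a no-op. It only matters for the
# three-level DOMAIN families, where Holm is a uniformly no-less-powerful
# drop-in; a sketch only (domain_emmeans is defined further down):
#   pairs(domain_emmeans, adjust = "holm")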
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("\nMain Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("\nMain Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("\n=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print("Estimated Marginal Means:") -print(time_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each TIME:") -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) - -print("\nSimple Effects of TIME within each INTERVAL:") -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("\n=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(domain_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each DOMAIN:") -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) - -print("\nSimple Effects of DOMAIN within each INTERVAL:") -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("\n=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print("Estimated Marginal Means:") -print(temporal_interval_do_emmeans) - -print("\nSimple Effects of interval_DO within each temporal_DO:") -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) - -print("\nSimple Effects of temporal_DO within each interval_DO:") -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, 
by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("\n=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each DOMAIN × INTERVAL combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("\n=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each temporal_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of temporal_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print("Significant pairwise comparisons (p < 0.05):") - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print("Cohen's d results:") - print(cohens_d_results) - } - } else { - print("No significant pairwise 
comparisons found.") - } -} - -# Calculate Cohen's d for main effects -print("Cohen's d for significant main effects:") -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -print("\nCohen's d for significant two-way interactions:") -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -print("\nCohen's d for significant three-way interaction effects:") -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print("Three-way interaction Cohen's d results:") - print(three_way_cohens_d) - } -} else { - print("No significant TIME effects found within any DOMAIN × INTERVAL combination.") -} - -print("=== ANALYSIS COMPLETE ===") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251003171117.r b/.history/eohi2/mixed anova - DGEN_20251003171117.r deleted file mode 100644 index d83e68e..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251003171117.r +++ /dev/null @@ -1,751 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# 
Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required DGEN variables found!") -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 6), rep("Future", 6)), - DOMAIN = rep(c("Preferences", "Personality", "Values", "Preferences", "Personality", "Values"), 2), - INTERVAL = rep(c("5", "5", "5", "10", "10", "10"), 2), - stringsAsFactors = FALSE -) - -# Variable mapping created -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) -print("Factor levels:") -print(paste("TIME:", paste(levels(long_data$TIME), collapse = ", "))) -print(paste("DOMAIN:", paste(levels(long_data$DOMAIN), collapse = ", "))) -print(paste("INTERVAL:", paste(levels(long_data$INTERVAL), collapse = ", "))) -print(paste("temporal_DO:", paste(levels(long_data$temporal_DO), collapse = ", "))) -print(paste("interval_DO:", paste(levels(long_data$interval_DO), collapse = ", "))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL 
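# (Illustrative aside, not part of the original snapshot.) Before the
# descriptives below, it is worth confirming that the pivot produced the
# expected 2 x 3 x 2 = 12 rows per participant after NA filtering; this
# sketch reuses the long_data object defined above.
rows_per_pid <- dplyr::count(long_data, pID, name = "n_rows")
if (any(rows_per_pid$n_rows != 12)) {
  print("Participants with incomplete TIME x DOMAIN x INTERVAL cells:")
  print(dplyr::filter(rows_per_pid, n_rows != 12))
}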
-desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently 
- sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), temporal_DO (", - length(levels(long_data_clean$temporal_DO)), "), interval_DO (", - length(levels(long_data_clean$interval_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - 
print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (2 levels: 5, 10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - 
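# (Illustrative cross-check, not part of the original snapshot.) The script
# loads afex but reads generalized eta squared straight from ezANOVA;
# afex::aov_ez fits the same Type-III mixed ANOVA and reports ges by default,
# so it can serve as an independent sanity check. A minimal sketch:
afex_model <- afex::aov_ez(
  id = "pID", dv = "DGEN_SCORE", data = long_data_clean,
  between = c("temporal_DO", "interval_DO"),
  within = c("TIME", "DOMAIN", "INTERVAL")
)
print(afex_model$anova_table)  # the ges column should mirror ezANOVA's output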
-# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("\nMain Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("\nMain Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("\n=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print("Estimated Marginal Means:") -print(time_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each TIME:") -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) - -print("\nSimple Effects of TIME within each INTERVAL:") -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("\n=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(domain_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each DOMAIN:") -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) - -print("\nSimple Effects of DOMAIN within each INTERVAL:") -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("\n=== temporal_DO × interval_DO INTERACTION ===") 
-temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print("Estimated Marginal Means:") -print(temporal_interval_do_emmeans) - -print("\nSimple Effects of interval_DO within each temporal_DO:") -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) - -print("\nSimple Effects of temporal_DO within each interval_DO:") -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("\n=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each DOMAIN × INTERVAL combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("\n=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each temporal_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of temporal_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print("Significant pairwise comparisons (p < 0.05):") - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - 
Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print("Cohen's d results:") - print(cohens_d_results) - } - } else { - print("No significant pairwise comparisons found.") - } -} - -# Calculate Cohen's d for main effects -print("Cohen's d for significant main effects:") -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -print("\nCohen's d for significant two-way interactions:") -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -print("\nCohen's d for significant three-way interaction effects:") -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print("Three-way interaction Cohen's d results:") - print(three_way_cohens_d) - } -} else { - print("No significant TIME effects found within any DOMAIN × INTERVAL combination.") -} - -print("=== ANALYSIS COMPLETE ===") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251003171121.r b/.history/eohi2/mixed 
anova - DGEN_20251003171121.r deleted file mode 100644 index d83e68e..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251003171121.r +++ /dev/null @@ -1,751 +0,0 @@ [751 deleted lines elided: this snapshot is byte-identical (index d83e68e) to the .history/eohi2/mixed anova - DGEN_20251003171117.r snapshot above; only its closing lines are kept] - if(nrow(three_way_cohens_d) > 0) { -
print("Three-way interaction Cohen's d results:") - print(three_way_cohens_d) - } -} else { - print("No significant TIME effects found within any DOMAIN × INTERVAL combination.") -} - -print("=== ANALYSIS COMPLETE ===") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251003171149.r b/.history/eohi2/mixed anova - DGEN_20251003171149.r deleted file mode 100644 index d83e68e..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251003171149.r +++ /dev/null @@ -1,751 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required DGEN variables found!") -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 6), rep("Future", 6)), - DOMAIN = rep(c("Preferences", "Personality", "Values", "Preferences", "Personality", "Values"), 2), - INTERVAL = rep(c("5", "5", "5", "10", "10", "10"), 2), - stringsAsFactors = FALSE -) - -# Variable mapping created -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, 
diff --git a/.history/eohi2/mixed anova - DGEN_20251003171149.r b/.history/eohi2/mixed anova - DGEN_20251003171149.r
deleted file mode 100644
index d83e68e..0000000
--- a/.history/eohi2/mixed anova - DGEN_20251003171149.r
+++ /dev/null
diff --git a/.history/eohi2/mixed anova - DGEN_20251006125954.r b/.history/eohi2/mixed anova - DGEN_20251006125954.r
deleted file mode 100644
index d83e68e..0000000
--- a/.history/eohi2/mixed anova - DGEN_20251006125954.r
+++ /dev/null
diff --git a/.history/eohi2/mixed anova - DGEN_20251006191142.r b/.history/eohi2/mixed anova - DGEN_20251006191142.r
deleted file mode 100644
index d83e68e..0000000
--- a/.history/eohi2/mixed anova - DGEN_20251006191142.r
+++ /dev/null
@@ -1,751 +0,0 @@
-# Mixed ANOVA Analysis for DGEN Variables
-# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors
-# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val,
-#            DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex)       # For aov_ez (cleaner ANOVA output)
-library(nortest)    # For normality tests
-library(emmeans)    # For post-hoc comparisons
-library(purrr)      # For map functions
-library(effsize)    # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-# Read the data
-data <- read.csv("eohi2.csv")
-
-# Display basic information about the dataset
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x")))
-print(paste("Number of participants:", length(unique(data$ResponseId))))
-
-# Verify the specific variables we need
-required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val",
-                   "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val",
-                   "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val",
-                   "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required DGEN variables found!")
-}
-
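A note on the contrasts option set near the top of this file: a small base-R illustration of what it changes, relevant because the ANOVA later in the script requests Type III sums of squares:

# Default treatment coding tests lower-order terms against a reference level
contr.treatment(3)

# Sum-to-zero coding, as set by options(contrasts = c("contr.sum", "contr.poly")),
# centers each factor so main effects are evaluated at the grand mean,
# a prerequisite for interpretable Type III tests
contr.sum(3)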
factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) -print("Factor levels:") -print(paste("TIME:", paste(levels(long_data$TIME), collapse = ", "))) -print(paste("DOMAIN:", paste(levels(long_data$DOMAIN), collapse = ", "))) -print(paste("INTERVAL:", paste(levels(long_data$INTERVAL), collapse = ", "))) -print(paste("temporal_DO:", paste(levels(long_data$temporal_DO), collapse = ", "))) -print(paste("interval_DO:", paste(levels(long_data$interval_DO), collapse = ", "))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 
5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), temporal_DO (", - length(levels(long_data_clean$temporal_DO)), "), interval_DO (", - length(levels(long_data_clean$interval_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (2 levels: 5, 10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity 
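-# Optional cross-check (a sketch, not part of the original pipeline): afex is
-# attached above for aov_ez(), which fits the identical design and applies the
-# Greenhouse-Geisser correction automatically, so its table can be compared
-# against the ezANOVA output; the name afex_check is introduced only for this check.
-afex_check <- aov_ez(id = "pID", dv = "DGEN_SCORE", data = long_data_clean,
-                     between = c("temporal_DO", "interval_DO"),
-                     within = c("TIME", "DOMAIN", "INTERVAL"),
-                     anova_table = list(correction = "GG", es = "ges"))
-print(afex_check)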
-print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") 
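-# (emmeans() averages equally over the levels of every factor not named in the
-# call, so these INTERVAL means marginalize over TIME, DOMAIN, and both
-# between-subjects factors.)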
-print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("\nMain Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("\nMain Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("\n=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print("Estimated Marginal Means:") -print(time_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each TIME:") -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) - -print("\nSimple Effects of TIME within each INTERVAL:") -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("\n=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(domain_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each DOMAIN:") -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) - -print("\nSimple Effects of DOMAIN within each INTERVAL:") -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("\n=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print("Estimated Marginal Means:") -print(temporal_interval_do_emmeans) - -print("\nSimple Effects of interval_DO within each temporal_DO:") -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) - -print("\nSimple Effects of temporal_DO within each interval_DO:") -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("\n=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print("Estimated Marginal 
Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each DOMAIN × INTERVAL combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("\n=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each temporal_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of temporal_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print("Significant pairwise comparisons (p < 0.05):") - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print("Cohen's d results:") - print(cohens_d_results) - } - } else { - print("No significant pairwise comparisons found.") - } -} - -# Calculate Cohen's d for main effects -print("Cohen's d for significant main effects:") -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df 
<- as.data.frame(interval_contrasts)
-calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE")
-
-# Calculate Cohen's d for two-way interactions
-print("\nCohen's d for significant two-way interactions:")
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE")
-
-time_domain_simple2_df <- as.data.frame(time_domain_simple2)
-calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE")
-
-# Calculate Cohen's d for three-way interaction
-print("\nCohen's d for significant three-way interaction effects:")
-three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ]
-
-if(nrow(significant_three_way) > 0) {
-  three_way_cohens_d <- data.frame()
-
-  for(i in seq_len(nrow(significant_three_way))) {
-    comparison <- significant_three_way[i, ]
-
-    # Extract the grouping variables
-    domain_level <- as.character(comparison$DOMAIN)
-    interval_level <- as.character(comparison$INTERVAL)
-
-    # Get data for Past and Future within this DOMAIN × INTERVAL combination
-    past_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$INTERVAL == interval_level &
-      long_data_clean$TIME == "Past"
-    ]
-
-    future_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$INTERVAL == interval_level &
-      long_data_clean$TIME == "Future"
-    ]
-
-    if(length(past_data) > 0 && length(future_data) > 0) {
-      # Calculate Cohen's d using effsize package
-      cohens_d_result <- cohen.d(past_data, future_data)
-
-      result_row <- data.frame(
-        DOMAIN = domain_level,
-        INTERVAL = interval_level,
-        Comparison = "Past vs Future",
-        n_Past = length(past_data),
-        n_Future = length(future_data),
-        Cohens_d = round(cohens_d_result$estimate, 5),
-        Effect_size = cohens_d_result$magnitude,
-        p_value = round(comparison$p.value, 5),
-        Estimated_difference = round(comparison$estimate, 5)
-      )
-
-      three_way_cohens_d <- rbind(three_way_cohens_d, result_row)
-    }
-  }
-
-  if(nrow(three_way_cohens_d) > 0) {
-    print("Three-way interaction Cohen's d results:")
-    print(three_way_cohens_d)
-  }
-} else {
-  print("No significant TIME effects found within any DOMAIN × INTERVAL combination.")
-}
-
-print("=== ANALYSIS COMPLETE ===")
\ No newline at end of file
diff --git a/.history/eohi2/mixed anova - DGEN_20251006192531.r b/.history/eohi2/mixed anova - DGEN_20251006192531.r
deleted file mode 100644
index 203e8dc..0000000
--- a/.history/eohi2/mixed anova - DGEN_20251006192531.r
+++ /dev/null
@@ -1,748 +0,0 @@
-# Mixed ANOVA Analysis for DGEN Variables
-# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors
-# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val,
-# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex) # For aov_ez (cleaner ANOVA output)
-library(nortest) # For normality tests
-library(emmeans) # For post-hoc comparisons
-library(purrr) # For map functions
-library(effsize) # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for
mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "DGEN_5.10past_Pref", "DGEN_5.10past_Pers", "DGEN_5.10past_Val", - "DGEN_5.10fut_Pref", "DGEN_5.10fut_Pers", "DGEN_5.10fut_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) -print("Factor levels:") -print(paste("TIME:", paste(levels(long_data$TIME), collapse = ", "))) -print(paste("DOMAIN:", paste(levels(long_data$DOMAIN), collapse = ", "))) -print(paste("INTERVAL:", paste(levels(long_data$INTERVAL), collapse = ", "))) -print(paste("temporal_DO:", paste(levels(long_data$temporal_DO), collapse = ", "))) -print(paste("interval_DO:", paste(levels(long_data$interval_DO), collapse = ", "))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - 
.groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 
<- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$pID)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), temporal_DO (", - length(levels(long_data_clean$temporal_DO)), "), interval_DO (", - length(levels(long_data_clean$interval_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, 
Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (2 levels: 5, 10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") 
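-# (TIME has only two levels, so there is a single pairwise contrast and the
-# Bonferroni adjustment is a no-op here; among the main effects it only changes
-# p-values for DOMAIN, which has three levels.)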
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("\nMain Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("\nMain Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("\n=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print("Estimated Marginal Means:") -print(time_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each TIME:") -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) - -print("\nSimple Effects of TIME within each INTERVAL:") -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("\n=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(domain_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each DOMAIN:") -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) - -print("\nSimple Effects of DOMAIN within each INTERVAL:") -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("\n=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print("Estimated Marginal Means:") -print(temporal_interval_do_emmeans) - -print("\nSimple Effects of interval_DO within each temporal_DO:") -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) - -print("\nSimple Effects of temporal_DO within each interval_DO:") -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, 
by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("\n=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each DOMAIN × INTERVAL combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("\n=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each temporal_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of temporal_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print("Significant pairwise comparisons (p < 0.05):") - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print("Cohen's d results:") - print(cohens_d_results) - } - } else { - 
print("No significant pairwise comparisons found.")
-  }
-}
-
-# Calculate Cohen's d for main effects
-print("Cohen's d for significant main effects:")
-time_contrasts_df <- as.data.frame(time_contrasts)
-calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE")
-
-domain_contrasts_df <- as.data.frame(domain_contrasts)
-calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE")
-
-interval_contrasts_df <- as.data.frame(interval_contrasts)
-calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE")
-
-# Calculate Cohen's d for two-way interactions
-print("\nCohen's d for significant two-way interactions:")
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE")
-
-time_domain_simple2_df <- as.data.frame(time_domain_simple2)
-calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE")
-
-# Calculate Cohen's d for three-way interaction
-print("\nCohen's d for significant three-way interaction effects:")
-three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ]
-
-if(nrow(significant_three_way) > 0) {
-  three_way_cohens_d <- data.frame()
-
-  for(i in seq_len(nrow(significant_three_way))) {
-    comparison <- significant_three_way[i, ]
-
-    # Extract the grouping variables
-    domain_level <- as.character(comparison$DOMAIN)
-    interval_level <- as.character(comparison$INTERVAL)
-
-    # Get data for Past and Future within this DOMAIN × INTERVAL combination
-    past_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$INTERVAL == interval_level &
-      long_data_clean$TIME == "Past"
-    ]
-
-    future_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$INTERVAL == interval_level &
-      long_data_clean$TIME == "Future"
-    ]
-
-    if(length(past_data) > 0 && length(future_data) > 0) {
-      # Calculate Cohen's d using effsize package
-      cohens_d_result <- cohen.d(past_data, future_data)
-
-      result_row <- data.frame(
-        DOMAIN = domain_level,
-        INTERVAL = interval_level,
-        Comparison = "Past vs Future",
-        n_Past = length(past_data),
-        n_Future = length(future_data),
-        Cohens_d = round(cohens_d_result$estimate, 5),
-        Effect_size = cohens_d_result$magnitude,
-        p_value = round(comparison$p.value, 5),
-        Estimated_difference = round(comparison$estimate, 5)
-      )
-
-      three_way_cohens_d <- rbind(three_way_cohens_d, result_row)
-    }
-  }
-
-  if(nrow(three_way_cohens_d) > 0) {
-    print("Three-way interaction Cohen's d results:")
-    print(three_way_cohens_d)
-  }
-} else {
-  print("No significant TIME effects found within any DOMAIN × INTERVAL combination.")
-}
-
-print("=== ANALYSIS COMPLETE ===")
\ No newline at end of file
diff --git a/.history/eohi2/mixed anova - DGEN_20251006192540.r b/.history/eohi2/mixed anova - DGEN_20251006192540.r
deleted file mode 100644
index 8ad4b4b..0000000
--- a/.history/eohi2/mixed anova - DGEN_20251006192540.r
+++ /dev/null
@@ -1,748 +0,0 @@
-# Mixed ANOVA Analysis for DGEN Variables
-# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors
-# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val,
-# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref,
DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "DGEN_5.10past_Pref", "DGEN_5.10past_Pers", "DGEN_5.10past_Val", - "DGEN_5.10fut_Pref", "DGEN_5.10fut_Pers", "DGEN_5.10fut_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$pID)))) -print("Factor levels:") -print(paste("TIME:", paste(levels(long_data$TIME), collapse = ", "))) -print(paste("DOMAIN:", paste(levels(long_data$DOMAIN), collapse = ", "))) -print(paste("INTERVAL:", paste(levels(long_data$INTERVAL), collapse = ", "))) -print(paste("temporal_DO:", paste(levels(long_data$temporal_DO), collapse = ", "))) -print(paste("interval_DO:", paste(levels(long_data$interval_DO), collapse = ", "))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% 
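-  # (Rounding to 5 decimal places below is for display only; the ANOVA and
-  # post-hoc tests later in the script use the unrounded DGEN_SCORE values.)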
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5),
-    variance = round(var(DGEN_SCORE, na.rm = TRUE), 5),
-    sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5),
-    median = round(median(DGEN_SCORE, na.rm = TRUE), 5),
-    q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5),
-    min = round(min(DGEN_SCORE, na.rm = TRUE), 5),
-    max = round(max(DGEN_SCORE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:")
-print(desc_stats)
-
-# Descriptive statistics by between-subjects factors
-desc_stats_by_between <- long_data %>%
-  group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5),
-    variance = round(var(DGEN_SCORE, na.rm = TRUE), 5),
-    sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by between-subjects factors:")
-print(desc_stats_by_between)
-
-# Calculate mean differences for key comparisons
-print("\n=== KEY MEAN DIFFERENCES ===")
-
-# Past vs Future differences for each DOMAIN × INTERVAL combination
-past_future_diffs <- long_data %>%
-  group_by(DOMAIN, INTERVAL, pID) %>%
-  summarise(
-    past_score = DGEN_SCORE[TIME == "Past"],
-    future_score = DGEN_SCORE[TIME == "Future"],
-    difference = past_score - future_score,
-    .groups = 'drop'
-  ) %>%
-  group_by(DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    mean_diff = round(mean(difference, na.rm = TRUE), 5),
-    sd_diff = round(sd(difference, na.rm = TRUE), 5),
-    se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5),
-    .groups = 'drop'
-  )
-
-print("Past vs Future differences by DOMAIN × INTERVAL:")
-print(past_future_diffs)
-
-# 5 vs 10 interval differences for each TIME × DOMAIN combination
-interval_diffs <- long_data %>%
-  group_by(TIME, DOMAIN, pID) %>%
-  summarise(
-    interval_5_score = DGEN_SCORE[INTERVAL == "5"],
-    interval_10_score = DGEN_SCORE[INTERVAL == "10"],
-    difference = interval_5_score - interval_10_score,
-    .groups = 'drop'
-  ) %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean_diff = round(mean(difference, na.rm = TRUE), 5),
-    sd_diff = round(sd(difference, na.rm = TRUE), 5),
-    se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5),
-    .groups = 'drop'
-  )
-
-print("\n5 vs 10 interval differences by TIME × DOMAIN:")
-print(interval_diffs)
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing
-long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ]
-print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x")))
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(DGEN_SCORE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME, DOMAIN, and INTERVAL:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    mean = mean(DGEN_SCORE),
-    sd = sd(DGEN_SCORE),
-    q1 = quantile(DGEN_SCORE, 0.25),
-    q3 = quantile(DGEN_SCORE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$DGEN_SCORE)$statistic,
-    ad_p_value = ad.test(.data$DGEN_SCORE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# Round only the numeric columns
-normality_results_rounded <- normality_results %>%
-  mutate(across(where(is.numeric), ~ round(.x, 5)))
-print(normality_results_rounded)
-
-# 4. Homogeneity of variance tests
-# Test homogeneity across TIME within each DOMAIN × INTERVAL combination
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN, INTERVAL) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME × INTERVAL combination
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME, INTERVAL) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:")
-print(homogeneity_domain)
-
-# Test homogeneity across INTERVAL within each TIME × DOMAIN combination
-homogeneity_interval <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:")
-print(homogeneity_interval)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# More efficient bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!rlang::sym(group_var)) %>%
-    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once
-  group_data_list <- map(groups, ~ {
-    group_data <- data[data[[group_var]] == .x, response_var]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across between-subjects factors within each within-subjects combination
-print("\n=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123) # For reproducibility
-
-# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination
-print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:")
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination
-print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:")
-hartley_interval_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant)
-
-print(hartley_interval_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check data dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$pID))))
-print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (",
-            length(levels(long_data_clean$DOMAIN)), "), INTERVAL (",
-            length(levels(long_data_clean$INTERVAL)), "), temporal_DO (",
-            length(levels(long_data_clean$temporal_DO)), "), interval_DO (",
-            length(levels(long_data_clean$interval_DO)), ")", sep = ""))
-
-# Check for complete cases
-complete_cases <- sum(complete.cases(long_data_clean))
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean)))
-
-# Check if design is balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL)
-if(all(design_balance %in% c(0, 1))) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (2 levels: 5, 10)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = DGEN_SCORE,
-                             wid = pID,
-                             between = .(temporal_DO, interval_DO),
-                             within = .(TIME, DOMAIN, INTERVAL),
-                             type = 3,
-                             detailed = TRUE)
-
-print("ANOVA Results:")
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL # Reset row numbers to be sequential
-print(anova_output)
-
-# Show Mauchly's test for sphericity
-print("\nMauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  print("\n=== CORRECTED DEGREES OF FREEDOM ===")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  print("=== CORRECTED F-TESTS ===")
-  corrected_results <- data.frame(
-    Effect = corrected_df$Effect,
-    Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)],
-    Original_DFn = corrected_df$Original_DFn,
-    Original_DFd = corrected_df$Original_DFd,
-    GG_DFn = corrected_df$GG_DFn,
-    GG_DFd = corrected_df$GG_DFd,
-    HF_DFn = corrected_df$HF_DFn,
-    HF_DFd = corrected_df$HF_DFd,
-    GG_p = sphericity_corr$`p[GG]`,
-    HF_p = sphericity_corr$`p[HF]`
-  )
-  print(corrected_results)
-} else {
-  print("\nNote: Sphericity corrections not needed (sphericity assumption met)")
-}
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Extract generalized eta squared from ezANOVA (already calculated)
-effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")]
-effect_sizes$ges <- round(effect_sizes$ges, 5)
-print("Generalized Eta Squared:")
-print(effect_sizes)
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
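# A hedged aside, not part of the original script: the corrected F-tests
# assembled above keep the uncorrected F statistic and rescale both degrees
# of freedom by the epsilon estimate, so each corrected p-value is simply the
# upper tail of that rescaled F distribution. Minimal sketch with made-up
# numbers (the values below are hypothetical, not taken from the data):
F_obs <- 4.20; DFn <- 2; DFd <- 190   # hypothetical uncorrected test
gg_eps <- 0.85                        # hypothetical Greenhouse-Geisser epsilon
p_gg <- pf(F_obs, DFn * gg_eps, DFd * gg_eps, lower.tail = FALSE)
print(round(p_gg, 5))                 # the computation behind the p[GG] column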
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output)
-aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)),
-                 data = long_data_clean)
-
-# Main effects
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print("Estimated Marginal Means:")
-print(time_emmeans)
-print("\nPairwise Contrasts:")
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
-print("Estimated Marginal Means:")
-print(domain_emmeans)
-print("\nPairwise Contrasts:")
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-print("\nMain Effect of INTERVAL:")
-interval_emmeans <- emmeans(aov_model, ~ INTERVAL)
-print("Estimated Marginal Means:")
-print(interval_emmeans)
-print("\nPairwise Contrasts:")
-interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni")
-print(interval_contrasts)
-
-print("\nMain Effect of temporal_DO:")
-temporal_emmeans <- emmeans(aov_model, ~ temporal_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-print("\nMain Effect of interval_DO:")
-interval_do_emmeans <- emmeans(aov_model, ~ interval_DO)
-interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni")
-print(interval_do_contrasts)
-
-# =============================================================================
-# TWO-WAY INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TIME × DOMAIN Interaction
-print("\n=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TIME:")
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-
-print("\nSimple Effects of TIME within each DOMAIN:")
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(time_domain_simple2)
-
-# TIME × INTERVAL Interaction
-print("\n=== TIME × INTERVAL INTERACTION ===")
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL)
-print("Estimated Marginal Means:")
-print(time_interval_emmeans)
-
-print("\nSimple Effects of INTERVAL within each TIME:")
-time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_interval_simple)
-
-print("\nSimple Effects of TIME within each INTERVAL:")
-time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni")
-print(time_interval_simple2)
-
-# DOMAIN × INTERVAL Interaction
-print("\n=== DOMAIN × INTERVAL INTERACTION ===")
-domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL)
-print("Estimated Marginal Means:")
-print(domain_interval_emmeans)
-
-print("\nSimple Effects of INTERVAL within each DOMAIN:")
-domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(domain_interval_simple)
-
-print("\nSimple Effects of DOMAIN within each INTERVAL:")
-domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni")
-print(domain_interval_simple2)
-
-# Between-subjects interactions
-print("\n=== temporal_DO × interval_DO INTERACTION ===")
-temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO)
-print("Estimated Marginal Means:")
-print(temporal_interval_do_emmeans)
-
-print("\nSimple Effects of interval_DO within each temporal_DO:")
-temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni")
-print(temporal_interval_do_simple)
-
-print("\nSimple Effects of temporal_DO within each interval_DO:")
-temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni")
-print(temporal_interval_do_simple2)
-
-# =============================================================================
-# THREE-WAY INTERACTION ANALYSES
-# =============================================================================
-
-# TIME × DOMAIN × INTERVAL Interaction
-print("\n=== TIME × DOMAIN × INTERVAL INTERACTION ===")
-three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL)
-print("Estimated Marginal Means:")
-print(three_way_emmeans)
-
-print("\nSimple Effects of TIME within each DOMAIN × INTERVAL combination:")
-three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni")
-print(three_way_contrasts)
-
-# Between-subjects × within-subjects interactions
-print("\n=== temporal_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-print("\nSimple Effects of TIME within each temporal_DO:")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-
-print("\nSimple Effects of temporal_DO within each TIME:")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# =============================================================================
-# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS
-# =============================================================================
-
-print("\n=== COHEN'S D FOR SIGNIFICANT EFFECTS ===")
-
-# Function to calculate Cohen's d for pairwise comparisons
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if(nrow(significant_pairs) > 0) {
-    print("Significant pairwise comparisons (p < 0.05):")
-    print(significant_pairs)
-
-    # Calculate Cohen's d for all significant pairs
-    cohens_d_results <- data.frame()
-
-    for(i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- as.character(comparison$contrast)
-
-      # Parse the contrast
-      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-      if(length(contrast_parts) == 2) {
-        level1 <- trimws(contrast_parts[1])
-        level2 <- trimws(contrast_parts[2])
-
-        # Get raw data for both conditions
-        if(!is.null(group2_var) && group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][
-            data[[group1_var]] == level1 &
-            data[[group2_var]] == group2_level]
-
-          data2 <- data[[response_var]][
-            data[[group1_var]] == level2 &
-            data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if(length(data1) > 0 && length(data2) > 0) {
-          # Calculate Cohen's d using effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          result_row <- data.frame(
-            Comparison = contrast_name,
-            n1 = length(data1),
-            n2 = length(data2),
-            Cohens_d = round(cohens_d_result$estimate, 5),
-            Effect_size = cohens_d_result$magnitude,
-            p_value = round(comparison$p.value, 5)
-          )
-
-          if(!is.null(group2_var) && group2_var %in% colnames(comparison)) {
-            result_row$Group_level <- group2_level
-          }
-
-          cohens_d_results <- rbind(cohens_d_results, result_row)
-        }
-      }
-    }
-
-    if(nrow(cohens_d_results) > 0) {
-      print("Cohen's d results:")
-      print(cohens_d_results)
-    }
-  } else {
-    print("No significant pairwise comparisons found.")
-  }
-}
-
-# Calculate Cohen's d for main effects
-print("Cohen's d for significant main effects:")
-time_contrasts_df <- as.data.frame(time_contrasts)
-calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE")
-
-domain_contrasts_df <- as.data.frame(domain_contrasts)
-calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE")
-
-interval_contrasts_df <- as.data.frame(interval_contrasts)
-calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE")
-
-# Calculate Cohen's d for two-way interactions
-print("\nCohen's d for significant two-way interactions:")
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE")
-
-time_domain_simple2_df <- as.data.frame(time_domain_simple2)
-calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE")
-
-# Calculate Cohen's d for three-way interaction
-print("\nCohen's d for significant three-way interaction effects:")
-three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ]
-
-if(nrow(significant_three_way) > 0) {
-  three_way_cohens_d <- data.frame()
-
-  for(i in seq_len(nrow(significant_three_way))) {
-    comparison <- significant_three_way[i, ]
-
-    # Extract the grouping variables
-    domain_level <- as.character(comparison$DOMAIN)
-    interval_level <- as.character(comparison$INTERVAL)
-
-    # Get data for Past and Future within this DOMAIN × INTERVAL combination
-    past_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$INTERVAL == interval_level &
-      long_data_clean$TIME == "Past"
-    ]
-
-    future_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$INTERVAL == interval_level &
-      long_data_clean$TIME == "Future"
-    ]
-
-    if(length(past_data) > 0 && length(future_data) > 0) {
-      # Calculate Cohen's d using effsize package
-      cohens_d_result <- cohen.d(past_data, future_data)
-
-      result_row <- data.frame(
-        DOMAIN = domain_level,
-        INTERVAL = interval_level,
-        Comparison = "Past vs Future",
-        n_Past = length(past_data),
-        n_Future = length(future_data),
-        Cohens_d = round(cohens_d_result$estimate, 5),
-        Effect_size = cohens_d_result$magnitude,
-        p_value = round(comparison$p.value, 5),
-        Estimated_difference = round(comparison$estimate, 5)
-      )
-
-      three_way_cohens_d <- rbind(three_way_cohens_d, result_row)
-    }
-  }
-
-  if(nrow(three_way_cohens_d) > 0) {
-    print("Three-way interaction Cohen's d results:")
-    print(three_way_cohens_d)
-  }
-} else {
-  print("No significant TIME effects found within any DOMAIN × INTERVAL combination.")
-}
-
-print("=== ANALYSIS COMPLETE ===")
\ No newline at end of file
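Both copies of this script load afex "for aov_ez" but never call it. As a
cross-check on the ezANOVA results above, a minimal sketch of the equivalent
afex call follows; the object name afex_model and the call itself are
illustrative additions under that assumption, not part of the deleted files:

library(afex)

# Same design as the ezANOVA call: two between-subjects factors and three
# within-subjects factors, with pID identifying participants; requests the
# Greenhouse-Geisser correction and generalized eta squared in the table.
afex_model <- aov_ez(
  id = "pID",
  dv = "DGEN_SCORE",
  data = long_data_clean,
  between = c("temporal_DO", "interval_DO"),
  within = c("TIME", "DOMAIN", "INTERVAL"),
  anova_table = list(correction = "GG", es = "ges")
)
print(afex_model)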
diff --git a/.history/eohi2/mixed anova - DGEN_20251006192548.r b/.history/eohi2/mixed anova - DGEN_20251006192548.r
deleted file mode 100644
index 262803e..0000000
--- a/.history/eohi2/mixed anova - DGEN_20251006192548.r
+++ /dev/null
@@ -1,740 +0,0 @@
-# Mixed ANOVA Analysis for DGEN Variables
-# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors
-# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val,
-#            DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex) # For aov_ez (cleaner ANOVA output)
-library(nortest) # For normality tests
-library(emmeans) # For post-hoc comparisons
-library(purrr) # For map functions
-library(effsize) # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-# Read the data
-data <- read.csv("eohi2.csv")
-
-# Verify the specific variables we need
-required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val",
-                   "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val",
-                   "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val",
-                   "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val",
-                   "DGEN_5.10past_Pref", "DGEN_5.10past_Pers", "DGEN_5.10past_Val",
-                   "DGEN_5.10fut_Pref", "DGEN_5.10fut_Pers", "DGEN_5.10fut_Val")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-}
-
-# Define variable mapping for the three within-subjects factors
-variable_mapping <- data.frame(
-  variable = required_vars,
-  TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3),
-           rep("Past", 3), rep("Future", 3)),
-  DOMAIN = rep(c("Preferences", "Personality", "Values"), 6),
-  INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3),
-               rep("5_10", 3), rep("5_10", 3)),
-  stringsAsFactors = FALSE
-)
-
-print("Variable mapping:")
-print(variable_mapping)
-
-# Efficient data pivoting using pivot_longer
-long_data <- data %>%
-  select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>%
-  pivot_longer(
-    cols = all_of(required_vars),
-    names_to = "variable",
-    values_to = "DGEN_SCORE"
-  ) %>%
-  left_join(variable_mapping, by = "variable") %>%
-  # Convert to factors with proper levels
-  mutate(
-    TIME = factor(TIME, levels = c("Past", "Future")),
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")),
-    INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")),
-    pID = as.factor(ResponseId), # Use ResponseId as participant ID
-    temporal_DO = as.factor(TEMPORAL_DO),
-    interval_DO = as.factor(INTERVAL_DO)
-  ) %>%
-  # Select final columns and remove any rows with missing values
-  select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>%
-  filter(!is.na(DGEN_SCORE))
-
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5),
-    variance = round(var(DGEN_SCORE, na.rm = TRUE), 5),
-    sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5),
-    median = round(median(DGEN_SCORE, na.rm = TRUE), 5),
-    q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5),
-    min = round(min(DGEN_SCORE, na.rm = TRUE), 5),
-    max = round(max(DGEN_SCORE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:")
-print(desc_stats)
-
-# Descriptive statistics by between-subjects factors
-desc_stats_by_between <- long_data %>%
-  group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5),
-    variance = round(var(DGEN_SCORE, na.rm = TRUE), 5),
-    sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by between-subjects factors:")
-print(desc_stats_by_between)
-
-# Calculate mean differences for key comparisons
-print("\n=== KEY MEAN DIFFERENCES ===")
-
-# Past vs Future differences for each DOMAIN × INTERVAL combination
-past_future_diffs <- long_data %>%
-  group_by(DOMAIN, INTERVAL, pID) %>%
-  summarise(
-    past_score = DGEN_SCORE[TIME == "Past"],
-    future_score = DGEN_SCORE[TIME == "Future"],
-    difference = past_score - future_score,
-    .groups = 'drop'
-  ) %>%
-  group_by(DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    mean_diff = round(mean(difference, na.rm = TRUE), 5),
-    sd_diff = round(sd(difference, na.rm = TRUE), 5),
-    se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5),
-    .groups = 'drop'
-  )
-
-print("Past vs Future differences by DOMAIN × INTERVAL:")
-print(past_future_diffs)
-
-# 5 vs 10 interval differences for each TIME × DOMAIN combination
-interval_diffs <- long_data %>%
-  group_by(TIME, DOMAIN, pID) %>%
-  summarise(
-    interval_5_score = DGEN_SCORE[INTERVAL == "5"],
-    interval_10_score = DGEN_SCORE[INTERVAL == "10"],
-    difference = interval_5_score - interval_10_score,
-    .groups = 'drop'
-  ) %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean_diff = round(mean(difference, na.rm = TRUE), 5),
-    sd_diff = round(sd(difference, na.rm = TRUE), 5),
-    se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5),
-    .groups = 'drop'
-  )
-
-print("\n5 vs 10 interval differences by TIME × DOMAIN:")
-print(interval_diffs)
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing
-long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ]
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(DGEN_SCORE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME, DOMAIN, and INTERVAL:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    mean = mean(DGEN_SCORE),
-    sd = sd(DGEN_SCORE),
-    q1 = quantile(DGEN_SCORE, 0.25),
-    q3 = quantile(DGEN_SCORE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$DGEN_SCORE)$statistic,
-    ad_p_value = ad.test(.data$DGEN_SCORE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# Round only the numeric columns
-normality_results_rounded <- normality_results %>%
-  mutate(across(where(is.numeric), ~ round(.x, 5)))
-print(normality_results_rounded)
-
-# 4. Homogeneity of variance tests
-# Test homogeneity across TIME within each DOMAIN × INTERVAL combination
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN, INTERVAL) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME × INTERVAL combination
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME, INTERVAL) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:")
-print(homogeneity_domain)
-
-# Test homogeneity across INTERVAL within each TIME × DOMAIN combination
-homogeneity_interval <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:")
-print(homogeneity_interval)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# More efficient bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!rlang::sym(group_var)) %>%
-    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once
-  group_data_list <- map(groups, ~ {
-    group_data <- data[data[[group_var]] == .x, response_var]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across between-subjects factors within each within-subjects combination
-print("\n=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123) # For reproducibility
-
-# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination
-print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:")
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination
-print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:")
-hartley_interval_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant)
-
-print(hartley_interval_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check data dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$pID))))
-print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (",
-            length(levels(long_data_clean$DOMAIN)), "), INTERVAL (",
-            length(levels(long_data_clean$INTERVAL)), "), temporal_DO (",
-            length(levels(long_data_clean$temporal_DO)), "), interval_DO (",
-            length(levels(long_data_clean$interval_DO)), ")", sep = ""))
-
-# Check for complete cases
-complete_cases <- sum(complete.cases(long_data_clean))
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean)))
-
-# Check if design is balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL)
-if(all(design_balance %in% c(0, 1))) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (2 levels: 5, 10)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = DGEN_SCORE,
-                             wid = pID,
-                             between = .(temporal_DO, interval_DO),
-                             within = .(TIME, DOMAIN, INTERVAL),
-                             type = 3,
-                             detailed = TRUE)
-
-print("ANOVA Results:")
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL # Reset row numbers to be sequential
-print(anova_output)
-
-# Show Mauchly's test for sphericity
-print("\nMauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  print("\n=== CORRECTED DEGREES OF FREEDOM ===")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  print("=== CORRECTED F-TESTS ===")
-  corrected_results <- data.frame(
-    Effect = corrected_df$Effect,
-    Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)],
-    Original_DFn = corrected_df$Original_DFn,
-    Original_DFd = corrected_df$Original_DFd,
-    GG_DFn = corrected_df$GG_DFn,
-    GG_DFd = corrected_df$GG_DFd,
-    HF_DFn = corrected_df$HF_DFn,
-    HF_DFd = corrected_df$HF_DFd,
-    GG_p = sphericity_corr$`p[GG]`,
-    HF_p = sphericity_corr$`p[HF]`
-  )
-  print(corrected_results)
-} else {
-  print("\nNote: Sphericity corrections not needed (sphericity assumption met)")
-}
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Extract generalized eta squared from ezANOVA (already calculated)
-effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")]
-effect_sizes$ges <- round(effect_sizes$ges, 5)
-print("Generalized Eta Squared:")
-print(effect_sizes)
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
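# A hedged aside, not part of the original file: in the pairs() calls below,
# adjust = "bonferroni" multiplies each raw p-value by the number of contrasts
# in the family (capped at 1); with three DOMAIN levels that family size is
# choose(3, 2) = 3. Tiny illustration with made-up raw p-values:
raw_p <- c(0.004, 0.020, 0.400)                 # hypothetical unadjusted p-values
print(pmin(1, raw_p * length(raw_p)))           # the Bonferroni step by hand
print(p.adjust(raw_p, method = "bonferroni"))   # base-R equivalent, same result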
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output)
-aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)),
-                 data = long_data_clean)
-
-# Main effects
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print("Estimated Marginal Means:")
-print(time_emmeans)
-print("\nPairwise Contrasts:")
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
-print("Estimated Marginal Means:")
-print(domain_emmeans)
-print("\nPairwise Contrasts:")
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-print("\nMain Effect of INTERVAL:")
-interval_emmeans <- emmeans(aov_model, ~ INTERVAL)
-print("Estimated Marginal Means:")
-print(interval_emmeans)
-print("\nPairwise Contrasts:")
-interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni")
-print(interval_contrasts)
-
-print("\nMain Effect of temporal_DO:")
-temporal_emmeans <- emmeans(aov_model, ~ temporal_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-print("\nMain Effect of interval_DO:")
-interval_do_emmeans <- emmeans(aov_model, ~ interval_DO)
-interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni")
-print(interval_do_contrasts)
-
-# =============================================================================
-# TWO-WAY INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TIME × DOMAIN Interaction
-print("\n=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN)
-print("Estimated Marginal Means:")
-print(time_domain_emmeans)
-
-print("\nSimple Effects of DOMAIN within each TIME:")
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-
-print("\nSimple Effects of TIME within each DOMAIN:")
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(time_domain_simple2)
-
-# TIME × INTERVAL Interaction
-print("\n=== TIME × INTERVAL INTERACTION ===")
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL)
-print("Estimated Marginal Means:")
-print(time_interval_emmeans)
-
-print("\nSimple Effects of INTERVAL within each TIME:")
-time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_interval_simple)
-
-print("\nSimple Effects of TIME within each INTERVAL:")
-time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni")
-print(time_interval_simple2)
-
-# DOMAIN × INTERVAL Interaction
-print("\n=== DOMAIN × INTERVAL INTERACTION ===")
-domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL)
-print("Estimated Marginal Means:")
-print(domain_interval_emmeans)
-
-print("\nSimple Effects of INTERVAL within each DOMAIN:")
-domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(domain_interval_simple)
-
-print("\nSimple Effects of DOMAIN within each INTERVAL:")
-domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni")
-print(domain_interval_simple2)
-
-# Between-subjects interactions
-print("\n=== temporal_DO × interval_DO INTERACTION ===")
-temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO)
-print("Estimated Marginal Means:")
-print(temporal_interval_do_emmeans)
-
-print("\nSimple Effects of interval_DO within each temporal_DO:")
-temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni")
-print(temporal_interval_do_simple)
-
-print("\nSimple Effects of temporal_DO within each interval_DO:")
-temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni")
-print(temporal_interval_do_simple2)
-
-# =============================================================================
-# THREE-WAY INTERACTION ANALYSES
-# =============================================================================
-
-# TIME × DOMAIN × INTERVAL Interaction
-print("\n=== TIME × DOMAIN × INTERVAL INTERACTION ===")
-three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL)
-print("Estimated Marginal Means:")
-print(three_way_emmeans)
-
-print("\nSimple Effects of TIME within each DOMAIN × INTERVAL combination:")
-three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni")
-print(three_way_contrasts)
-
-# Between-subjects × within-subjects interactions
-print("\n=== temporal_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME)
-print("Estimated Marginal Means:")
-print(temporal_time_emmeans)
-
-print("\nSimple Effects of TIME within each temporal_DO:")
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-
-print("\nSimple Effects of temporal_DO within each TIME:")
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# =============================================================================
-# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS
-# =============================================================================
-
-print("\n=== COHEN'S D FOR SIGNIFICANT EFFECTS ===")
-
-# Function to calculate Cohen's d for pairwise comparisons
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if(nrow(significant_pairs) > 0) {
-    print("Significant pairwise comparisons (p < 0.05):")
-    print(significant_pairs)
-
-    # Calculate Cohen's d for all significant pairs
-    cohens_d_results <- data.frame()
-
-    for(i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- as.character(comparison$contrast)
-
-      # Parse the contrast
-      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-      if(length(contrast_parts) == 2) {
-        level1 <- trimws(contrast_parts[1])
-        level2 <- trimws(contrast_parts[2])
-
-        # Get raw data for both conditions
-        if(!is.null(group2_var) && group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][
-            data[[group1_var]] == level1 &
-            data[[group2_var]] == group2_level]
-
-          data2 <- data[[response_var]][
-            data[[group1_var]] == level2 &
-            data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if(length(data1) > 0 && length(data2) > 0) {
-          # Calculate Cohen's d using effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          result_row <- data.frame(
-            Comparison = contrast_name,
-            n1 = length(data1),
-            n2 = length(data2),
-            Cohens_d = round(cohens_d_result$estimate, 5),
-            Effect_size = cohens_d_result$magnitude,
-            p_value = round(comparison$p.value, 5)
-          )
-
-          if(!is.null(group2_var) && group2_var %in% colnames(comparison)) {
-            result_row$Group_level <- group2_level
-          }
-
-          cohens_d_results <- rbind(cohens_d_results, result_row)
-        }
-      }
-    }
-
-    if(nrow(cohens_d_results) > 0) {
-      print("Cohen's d results:")
-      print(cohens_d_results)
-    }
-  } else {
-    print("No significant pairwise comparisons found.")
-  }
-}
-
-# Calculate Cohen's d for main effects
-print("Cohen's d for significant main effects:")
-time_contrasts_df <- as.data.frame(time_contrasts)
-calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE")
-
-domain_contrasts_df <- as.data.frame(domain_contrasts)
-calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE")
-
-interval_contrasts_df <- as.data.frame(interval_contrasts)
-calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE")
-
-# Calculate Cohen's d for two-way interactions
-print("\nCohen's d for significant two-way interactions:")
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE")
-
-time_domain_simple2_df <- as.data.frame(time_domain_simple2)
-calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE")
-
-# Calculate Cohen's d for three-way interaction
-print("\nCohen's d for significant three-way interaction effects:")
-three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ]
-
-if(nrow(significant_three_way) > 0) {
-  three_way_cohens_d <- data.frame()
-
-  for(i in seq_len(nrow(significant_three_way))) {
-    comparison <- significant_three_way[i, ]
-
-    # Extract the grouping variables
-    domain_level <- as.character(comparison$DOMAIN)
-    interval_level <- as.character(comparison$INTERVAL)
-
-    # Get data for Past and Future within this DOMAIN × INTERVAL combination
-    past_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$INTERVAL == interval_level &
-      long_data_clean$TIME == "Past"
-    ]
-
-    future_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$INTERVAL == interval_level &
-      long_data_clean$TIME == "Future"
-    ]
-
-    if(length(past_data) > 0 && length(future_data) > 0) {
-      # Calculate Cohen's d using effsize package
-      cohens_d_result <- cohen.d(past_data, future_data)
-
-      result_row <- data.frame(
-        DOMAIN = domain_level,
-        INTERVAL = interval_level,
-        Comparison = "Past vs Future",
-        n_Past = length(past_data),
-        n_Future = length(future_data),
-        Cohens_d = round(cohens_d_result$estimate, 5),
-        Effect_size = cohens_d_result$magnitude,
-        p_value = round(comparison$p.value, 5),
-        Estimated_difference = round(comparison$estimate, 5)
-      )
-
-      three_way_cohens_d <- rbind(three_way_cohens_d, result_row)
-    }
-  }
-
-  if(nrow(three_way_cohens_d) > 0) {
-    print("Three-way interaction Cohen's d results:")
-    print(three_way_cohens_d)
-  }
-} else {
-  print("No significant TIME effects found within any DOMAIN × INTERVAL combination.")
-}
-
-print("=== ANALYSIS COMPLETE ===")
\ No newline at end of file
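One caveat about the Cohen's d block that closes each copy of this script:
cohen.d() is applied to the raw Past and Future vectors as if they came from
independent groups, even though TIME is a within-subjects factor, so the
reported d ignores the pairing. A sketch of the paired alternative follows; it
assumes the script's long_data_clean object and treats the Values × 5-year
cell as an arbitrary illustration (the cell choice and the reshaping code are
additions, not part of the deleted files):

library(dplyr)
library(tidyr)
library(effsize)

# Put each participant's Past and Future scores side by side for one cell,
# then request the paired statistic from effsize.
cell <- long_data_clean %>%
  filter(DOMAIN == "Values", INTERVAL == "5") %>%
  select(pID, TIME, DGEN_SCORE) %>%
  pivot_wider(names_from = TIME, values_from = DGEN_SCORE) %>%
  filter(!is.na(Past), !is.na(Future))

cohen.d(cell$Past, cell$Future, paired = TRUE)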
diff --git a/.history/eohi2/mixed anova - DGEN_20251006192554.r b/.history/eohi2/mixed anova - DGEN_20251006192554.r
deleted file mode 100644
index c1af0c3..0000000
--- a/.history/eohi2/mixed anova - DGEN_20251006192554.r
+++ /dev/null
@@ -1,739 +0,0 @@
-# Mixed ANOVA Analysis for DGEN Variables
-# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors
-# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val,
-#            DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex) # For aov_ez (cleaner ANOVA output)
-library(nortest) # For normality tests
-library(emmeans) # For post-hoc comparisons
-library(purrr) # For map functions
-library(effsize) # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-# Read the data
-data <- read.csv("eohi2.csv")
-
-# Verify the specific variables we need
-required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val",
-                   "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val",
-                   "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val",
-                   "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val",
-                   "DGEN_5.10past_Pref", "DGEN_5.10past_Pers", "DGEN_5.10past_Val",
-                   "DGEN_5.10fut_Pref", "DGEN_5.10fut_Pers", "DGEN_5.10fut_Val")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-}
-
-# Define variable mapping for the three within-subjects factors
-variable_mapping <- data.frame(
-  variable = required_vars,
-  TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3),
-           rep("Past", 3), rep("Future", 3)),
-  DOMAIN = rep(c("Preferences", "Personality", "Values"), 6),
-  INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3),
-               rep("5_10", 3), rep("5_10", 3)),
-  stringsAsFactors = FALSE
-)
-
-print("Variable mapping:")
-print(variable_mapping)
-
-# Efficient data pivoting using pivot_longer
-long_data <- data %>%
-  select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>%
-  pivot_longer(
-    cols = all_of(required_vars),
-    names_to = "variable",
-    values_to = "DGEN_SCORE"
-  ) %>%
-  left_join(variable_mapping, by = "variable") %>%
-  # Convert to factors with proper levels
-  mutate(
-    TIME = factor(TIME, levels = c("Past", "Future")),
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")),
-    INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")),
-    pID = as.factor(ResponseId), # Use ResponseId as participant ID
-    temporal_DO = as.factor(TEMPORAL_DO),
-    interval_DO = as.factor(INTERVAL_DO)
-  ) %>%
-  # Select final columns and remove any rows with missing values
-  select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>%
-  filter(!is.na(DGEN_SCORE))
-
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5),
-    variance = round(var(DGEN_SCORE, na.rm = TRUE), 5),
-    sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5),
-    median = round(median(DGEN_SCORE, na.rm = TRUE), 5),
-    q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5),
-    min = round(min(DGEN_SCORE, na.rm = TRUE), 5),
-    max = round(max(DGEN_SCORE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:")
-print(desc_stats)
-
-# Descriptive statistics by between-subjects factors
-desc_stats_by_between <- long_data %>%
-  group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5),
-    variance = round(var(DGEN_SCORE, na.rm = TRUE), 5),
-    sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by between-subjects factors:")
-print(desc_stats_by_between)
-
-# Calculate mean differences for key comparisons
-print("\n=== KEY MEAN DIFFERENCES ===")
-
-# Past vs Future differences for each DOMAIN × INTERVAL combination
-past_future_diffs <- long_data %>%
-  group_by(DOMAIN, INTERVAL, pID) %>%
-  summarise(
-    past_score = DGEN_SCORE[TIME == "Past"],
-    future_score = DGEN_SCORE[TIME == "Future"],
-    difference = past_score - future_score,
-    .groups = 'drop'
-  ) %>%
-  group_by(DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    mean_diff = round(mean(difference, na.rm = TRUE), 5),
-    sd_diff = round(sd(difference, na.rm = TRUE), 5),
-    se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5),
-    .groups = 'drop'
-  )
-
-print("Past vs Future differences by DOMAIN × INTERVAL:")
-print(past_future_diffs)
-
-# 5 vs 10 interval differences for each TIME × DOMAIN combination
-interval_diffs <- long_data %>%
-  group_by(TIME, DOMAIN, pID) %>%
-  summarise(
-    interval_5_score = DGEN_SCORE[INTERVAL == "5"],
-    interval_10_score = DGEN_SCORE[INTERVAL == "10"],
-    difference = interval_5_score - interval_10_score,
-    .groups = 'drop'
-  ) %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean_diff = round(mean(difference, na.rm = TRUE), 5),
-    sd_diff = round(sd(difference, na.rm = TRUE), 5),
-    se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5),
-    .groups = 'drop'
-  )
-
-print("\n5 vs 10 interval differences by TIME × DOMAIN:")
-print(interval_diffs)
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing
-long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ]
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(DGEN_SCORE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME, DOMAIN, and INTERVAL:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    mean = mean(DGEN_SCORE),
-    sd = sd(DGEN_SCORE),
-    q1 = quantile(DGEN_SCORE, 0.25),
-    q3 = quantile(DGEN_SCORE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$DGEN_SCORE)$statistic,
-    ad_p_value = ad.test(.data$DGEN_SCORE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# Round only the numeric columns
-normality_results_rounded <- normality_results %>%
-  mutate(across(where(is.numeric), ~ round(.x, 5)))
-print(normality_results_rounded)
-
-# 4. Homogeneity of variance tests
-# Test homogeneity across TIME within each DOMAIN × INTERVAL combination
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN, INTERVAL) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME × INTERVAL combination
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME, INTERVAL) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:")
-print(homogeneity_domain)
-
-# Test homogeneity across INTERVAL within each TIME × DOMAIN combination
-homogeneity_interval <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:")
-print(homogeneity_interval)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# More efficient bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!rlang::sym(group_var)) %>%
-    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once
-  group_data_list <- map(groups, ~ {
-    group_data <- data[data[[group_var]] == .x, response_var]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across between-subjects factors within each within-subjects combination
-print("\n=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123) # For reproducibility
-
-# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination
-print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:")
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination
-print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:")
-hartley_interval_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant)
-
-print(hartley_interval_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check data dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$pID))))
-print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (",
-            length(levels(long_data_clean$DOMAIN)), "), INTERVAL (",
-            length(levels(long_data_clean$INTERVAL)), "), temporal_DO (",
-            length(levels(long_data_clean$temporal_DO)), "), interval_DO (",
-            length(levels(long_data_clean$interval_DO)), ")", sep = ""))
-
-# Check for complete cases
-complete_cases <- sum(complete.cases(long_data_clean))
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean)))
-
-# Check if design is balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL)
-if(all(design_balance %in% c(0, 1))) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels)
-# Within-subjects: TIME (2 levels: Past,
Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:")
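# ---------------------------------------------------------------------------
# [Editor's aside: illustrative sketch only, not part of the deleted script.]
# The pairs() calls below use adjust = "bonferroni", which multiplies each raw
# p-value by the number of contrasts in the family and caps the result at 1;
# this is the same arithmetic as stats::p.adjust. A minimal check with three
# hypothetical raw p-values:
p_raw <- c(0.010, 0.020, 0.400)        # hypothetical raw p-values
p.adjust(p_raw, method = "bonferroni") # 0.03 0.06 1.00
# ---------------------------------------------------------------------------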
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("\nMain Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("\nMain Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("\n=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print("Estimated Marginal Means:") -print(time_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each TIME:") -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) - -print("\nSimple Effects of TIME within each INTERVAL:") -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("\n=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(domain_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each DOMAIN:") -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) - -print("\nSimple Effects of DOMAIN within each INTERVAL:") -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("\n=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print("Estimated Marginal Means:") -print(temporal_interval_do_emmeans) - -print("\nSimple Effects of interval_DO within each temporal_DO:") -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) - -print("\nSimple Effects of temporal_DO within each interval_DO:") -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, 
by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("\n=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each DOMAIN × INTERVAL combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("\n=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each temporal_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of temporal_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print("Significant pairwise comparisons (p < 0.05):") - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print("Cohen's d results:") - print(cohens_d_results) - } - } else { - 
print("No significant pairwise comparisons found.") - } -} - -# Calculate Cohen's d for main effects -print("Cohen's d for significant main effects:") -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -print("\nCohen's d for significant two-way interactions:") -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -print("\nCohen's d for significant three-way interaction effects:") -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print("Three-way interaction Cohen's d results:") - print(three_way_cohens_d) - } -} else { - print("No significant TIME effects found within any DOMAIN × INTERVAL combination.") -} - -print("=== ANALYSIS COMPLETE ===") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006192606.r b/.history/eohi2/mixed anova - DGEN_20251006192606.r deleted file mode 100644 index 0d91911..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006192606.r +++ /dev/null @@ -1,726 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, 
DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "DGEN_5.10past_Pref", "DGEN_5.10past_Pers", "DGEN_5.10past_Val", - "DGEN_5.10fut_Pref", "DGEN_5.10fut_Pers", "DGEN_5.10fut_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and 
INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd =
anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("\nMain Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("\nMain Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# 
============================================================================= - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("\n=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print("Estimated Marginal Means:") -print(time_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each TIME:") -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) - -print("\nSimple Effects of TIME within each INTERVAL:") -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("\n=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(domain_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each DOMAIN:") -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) - -print("\nSimple Effects of DOMAIN within each INTERVAL:") -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("\n=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print("Estimated Marginal Means:") -print(temporal_interval_do_emmeans) - -print("\nSimple Effects of interval_DO within each temporal_DO:") -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) - -print("\nSimple Effects of temporal_DO within each interval_DO:") -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("\n=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each DOMAIN × INTERVAL combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("\n=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each temporal_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) - 
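# ---------------------------------------------------------------------------
# [Editor's aside: illustrative sketch only, not part of the deleted script.]
# The Cohen's d section further down relies on effsize::cohen.d, which for two
# independent samples computes d = (mean(x) - mean(y)) / s_pooled. A minimal
# hand calculation with two hypothetical score vectors:
x <- c(5, 6, 7, 8)                     # hypothetical group 1 scores
y <- c(4, 5, 5, 6)                     # hypothetical group 2 scores
s_pooled <- sqrt(((length(x) - 1) * var(x) + (length(y) - 1) * var(y)) /
                   (length(x) + length(y) - 2))
(mean(x) - mean(y)) / s_pooled         # ~1.39
# ---------------------------------------------------------------------------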
-print("\nSimple Effects of temporal_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print("Significant pairwise comparisons (p < 0.05):") - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print("Cohen's d results:") - print(cohens_d_results) - } - } else { - print("No significant pairwise comparisons found.") - } -} - -# Calculate Cohen's d for main effects -print("Cohen's d for significant main effects:") -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -print("\nCohen's d for significant two-way interactions:") -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -print("\nCohen's d for significant 
three-way interaction effects:") -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print("Three-way interaction Cohen's d results:") - print(three_way_cohens_d) - } -} else { - print("No significant TIME effects found within any DOMAIN × INTERVAL combination.") -} - -print("=== ANALYSIS COMPLETE ===") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006192619.r b/.history/eohi2/mixed anova - DGEN_20251006192619.r deleted file mode 100644 index 0569e40..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006192619.r +++ /dev/null @@ -1,724 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "DGEN_5.10past_Pref", "DGEN_5.10past_Pers", "DGEN_5.10past_Val", - "DGEN_5.10fut_Pref", "DGEN_5.10fut_Pers", "DGEN_5.10fut_Val") - -missing_vars <- 
required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - 
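# ---------------------------------------------------------------------------
# [Editor's aside: illustrative sketch only, not part of the deleted script.]
# The pipeline above forms one Past - Future difference per participant and
# then summarises those differences per DOMAIN × INTERVAL cell. Equivalently,
# for one cell with three hypothetical participants:
past <- c(3.0, 4.0, 5.0)                    # hypothetical Past scores
fut  <- c(2.5, 3.0, 4.0)                    # hypothetical Future scores
d_i  <- past - fut                          # per-participant difference
c(mean_diff = mean(d_i),                    # 0.833
  se_diff   = sd(d_i) / sqrt(length(d_i)))  # 0.167
# ---------------------------------------------------------------------------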
-print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - 
GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("\nMain Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("\nMain Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction 
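# --- Editor's sketch, not part of the original script: the simple-effects
# tables in this section slice the TIME × DOMAIN cell means one factor at a
# time. A complementary check is the interaction contrast, i.e. whether the
# Past vs Future difference itself differs between domains. This uses the
# documented `interaction` argument of emmeans::contrast() on the `aov_model`
# fitted above; the object names ending in `_sketch` are hypothetical.
td_emm_sketch <- emmeans(aov_model, ~ TIME * DOMAIN)
td_int_sketch <- contrast(td_emm_sketch, interaction = "pairwise",
                          adjust = "bonferroni")
print(td_int_sketch)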
-print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("\n=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print("Estimated Marginal Means:") -print(time_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each TIME:") -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) - -print("\nSimple Effects of TIME within each INTERVAL:") -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("\n=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(domain_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each DOMAIN:") -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) - -print("\nSimple Effects of DOMAIN within each INTERVAL:") -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("\n=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print("Estimated Marginal Means:") -print(temporal_interval_do_emmeans) - -print("\nSimple Effects of interval_DO within each temporal_DO:") -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) - -print("\nSimple Effects of temporal_DO within each interval_DO:") -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("\n=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each DOMAIN × INTERVAL combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("\n=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each temporal_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of temporal_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = 
"TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print("Significant pairwise comparisons (p < 0.05):") - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print("Cohen's d results:") - print(cohens_d_results) - } - } else { - print("No significant pairwise comparisons found.") - } -} - -# Calculate Cohen's d for main effects -print("Cohen's d for significant main effects:") -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -print("\nCohen's d for significant two-way interactions:") -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -print("\nCohen's d for significant three-way interaction effects:") -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- 
three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print("Three-way interaction Cohen's d results:") - print(three_way_cohens_d) - } -} else { - print("No significant TIME effects found within any DOMAIN × INTERVAL combination.") -} - -print("=== ANALYSIS COMPLETE ===") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006192629.r b/.history/eohi2/mixed anova - DGEN_20251006192629.r deleted file mode 100644 index d77b2a9..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006192629.r +++ /dev/null @@ -1,723 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "DGEN_5.10past_Pref", "DGEN_5.10past_Pers", "DGEN_5.10past_Val", - "DGEN_5.10fut_Pref", "DGEN_5.10fut_Pers", "DGEN_5.10fut_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", 
paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each 
TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, 
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("\nMain Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("\nMain Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") 
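# --- Editor's sketch, not part of the original script: before reading the
# simple-effects tables that follow, an interaction plot is a quick visual
# sanity check on the pattern of cell means. emmeans::emmip() draws one
# directly from the `aov_model` fitted above; the plot is purely descriptive
# and plays no inferential role here.
print(emmip(aov_model, DOMAIN ~ TIME, CIs = TRUE))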
-time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("\n=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print("Estimated Marginal Means:") -print(time_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each TIME:") -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) - -print("\nSimple Effects of TIME within each INTERVAL:") -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("\n=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(domain_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each DOMAIN:") -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) - -print("\nSimple Effects of DOMAIN within each INTERVAL:") -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("\n=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print("Estimated Marginal Means:") -print(temporal_interval_do_emmeans) - -print("\nSimple Effects of interval_DO within each temporal_DO:") -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) - -print("\nSimple Effects of temporal_DO within each interval_DO:") -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("\n=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each DOMAIN × INTERVAL combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("\n=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each temporal_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of temporal_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") 
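# --- Editor's sketch, not part of the original script: the Cohen's d helper
# defined in the next section pools the raw scores of the two conditions,
# which treats the levels of a within-subjects factor such as TIME as
# independent samples. effsize::cohen.d() also supports a paired estimate; a
# minimal sketch for one illustrative cell (Preferences at the 5-year
# interval), assuming one Past and one Future score per participant. The
# `_sketch` object names are hypothetical.
paired_wide_sketch <- long_data_clean %>%
  filter(DOMAIN == "Preferences", INTERVAL == "5") %>%
  select(pID, TIME, DGEN_SCORE) %>%
  pivot_wider(names_from = TIME, values_from = DGEN_SCORE) %>%
  drop_na(Past, Future)   # keep only participants with both scores
d_paired_sketch <- cohen.d(paired_wide_sketch$Past, paired_wide_sketch$Future,
                           paired = TRUE)
print(d_paired_sketch)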
-print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print("Significant pairwise comparisons (p < 0.05):") - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print("Cohen's d results:") - print(cohens_d_results) - } - } else { - print("No significant pairwise comparisons found.") - } -} - -# Calculate Cohen's d for main effects -print("Cohen's d for significant main effects:") -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -print("\nCohen's d for significant two-way interactions:") -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -print("\nCohen's d for significant three-way interaction effects:") -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- 
three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print("Three-way interaction Cohen's d results:") - print(three_way_cohens_d) - } -} else { - print("No significant TIME effects found within any DOMAIN × INTERVAL combination.") -} - -print("=== ANALYSIS COMPLETE ===") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006192639.r b/.history/eohi2/mixed anova - DGEN_20251006192639.r deleted file mode 100644 index 825435e..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006192639.r +++ /dev/null @@ -1,719 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "DGEN_5.10past_Pref", "DGEN_5.10past_Pers", "DGEN_5.10past_Val", - "DGEN_5.10fut_Pref", "DGEN_5.10fut_Pers", "DGEN_5.10fut_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", 
paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each 
TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, 
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("\nMain Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("\nMain Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") 
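The simple-effects calls in this block all rely on the same emmeans idiom: pairs(emm, by = f) computes pairwise contrasts among the remaining factors separately within each level of f, and the Bonferroni adjustment is applied within each "by" group rather than across the whole table. A minimal, self-contained illustration on R's built-in warpbreaks data (a stand-in, not the study data) behaves the same way:

# Illustration of the pairs(..., by = ...) idiom on built-in data;
# warpbreaks is a stand-in, not the study data.
library(emmeans)
fit <- aov(breaks ~ wool * tension, data = warpbreaks)
emm <- emmeans(fit, ~ wool * tension)
# Tension contrasts within each wool level; Bonferroni is applied per
# "by" group (here, 3 contrasts per level of wool).
pairs(emm, by = "wool", adjust = "bonferroni")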
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("\n=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print("Estimated Marginal Means:") -print(time_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each TIME:") -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) - -print("\nSimple Effects of TIME within each INTERVAL:") -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("\n=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(domain_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each DOMAIN:") -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) - -print("\nSimple Effects of DOMAIN within each INTERVAL:") -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("\n=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print("Estimated Marginal Means:") -print(temporal_interval_do_emmeans) - -print("\nSimple Effects of interval_DO within each temporal_DO:") -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) - -print("\nSimple Effects of temporal_DO within each interval_DO:") -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("\n=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each DOMAIN × INTERVAL combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("\n=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each temporal_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of temporal_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# 
============================================================================= - -print("\n=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print("Significant pairwise comparisons (p < 0.05):") - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print("Cohen's d results:") - print(cohens_d_results) - } - } else { - print("No significant pairwise comparisons found.") - } -} - -# Calculate Cohen's d for main effects -print("Cohen's d for significant main effects:") -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -print("\nCohen's d for significant two-way interactions:") -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -print("\nCohen's d for significant three-way interaction effects:") -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - 
comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print("Three-way interaction Cohen's d results:") - print(three_way_cohens_d) - } -} else { - print("No significant TIME effects found within any DOMAIN × INTERVAL combination.") -} - -print("=== ANALYSIS COMPLETE ===") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006192701.r b/.history/eohi2/mixed anova - DGEN_20251006192701.r deleted file mode 100644 index 3eb8e1b..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006192701.r +++ /dev/null @@ -1,713 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "DGEN_5.10past_Pref", "DGEN_5.10past_Pers", "DGEN_5.10past_Val", - "DGEN_5.10fut_Pref", "DGEN_5.10fut_Pers", "DGEN_5.10fut_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), 
rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL 
== "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, 
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("\n=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print("Estimated Marginal Means:") -print(time_domain_emmeans) - -print("\nSimple Effects of DOMAIN within each TIME:") -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) - -print("\nSimple Effects of TIME within each DOMAIN:") -time_domain_simple2 <- 
pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("\n=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print("Estimated Marginal Means:") -print(time_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each TIME:") -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) - -print("\nSimple Effects of TIME within each INTERVAL:") -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("\n=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(domain_interval_emmeans) - -print("\nSimple Effects of INTERVAL within each DOMAIN:") -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) - -print("\nSimple Effects of DOMAIN within each INTERVAL:") -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("\n=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print("Estimated Marginal Means:") -print(temporal_interval_do_emmeans) - -print("\nSimple Effects of interval_DO within each temporal_DO:") -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) - -print("\nSimple Effects of temporal_DO within each interval_DO:") -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("\n=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each DOMAIN × INTERVAL combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("\n=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each temporal_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of temporal_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons 
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print("Significant pairwise comparisons (p < 0.05):") - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print("Cohen's d results:") - print(cohens_d_results) - } - } else { - print("No significant pairwise comparisons found.") - } -} - -# Calculate Cohen's d for main effects -print("Cohen's d for significant main effects:") -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -print("\nCohen's d for significant two-way interactions:") -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -print("\nCohen's d for significant three-way interaction effects:") -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for 
Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print("Three-way interaction Cohen's d results:") - print(three_way_cohens_d) - } -} else { - print("No significant TIME effects found within any DOMAIN × INTERVAL combination.") -} - -print("=== ANALYSIS COMPLETE ===") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006192716.r b/.history/eohi2/mixed anova - DGEN_20251006192716.r deleted file mode 100644 index 41f6a3a..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006192716.r +++ /dev/null @@ -1,698 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "DGEN_5.10past_Pref", "DGEN_5.10past_Pers", "DGEN_5.10past_Val", - "DGEN_5.10fut_Pref", "DGEN_5.10fut_Pers", "DGEN_5.10fut_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 
3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - 
sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, 
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") 
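For reference, the bootstrap F-max helper defined earlier in this file can be restated compactly in base R. The sketch below (fmax_boot is an illustrative name, not part of the script) uses split() so that each group stays a plain numeric vector throughout, which keeps the NA filtering and sample() steps unambiguous when the input is a tibble such as the one pick() supplies:

# Compact restatement of the bootstrap F-max procedure (illustrative).
fmax_boot <- function(data, group_var, response_var, n_iter = 1000) {
  # split() yields one plain numeric vector per group
  groups <- split(data[[response_var]], data[[group_var]])
  groups <- lapply(groups, function(x) x[!is.na(x)])
  obs <- vapply(groups, var, numeric(1))
  ratios <- replicate(n_iter, {
    v <- vapply(groups, function(x) var(sample(x, replace = TRUE)),
                numeric(1))
    max(v) / min(v)
  })
  ratios <- ratios[is.finite(ratios)]   # drop degenerate resamples
  list(observed_ratio = max(obs) / min(obs),
       critical_95 = quantile(ratios, 0.95, na.rm = TRUE))
}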
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("\n=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print("Estimated Marginal Means:") -print(temporal_interval_do_emmeans) - -print("\nSimple Effects of interval_DO within each temporal_DO:") -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) - -print("\nSimple Effects of temporal_DO within each interval_DO:") -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("\n=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print("Estimated Marginal Means:") -print(three_way_emmeans) - -print("\nSimple Effects of TIME within each DOMAIN × INTERVAL combination:") -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("\n=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print("Estimated Marginal Means:") -print(temporal_time_emmeans) - -print("\nSimple Effects of TIME within each temporal_DO:") -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) - -print("\nSimple Effects of temporal_DO within each TIME:") -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print("Significant pairwise comparisons (p < 0.05):") - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # 
Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print("Cohen's d results:") - print(cohens_d_results) - } - } else { - print("No significant pairwise comparisons found.") - } -} - -# Calculate Cohen's d for main effects -print("Cohen's d for significant main effects:") -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -print("\nCohen's d for significant two-way interactions:") -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -print("\nCohen's d for significant three-way interaction effects:") -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - 
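# A note on effsize::cohen.d before the call below: given two numeric vectors
# it returns the standardized mean difference plus a qualitative magnitude
# label ("negligible" through "large"). Because TIME is a within-subjects
# factor, a paired d on participant-matched vectors may be preferable to the
# unpaired call this script uses; a minimal sketch with hypothetical toy
# vectors, matched by pID order:
past_toy <- c(3.2, 4.1, 2.8, 3.7)   # hypothetical Past scores
fut_toy  <- c(2.9, 3.6, 3.0, 3.1)   # hypothetical Future scores, same pIDs
cohen.d(past_toy, fut_toy, paired = TRUE)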
cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print("Three-way interaction Cohen's d results:") - print(three_way_cohens_d) - } -} else { - print("No significant TIME effects found within any DOMAIN × INTERVAL combination.") -} - -print("=== ANALYSIS COMPLETE ===") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006192735.r b/.history/eohi2/mixed anova - DGEN_20251006192735.r deleted file mode 100644 index 2d69e65..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006192735.r +++ /dev/null @@ -1,685 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "DGEN_5.10past_Pref", "DGEN_5.10past_Pers", "DGEN_5.10past_Val", - "DGEN_5.10fut_Pref", "DGEN_5.10fut_Pers", "DGEN_5.10fut_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = 
factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- 
long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / 
min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed 
ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) 
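# afex is loaded above "for aov_ez (cleaner ANOVA output)" but never called. A
# hedged sketch of the equivalent afex fit on the same long-format data; by
# default aov_ez applies type-3 sums of squares and reports Greenhouse-Geisser
# corrected tests:
afex_model <- afex::aov_ez(
  id      = "pID",
  dv      = "DGEN_SCORE",
  data    = long_data_clean,
  between = c("temporal_DO", "interval_DO"),
  within  = c("TIME", "DOMAIN", "INTERVAL"),
  type    = 3
)
# emmeans() also accepts afex objects directly, which would avoid refitting
# the separate aov() model used here for the post-hoc comparisons.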
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, 
by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print("Significant pairwise comparisons (p < 0.05):") - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print("Cohen's d results:") - print(cohens_d_results) - } - } else { - print("No significant pairwise comparisons found.") - } -} - -# Calculate Cohen's d for main effects -print("Cohen's d for significant main effects:") -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -print("\nCohen's d for significant two-way interactions:") -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -print("\nCohen's d for significant three-way interaction effects:") -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- 
three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print("Three-way interaction Cohen's d results:") - print(three_way_cohens_d) - } -} else { - print("No significant TIME effects found within any DOMAIN × INTERVAL combination.") -} - -print("=== ANALYSIS COMPLETE ===") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006192745.r b/.history/eohi2/mixed anova - DGEN_20251006192745.r deleted file mode 100644 index e06ac4c..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006192745.r +++ /dev/null @@ -1,684 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "DGEN_5.10past_Pref", "DGEN_5.10past_Pers", "DGEN_5.10past_Val", - "DGEN_5.10fut_Pref", "DGEN_5.10fut_Pers", "DGEN_5.10fut_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", 
paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each 
TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, 
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") 
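# effectsize is loaded above "for effect size calculations" but unused. A
# minimal sketch of converting a single F-test into partial eta squared with
# it, using hypothetical values (the real F and dfs come from the ANOVA
# table); note that the "ges" column printed earlier is already generalized
# eta squared, the usual choice for mixed designs:
F_to_eta2(f = 6.3, df = 2, df_error = 148)   # hypothetical F(2, 148)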
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == 
level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print("Cohen's d results:") - print(cohens_d_results) - } - } else { - print("No significant pairwise comparisons found.") - } -} - -# Calculate Cohen's d for main effects -print("Cohen's d for significant main effects:") -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -print("\nCohen's d for significant two-way interactions:") -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -print("\nCohen's d for significant three-way interaction effects:") -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - 
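# A hedged aside on the Past-vs-Future d values computed in the loop above:
# cohen.d(past_data, future_data) treats the two TIME conditions as
# independent samples, yet every participant contributes to both. A
# participant-matched alternative, sketched as a hypothetical helper on the
# same data frame:
paired_d_sketch <- function(dat, domain_level, interval_level) {
  wide <- dat %>%
    filter(DOMAIN == domain_level, INTERVAL == interval_level) %>%
    select(pID, TIME, DGEN_SCORE) %>%
    tidyr::pivot_wider(names_from = TIME, values_from = DGEN_SCORE) %>%
    filter(!is.na(Past), !is.na(Future))
  cohen.d(wide$Past, wide$Future, paired = TRUE)   # rows matched by pID
}
# e.g. paired_d_sketch(long_data_clean, "Values", "5")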
if(nrow(three_way_cohens_d) > 0) { - print("Three-way interaction Cohen's d results:") - print(three_way_cohens_d) - } -} else { - print("No significant TIME effects found within any DOMAIN × INTERVAL combination.") -} - -print("=== ANALYSIS COMPLETE ===") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006192802.r b/.history/eohi2/mixed anova - DGEN_20251006192802.r deleted file mode 100644 index 249e41e..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006192802.r +++ /dev/null @@ -1,680 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "DGEN_5.10past_Pref", "DGEN_5.10past_Pers", "DGEN_5.10past_Val", - "DGEN_5.10fut_Pref", "DGEN_5.10fut_Pers", "DGEN_5.10fut_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - 
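# The filter below drops rows with missing scores, which keeps the long table
# clean; but a participant missing even one of the 18 DGEN cells then becomes
# incomplete across the within-subjects design. The design-balance check
# further below reports whether that happened.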
filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently 
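# Each iteration below resamples every group with replacement at its observed
# size and records max(var)/min(var); the 95th percentile of those ratios then
# serves as a data-driven critical value for the observed F-max. One caution
# about the "Get group data once" step above: because pick() hands this helper
# a tibble, data[data[[group_var]] == .x, response_var] returns a one-column
# tibble rather than a numeric vector, so sample() and var() do not receive
# what they expect; a safer extraction is
# data[[response_var]][data[[group_var]] == .x].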
- sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, 
interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- 
emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate 
Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - 
future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print("Three-way interaction Cohen's d results:") - print(three_way_cohens_d) - } -} else { - print("No significant TIME effects found within any DOMAIN × INTERVAL combination.") -} - -print("=== ANALYSIS COMPLETE ===") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006192817.r b/.history/eohi2/mixed anova - DGEN_20251006192817.r deleted file mode 100644 index 43d0610..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006192817.r +++ /dev/null @@ -1,677 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "DGEN_5.10past_Pref", "DGEN_5.10past_Pers", "DGEN_5.10past_Val", - "DGEN_5.10fut_Pref", "DGEN_5.10fut_Pers", "DGEN_5.10fut_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, 
TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# 
============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - 
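# The explicit dplyr::/rlang:: prefixes and !!rlang::sym(group_var) are what
# let this helper group by a column whose name arrives as a character string;
# a more modern equivalent would be dplyr::group_by(.data[[group_var]]), which
# avoids the unquoting step entirely.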
dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# 
============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects 
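# A minimal standalone sketch of what the corrected F-tests above compute: a
# Greenhouse-Geisser p-value is just the uncorrected F re-evaluated with both
# degrees of freedom shrunk by epsilon. The numbers below are hypothetical
# illustrations, not values from these data.
gg_example_F <- 4.20   # hypothetical uncorrected F
gg_example_dfn <- 2    # hypothetical numerator df
gg_example_dfd <- 196  # hypothetical denominator df
gg_example_eps <- 0.85 # hypothetical GG epsilon
gg_example_p <- pf(gg_example_F,
                   gg_example_dfn * gg_example_eps,
                   gg_example_dfd * gg_example_eps,
                   lower.tail = FALSE)
print(round(gg_example_p, 5))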
-print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME 
INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) 
-significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006192843.r b/.history/eohi2/mixed anova - DGEN_20251006192843.r deleted file mode 100644 index a76ed71..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006192843.r +++ /dev/null @@ -1,751 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "DGEN_5.10past_Pref", "DGEN_5.10past_Pers", "DGEN_5.10past_Val", - "DGEN_5.10fut_Pref", "DGEN_5.10fut_Pers", "DGEN_5.10fut_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable 
mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - 
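# As with past_future_diffs above, the summarise step below assumes exactly
# one DGEN_SCORE per participant in each INTERVAL cell; if a cell ever held
# zero or multiple rows, DGEN_SCORE[INTERVAL == "5"] would not yield a single
# value and the computed differences would be malformed (newer dplyr versions
# warn or expand rows in that case).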
group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, 
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") 
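[Editorial sketch, not part of the deleted script.] The corrected F-tests assembled above leave the F statistic unchanged and rescale both the numerator and denominator degrees of freedom by the epsilon estimate, so the corrected p-value can be recomputed directly from the F distribution. A minimal standalone illustration in base R (all numbers are made up, not values from this dataset; the result should agree with the p[GG] column that ezANOVA reports for a matching effect):

# Sketch: Greenhouse-Geisser-corrected p-value from epsilon-scaled df
F_obs <- 4.2    # uncorrected F for a within-subjects effect (illustrative)
DFn   <- 2      # uncorrected numerator df
DFd   <- 198    # uncorrected denominator df
GGe   <- 0.85   # Greenhouse-Geisser epsilon (illustrative)
p_GG  <- pf(F_obs, DFn * GGe, DFd * GGe, lower.tail = FALSE)
p_GG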
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == 
level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# 
============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - -print("=== INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for temporal_DO × TIME × INTERVAL -emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way_plot <- emm_3way_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006192917.r b/.history/eohi2/mixed anova - DGEN_20251006192917.r deleted file mode 100644 index a76ed71..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006192917.r +++ /dev/null @@ -1,751 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set 
contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "DGEN_5.10past_Pref", "DGEN_5.10past_Pers", "DGEN_5.10past_Val", - "DGEN_5.10fut_Pref", "DGEN_5.10fut_Pers", "DGEN_5.10fut_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") 
-print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, 
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") 
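[Editorial sketch, not part of the deleted script.] The bootstrap_hartley_critical() function defined earlier in this file resamples each group with replacement, recomputes the max/min variance ratio on every iteration, and takes the 95th percentile of those ratios as the critical value against which the observed ratio is compared. The same idea in compressed, self-contained form (two simulated groups; the sizes, standard deviations, and seed are arbitrary):

# Sketch: bootstrap critical value for the F-max (max/min variance) ratio
set.seed(1)
g1 <- rnorm(60, sd = 1.0)   # toy group 1
g2 <- rnorm(60, sd = 1.3)   # toy group 2
obs_ratio <- max(var(g1), var(g2)) / min(var(g1), var(g2))
boot_ratios <- replicate(1000, {
  v1 <- var(sample(g1, replace = TRUE))
  v2 <- var(sample(g2, replace = TRUE))
  max(v1, v2) / min(v1, v2)
})
crit_95 <- quantile(boot_ratios, 0.95)
obs_ratio > crit_95   # TRUE would flag heterogeneous variances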
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == 
level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# 
============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - -print("=== INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for temporal_DO × TIME × INTERVAL -emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way_plot <- emm_3way_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006192940.r b/.history/eohi2/mixed anova - DGEN_20251006192940.r deleted file mode 100644 index a76ed71..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006192940.r +++ /dev/null @@ -1,751 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set 
contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "DGEN_5.10past_Pref", "DGEN_5.10past_Pers", "DGEN_5.10past_Val", - "DGEN_5.10fut_Pref", "DGEN_5.10fut_Pers", "DGEN_5.10fut_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") 
-print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, 
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") 
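# --- Editor's aside, not part of the original snapshot: the Greenhouse-Geisser
# correction reported above keeps the observed F statistic and multiplies both
# degrees of freedom by the epsilon estimate, so the corrected p-values in
# corrected_results can be sanity-checked with base pf(). This assumes the
# sphericity-corrections block above actually ran (i.e., corrected_results exists).
gg_check <- with(corrected_results, pf(Original_F, GG_DFn, GG_DFd, lower.tail = FALSE))
print(data.frame(Effect = corrected_results$Effect,
                 GG_p = round(corrected_results$GG_p, 5),
                 GG_check = round(gg_check, 5)))
# The Huynh-Feldt p-values can be checked the same way with HF_DFn and HF_DFd.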
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == 
level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# 
============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - -print("=== INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for temporal_DO × TIME × INTERVAL -emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way_plot <- emm_3way_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006193221.r b/.history/eohi2/mixed anova - DGEN_20251006193221.r deleted file mode 100644 index 5006e8f..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006193221.r +++ /dev/null @@ -1,747 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set 
contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 6), rep("Future", 6)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 4), - INTERVAL = rep(c("5", "5", "5", "10", "10", "10"), 2), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) 
%>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Bootstrap function for Hartley's F-max test (extracts each group's data once and pre-allocates storage) -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get each group's data once, as a plain numeric vector (double-bracket indexing keeps this a vector even when a tibble is passed in, e.g. from pick()) - group_data_list <- map(groups, ~ { - group_data <- data[[response_var]][data[[group_var]] == .x] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, 
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") 
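# --- Editor's aside, not part of the original snapshot: because ezANOVA was run
# with detailed = TRUE, the ANOVA table above carries sums of squares (SSn, SSd),
# so partial eta squared could be reported alongside the generalized eta squared
# (ges) if a second effect-size metric is wanted. A minimal, non-mutating sketch
# using the anova_output table defined above:
pes_table <- transform(anova_output, pes = SSn / (SSn + SSd))
print(pes_table[, c("Effect", "ges", "pes")])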
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == 
level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# 
============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - -print("=== INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for temporal_DO × TIME × INTERVAL -emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way_plot <- emm_3way_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006193234.r b/.history/eohi2/mixed anova - DGEN_20251006193234.r deleted file mode 100644 index d71b975..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006193234.r +++ /dev/null @@ -1,747 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set 
contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 6), rep("Future", 6)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 4), - INTERVAL = rep(c("5", "5", "5", "10", "10", "10"), 2), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - 
summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Bootstrap function for Hartley's F-max test (extracts each group's data once and pre-allocates storage) -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get each group's data once, as a plain numeric vector (double-bracket indexing keeps this a vector even when a tibble is passed in, e.g. from pick()) - group_data_list <- map(groups, ~ { - group_data <- data[[response_var]][data[[group_var]] == .x] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check that the design is balanced (every participant contributes exactly one observation per cell) -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance == 1)) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (2 levels: 5, 10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, 
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") 
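-# --- Illustrative sketch (editor's addition, not from the original script) ---
-# Before the simple effects below, an interaction contrast tests the
-# "difference of differences" directly: does the Past-vs-Future gap itself
-# change across INTERVAL levels? Assumes `aov_model` from the post-hoc
-# section above; contrast() with interaction = "pairwise" crosses pairwise
-# contrasts on each factor.
-time_interval_interaction <- contrast(
-  emmeans(aov_model, ~ TIME * INTERVAL),
-  interaction = "pairwise",
-  adjust = "bonferroni"
-)
-print(time_interval_interaction)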
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == 
level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# 
============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - -print("=== INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for temporal_DO × TIME × INTERVAL -emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way_plot <- emm_3way_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006193243.r b/.history/eohi2/mixed anova - DGEN_20251006193243.r deleted file mode 100644 index 0e107cd..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006193243.r +++ /dev/null @@ -1,747 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set 
contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 6), rep("Future", 6)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 4), - INTERVAL = rep(c("5", "5", "5", "10", "10", "10"), 2), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - 
summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Bootstrap function for Hartley's F-max test (pre-allocates storage and reuses group vectors for speed) -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get the unique groups - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Extract each group's response values once, as plain numeric vectors - # (indexing a tibble as data[rows, col] returns a one-column tibble, which sample() cannot resample) - group_data_list <- map(groups, ~ { - group_data <- data[[response_var]][data[[group_var]] == .x] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check that each participant has exactly one observation per TIME × DOMAIN × INTERVAL cell -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance == 1)) { - print("Design is complete and balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is incomplete or unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (2 levels: 5, 10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, 
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") 
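-# --- Illustrative sketch (editor's addition, not from the original script) ---
-# A quick visual check of this interaction: emmip() plots estimated marginal
-# means straight from the model object. The formula reads trace.factor ~
-# x.factor, so this draws one line per TIME level across INTERVAL, with
-# CIs = TRUE adding confidence intervals. Assumes `aov_model` exists.
-print(emmip(aov_model, TIME ~ INTERVAL, CIs = TRUE))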
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == 
level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# 
============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - -print("=== INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for temporal_DO × TIME × INTERVAL -emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way_plot <- emm_3way_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), # this script's design has only the 5- and 10-year intervals - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006193252.r b/.history/eohi2/mixed anova - DGEN_20251006193252.r deleted file mode 100644 index e780b11..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006193252.r +++ /dev/null @@ -1,747 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set 
contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 6), rep("Future", 6)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 4), - INTERVAL = rep(c("5", "5", "5", "10", "10", "10"), 2), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - 
summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# Bootstrap function for Hartley's F-max test (pre-allocates storage and reuses group vectors for speed) -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get the unique groups - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Extract each group's response values once, as plain numeric vectors - # (indexing a tibble as data[rows, col] returns a one-column tibble, which sample() cannot resample) - group_data_list <- map(groups, ~ { - group_data <- data[[response_var]][data[[group_var]] == .x] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check that each participant has exactly one observation per TIME × DOMAIN × INTERVAL cell -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance == 1)) { - print("Design is complete and balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is incomplete or unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (2 levels: 5, 10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, 
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") 
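-# --- Illustrative sketch (editor's addition, not from the original script) ---
-# joint_tests() summarises omnibus F tests for each term; with by = "TIME" it
-# gives simple-effects F tests, i.e., whether INTERVAL matters at each TIME
-# level, before reading the pairwise contrasts below. Assumes `aov_model`
-# from the post-hoc section above; multi-stratum aov support may vary.
-print(joint_tests(aov_model, by = "TIME"))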
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-print("=== POST-HOC COMPARISONS ===")
-
-# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output)
-aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)),
-                 data = long_data_clean)
-
-# Main effects
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print(time_emmeans)
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-print("Main Effect of DOMAIN:")
-domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
-print(domain_emmeans)
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-print("Main Effect of INTERVAL:")
-interval_emmeans <- emmeans(aov_model, ~ INTERVAL)
-print(interval_emmeans)
-interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni")
-print(interval_contrasts)
-
-print("Main Effect of temporal_DO:")
-temporal_emmeans <- emmeans(aov_model, ~ temporal_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-print("Main Effect of interval_DO:")
-interval_do_emmeans <- emmeans(aov_model, ~ interval_DO)
-interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni")
-print(interval_do_contrasts)
-
-# =============================================================================
-# TWO-WAY INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TIME × DOMAIN Interaction
-print("=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN)
-print(time_domain_emmeans)
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(time_domain_simple2)
-
-# TIME × INTERVAL Interaction
-print("=== TIME × INTERVAL INTERACTION ===")
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL)
-print(time_interval_emmeans)
-time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_interval_simple)
-time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni")
-print(time_interval_simple2)
-
-# DOMAIN × INTERVAL Interaction
-print("=== DOMAIN × INTERVAL INTERACTION ===")
-domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL)
-print(domain_interval_emmeans)
-domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(domain_interval_simple)
-domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni")
-print(domain_interval_simple2)
-
-# Between-subjects interactions
-print("=== temporal_DO × interval_DO INTERACTION ===")
-temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO)
-print(temporal_interval_do_emmeans)
-temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni")
-print(temporal_interval_do_simple)
-temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni")
-print(temporal_interval_do_simple2)
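# The Bonferroni adjustment requested in every pairs() call above is simply
# each unadjusted p-value multiplied by the number of comparisons in the
# family, capped at 1 (choose(k, 2) comparisons for all pairwise contrasts
# of k levels). A quick base-R check with made-up p-values:
k       <- 3                          # e.g., the three DOMAIN levels
m       <- choose(k, 2)               # 3 pairwise comparisons
p_unadj <- c(0.004, 0.020, 0.300)     # illustrative values only
p_bonf  <- pmin(1, p_unadj * m)       # same as p.adjust(p_unadj, "bonferroni")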
-
-# =============================================================================
-# THREE-WAY INTERACTION ANALYSES
-# =============================================================================
-
-# TIME × DOMAIN × INTERVAL Interaction
-print("=== TIME × DOMAIN × INTERVAL INTERACTION ===")
-three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL)
-print(three_way_emmeans)
-three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni")
-print(three_way_contrasts)
-
-# Between-subjects × within-subjects interactions
-print("=== temporal_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME)
-print(temporal_time_emmeans)
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# =============================================================================
-# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS
-# =============================================================================
-
-print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===")
-
-# Function to calculate Cohen's d for pairwise comparisons
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if(nrow(significant_pairs) > 0) {
-    print(significant_pairs)
-
-    # Calculate Cohen's d for all significant pairs
-    cohens_d_results <- data.frame()
-
-    for(i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- as.character(comparison$contrast)
-
-      # Parse the contrast
-      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-      if(length(contrast_parts) == 2) {
-        level1 <- trimws(contrast_parts[1])
-        level2 <- trimws(contrast_parts[2])
-
-        # Get raw data for both conditions
-        if(!is.null(group2_var) && group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][
-            data[[group1_var]] == level1 &
-            data[[group2_var]] == group2_level]
-
-          data2 <- data[[response_var]][
-            data[[group1_var]] == level2 &
-            data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if(length(data1) > 0 && length(data2) > 0) {
-          # Calculate Cohen's d using effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          result_row <- data.frame(
-            Comparison = contrast_name,
-            n1 = length(data1),
-            n2 = length(data2),
-            Cohens_d = round(cohens_d_result$estimate, 5),
-            Effect_size = cohens_d_result$magnitude,
-            p_value = round(comparison$p.value, 5)
-          )
-
-          if(!is.null(group2_var) && group2_var %in% colnames(comparison)) {
-            result_row$Group_level <- group2_level
-          }
-
-          cohens_d_results <- rbind(cohens_d_results, result_row)
-        }
-      }
-    }
-
-    if(nrow(cohens_d_results) > 0) {
-      print(cohens_d_results)
-    }
-  } else {
-    cat("No significant pairwise comparisons found.\n")
-  }
-}
-
-# Calculate Cohen's d for main effects
-time_contrasts_df <- as.data.frame(time_contrasts)
-calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE")
-
-domain_contrasts_df <- as.data.frame(domain_contrasts)
-calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE")
-
-interval_contrasts_df <- as.data.frame(interval_contrasts)
-calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE")
-
-# Calculate Cohen's d for two-way interactions
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE")
-
-time_domain_simple2_df <- as.data.frame(time_domain_simple2)
-calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE")
-
-# Calculate Cohen's d for three-way interaction
-three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ]
-
-if(nrow(significant_three_way) > 0) {
-  three_way_cohens_d <- data.frame()
-
-  for(i in seq_len(nrow(significant_three_way))) {
-    comparison <- significant_three_way[i, ]
-
-    # Extract the grouping variables
-    domain_level <- as.character(comparison$DOMAIN)
-    interval_level <- as.character(comparison$INTERVAL)
-
-    # Get data for Past and Future within this DOMAIN × INTERVAL combination
-    past_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$INTERVAL == interval_level &
-      long_data_clean$TIME == "Past"
-    ]
-
-    future_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$INTERVAL == interval_level &
-      long_data_clean$TIME == "Future"
-    ]
-
-    if(length(past_data) > 0 && length(future_data) > 0) {
-      # Calculate Cohen's d using effsize package
-      cohens_d_result <- cohen.d(past_data, future_data)
-
-      result_row <- data.frame(
-        DOMAIN = domain_level,
-        INTERVAL = interval_level,
-        Comparison = "Past vs Future",
-        n_Past = length(past_data),
-        n_Future = length(future_data),
-        Cohens_d = round(cohens_d_result$estimate, 5),
-        Effect_size = cohens_d_result$magnitude,
-        p_value = round(comparison$p.value, 5),
-        Estimated_difference = round(comparison$estimate, 5)
-      )
-
-      three_way_cohens_d <- rbind(three_way_cohens_d, result_row)
-    }
-  }
-
-  if(nrow(three_way_cohens_d) > 0) {
-    print(three_way_cohens_d)
-  }
-} else {
-  cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n")
-}
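# cohen.d(), as called above with two unpaired vectors, defaults to the
# pooled-standard-deviation form of Cohen's d: d = (m1 - m2) / s_pooled.
# A minimal base-R equivalent (x and y are placeholders, not objects from
# the script); note that because the script passes within-subject conditions
# as two independent vectors, this is the unpaired form of d.
cohens_d_manual <- function(x, y) {
  n1 <- length(x)
  n2 <- length(y)
  s_pooled <- sqrt(((n1 - 1) * var(x) + (n2 - 1) * var(y)) / (n1 + n2 - 2))
  (mean(x) - mean(y)) / s_pooled
}
# Example usage: cohens_d_manual(rnorm(50, mean = 1), rnorm(50))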
-
-# =============================================================================
-# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (Emmeans only)
-# =============================================================================
-
-print("=== INTERACTION PLOT ===")
-
-# Define color palette for TIME
-time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F")
-
-# Create emmeans for temporal_DO × TIME × INTERVAL
-emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL)
-
-# Prepare emmeans data frame
-emmeans_3way_plot <- emm_3way_plot %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")),
-    TIME = factor(TIME, levels = c("Past", "Future")),
-    INTERVAL = factor(INTERVAL, levels = c("5", "10")),
-    x_pos = as.numeric(temporal_DO),
-    time_offset = (as.numeric(TIME) - 1.5) * 0.2,
-    x_dodged = x_pos + time_offset
-  )
-
-# Create 3-way interaction plot with facets
-interaction_plot_3way <- ggplot(emmeans_3way_plot) +
-  geom_errorbar(
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME),
-    width = 0.1,
-    linewidth = 1,
-    alpha = 0.8
-  ) +
-  geom_point(
-    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
-    size = 5,
-    stroke = 1.2,
-    color = "black"
-  ) +
-  facet_wrap(~INTERVAL, nrow = 1) +
-  labs(
-    x = "Order",
-    y = "Mean absolute difference from present",
-    title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2),
-    labels = c("Past First", "Future First"),
-    limits = c(0.5, 2.5)
-  ) +
-  scale_color_manual(name = "Temporal Direction", values = time_colors) +
-  scale_fill_manual(name = "Temporal Direction", values = time_colors) +
-  scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5),
-    strip.text = element_text(size = 11, face = "bold")
-  )
-
-print(interaction_plot_3way)
\ No newline at end of file
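# The manual x_pos/time_offset/x_dodged arithmetic above reproduces what
# ggplot2's position_dodge() does on a discrete axis. A self-contained
# sketch of the same layout with built-in dodging (df is a stand-in with
# the same columns as emmeans_3way_plot):
library(ggplot2)
df <- data.frame(
  temporal_DO = rep(c("01PAST", "02FUT"), each = 4),
  TIME        = rep(c("Past", "Future"), 4),
  INTERVAL    = rep(c("5", "10"), each = 2, times = 2),
  plot_mean   = runif(8, 1, 2)
)
df$ci_lower <- df$plot_mean - 0.2
df$ci_upper <- df$plot_mean + 0.2
ggplot(df, aes(x = temporal_DO, y = plot_mean, color = TIME)) +
  geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper), width = 0.1,
                position = position_dodge(width = 0.4)) +
  geom_point(position = position_dodge(width = 0.4), size = 4) +
  facet_wrap(~INTERVAL, nrow = 1)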
diff --git a/.history/eohi2/mixed anova - DGEN_20251006193310.r b/.history/eohi2/mixed anova - DGEN_20251006193310.r
deleted file mode 100644
index e780b11..0000000
--- a/.history/eohi2/mixed anova - DGEN_20251006193310.r
+++ /dev/null
diff --git a/.history/eohi2/mixed anova - DGEN_20251006193311.r b/.history/eohi2/mixed anova - DGEN_20251006193311.r
deleted file mode 100644
index e780b11..0000000
--- a/.history/eohi2/mixed anova - DGEN_20251006193311.r
+++ /dev/null
============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - -print("=== INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for temporal_DO × TIME × INTERVAL -emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way_plot <- emm_3way_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006193509.r b/.history/eohi2/mixed anova - DGEN_20251006193509.r deleted file mode 100644 index e780b11..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006193509.r +++ /dev/null @@ -1,747 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum 
for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 6), rep("Future", 6)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 4), - INTERVAL = rep(c("5", "5", "5", "10", "10", "10"), 2), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - 
past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (2 levels: 5, 10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, 
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") 
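# A note on the Bonferroni families in the simple-effect tables throughout
# this script: with pairs(), the by= argument defines the family over which
# the adjustment is applied. The call below is the explicit equivalent of
# pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") from above;
# each raw p-value is multiplied by the 3 DOMAIN comparisons within its TIME
# level (capped at 1), not by all 6 contrasts across both levels.
contrast(time_domain_emmeans, method = "pairwise",
         by = "TIME", adjust = "bonferroni")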
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == 
level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# 
============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - -print("=== INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for temporal_DO × TIME × INTERVAL -emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way_plot <- emm_3way_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006193537.r b/.history/eohi2/mixed anova - DGEN_20251006193537.r deleted file mode 100644 index 94424de..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006193537.r +++ /dev/null @@ -1,771 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum 
for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 6), rep("Future", 6)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 4), - INTERVAL = rep(c("5", "5", "5", "10", "10", "10"), 2), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - 
past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (2 levels: 5, 10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, 
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") 
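# A caution for the explicit TIME × INTERVAL contrasts added later in this
# snapshot: contrast() coefficient vectors index the rows of the emmeans grid
# in its printed order, and emmeans lists the first factor in the specs
# varying fastest, so for ~ TIME * INTERVAL the cells run (Past,5),
# (Future,5), (Past,10), (Future,10). Under that ordering, "5: Past vs
# Future" would be c(1, -1, 0, 0), not c(1, 0, -1, 0). A minimal,
# order-independent sketch using the same emmeans API (tie is a hypothetical
# name for the grid the snapshot builds below):
tie <- emmeans(aov_model, ~ TIME * INTERVAL)
as.data.frame(tie)[, c("TIME", "INTERVAL")]  # always confirm the row order
pairs(tie, by = "INTERVAL", adjust = "none") # Past vs Future within 5 and 10
contrast(tie, interaction = "pairwise")      # (Past - Future) at 5 vs at 10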
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# EXPLICIT CONTRASTS FOR TIME × INTERVAL -print("=== EXPLICIT TIME × INTERVAL CONTRASTS ===") - -# 1. 5past vs 5future -print("1. Interval 5: Past vs Future") -contrast_5past_5future <- contrast(time_interval_emmeans, - list("5: Past vs Future" = c(1, 0, -1, 0)), - adjust = "none") -print(contrast_5past_5future) - -# 2. 10past vs 10future -print("2. Interval 10: Past vs Future") -contrast_10past_10future <- contrast(time_interval_emmeans, - list("10: Past vs Future" = c(0, 1, 0, -1)), - adjust = "none") -print(contrast_10past_10future) - -# 3. (5-10)past vs (5-10)future - tests if the interval effect differs by time -print("3. Interaction contrast: (5-10)Past vs (5-10)Future") -contrast_interaction <- contrast(time_interval_emmeans, - list("(5-10)Past vs (5-10)Future" = c(1, -1, -1, 1)), - adjust = "none") -print(contrast_interaction) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- 
pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME 
== "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# ============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - -print("=== INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for temporal_DO × TIME × INTERVAL -emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way_plot <- emm_3way_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006193540.r b/.history/eohi2/mixed anova - DGEN_20251006193540.r deleted file mode 100644 index e780b11..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006193540.r +++ /dev/null @@ -1,747 +0,0 @@ -# Mixed ANOVA 
Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 6), rep("Future", 6)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 4), - INTERVAL = rep(c("5", "5", "5", "10", "10", "10"), 2), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) 
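# A sketch of a more compact equivalent, assuming the same long_data object
# (desc_stats_compact is an illustrative name, not in the original script):
# compute the raw statistics once, then round every numeric column in a single
# across() pass, mirroring the rounding pattern this script already uses for
# normality_results further below.
desc_stats_compact <- long_data %>%
  group_by(TIME, DOMAIN, INTERVAL) %>%
  summarise(
    n = n(),
    mean = mean(DGEN_SCORE, na.rm = TRUE),
    variance = var(DGEN_SCORE, na.rm = TRUE),
    sd = sd(DGEN_SCORE, na.rm = TRUE),
    median = median(DGEN_SCORE, na.rm = TRUE),
    q1 = quantile(DGEN_SCORE, 0.25, na.rm = TRUE),
    q3 = quantile(DGEN_SCORE, 0.75, na.rm = TRUE),
    min = min(DGEN_SCORE, na.rm = TRUE),
    max = max(DGEN_SCORE, na.rm = TRUE),
    .groups = 'drop'
  ) %>%
  mutate(across(where(is.numeric), ~ round(.x, 5)))  # rounding the integer n is a no-op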
- -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 
<- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced (every participant must have exactly one observation per cell) -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance == 1)) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (2 levels: 5, 10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - 
print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# 
============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- 
as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - 
n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# ============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - -print("=== INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for temporal_DO × TIME × INTERVAL -emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way_plot <- emm_3way_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006195204.r b/.history/eohi2/mixed anova - DGEN_20251006195204.r deleted file mode 100644 index afe67a5..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006195204.r +++ /dev/null @@ -1,751 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, 
DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive 
statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced (every participant must have exactly one observation per cell) -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance == 1)) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (2 levels: 5, 10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect,
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") 
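# The by= argument in these pairs() calls slices the contrasts into simple
# effects, and the Bonferroni adjustment is applied within each by-group
# rather than across the full family of contrasts. A minimal, self-contained
# sketch of that convention on a built-in dataset (toy_fit and toy_emm are
# illustrative names; the block is wrapped in if (FALSE) so it does not run
# as part of this script):
if (FALSE) {
  toy_fit <- aov(breaks ~ wool * tension, data = warpbreaks)
  toy_emm <- emmeans(toy_fit, ~ wool * tension)
  pairs(toy_emm, by = "tension", adjust = "bonferroni")  # wool contrast within each tension level (1 test per group)
  pairs(toy_emm, by = "wool", adjust = "bonferroni")     # tension contrasts within each wool level (3 tests per group)
}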
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == 
level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# 
============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - -print("=== INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for temporal_DO × TIME × INTERVAL -emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way_plot <- emm_3way_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006195215.r b/.history/eohi2/mixed anova - DGEN_20251006195215.r deleted file mode 100644 index 6ccf702..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006195215.r +++ /dev/null @@ -1,751 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum 
for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") 
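# A minimal sanity check (sketch): with TIME (2) x DOMAIN (3) x INTERVAL (3),
# each participant should contribute 18 rows after pivoting and NA filtering;
# fewer indicates missing cells. Complements the design-balance check further below.
rows_per_pid <- long_data %>%
  count(pID, name = "n_rows")
print(table(rows_per_pid$n_rows))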
-print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect,
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") 
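# Worked example (sketch, hypothetical numbers only): a Greenhouse-Geisser-
# corrected p-value is the unchanged F statistic re-evaluated against df
# scaled by epsilon, exactly as in the corrected-df table above.
F_obs <- 4.20   # hypothetical F statistic
DFn   <- 2      # hypothetical numerator df
DFd   <- 198    # hypothetical denominator df
GGe   <- 0.85   # hypothetical Greenhouse-Geisser epsilon
p_GG  <- pf(F_obs, GGe * DFn, GGe * DFd, lower.tail = FALSE)
print(round(p_GG, 5))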
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == 
level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# 
============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - -print("=== INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for temporal_DO × TIME × INTERVAL -emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way_plot <- emm_3way_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006195225.r b/.history/eohi2/mixed anova - DGEN_20251006195225.r deleted file mode 100644 index 98db425..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006195225.r +++ /dev/null @@ -1,751 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum 
for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") 
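# A minimal sketch: t-based 95% confidence intervals around each cell mean,
# as a complement to the descriptive tables above and below.
cell_cis <- long_data %>%
  group_by(TIME, DOMAIN, INTERVAL) %>%
  summarise(
    n = sum(!is.na(DGEN_SCORE)),
    mean = mean(DGEN_SCORE, na.rm = TRUE),
    se = sd(DGEN_SCORE, na.rm = TRUE) / sqrt(n),
    ci_lower = mean - qt(0.975, n - 1) * se,
    ci_upper = mean + qt(0.975, n - 1) * se,
    .groups = 'drop'
  )
print(cell_cis)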
-print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, 
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") 
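# A complementary sketch: the difference-of-differences for this interaction
# can also be tested directly with an emmeans interaction contrast.
ti_interaction <- contrast(
  emmeans(aov_model, ~ TIME * INTERVAL),
  interaction = "pairwise",
  adjust = "bonferroni"
)
print(ti_interaction)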
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == 
level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# 
============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - -print("=== INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for temporal_DO × TIME × INTERVAL -emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way_plot <- emm_3way_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006195236.r b/.history/eohi2/mixed anova - DGEN_20251006195236.r deleted file mode 100644 index de27f62..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006195236.r +++ /dev/null @@ -1,751 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum 
for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") 
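# A minimal sketch: sample skewness per cell (base R only), as a rough
# complement to the Anderson-Darling normality tests further below.
skew <- function(x) {
  x <- x[!is.na(x)]
  mean((x - mean(x))^3) / sd(x)^3
}
cell_skew <- long_data %>%
  group_by(TIME, DOMAIN, INTERVAL) %>%
  summarise(skewness = round(skew(DGEN_SCORE), 3), .groups = 'drop')
print(cell_skew)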
-print(desc_stats_by_between)
-
-# Calculate mean differences for key comparisons
-cat("\n=== KEY MEAN DIFFERENCES ===\n")
-
-# Past vs Future differences for each DOMAIN × INTERVAL combination
-past_future_diffs <- long_data %>%
-  group_by(DOMAIN, INTERVAL, pID) %>%
-  summarise(
-    past_score = DGEN_SCORE[TIME == "Past"],
-    future_score = DGEN_SCORE[TIME == "Future"],
-    difference = past_score - future_score,
-    .groups = 'drop'
-  ) %>%
-  group_by(DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    mean_diff = round(mean(difference, na.rm = TRUE), 5),
-    sd_diff = round(sd(difference, na.rm = TRUE), 5),
-    se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5),
-    .groups = 'drop'
-  )
-
-print("Past vs Future differences by DOMAIN × INTERVAL:")
-print(past_future_diffs)
-
-# 5 vs 10 interval differences for each TIME × DOMAIN combination
-interval_diffs <- long_data %>%
-  group_by(TIME, DOMAIN, pID) %>%
-  summarise(
-    interval_5_score = DGEN_SCORE[INTERVAL == "5"],
-    interval_10_score = DGEN_SCORE[INTERVAL == "10"],
-    difference = interval_5_score - interval_10_score,
-    .groups = 'drop'
-  ) %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean_diff = round(mean(difference, na.rm = TRUE), 5),
-    sd_diff = round(sd(difference, na.rm = TRUE), 5),
-    se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5),
-    .groups = 'drop'
-  )
-
-cat("\n5 vs 10 interval differences by TIME × DOMAIN:\n")
-print(interval_diffs)
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing (a no-op safeguard here:
-# long_data was already filtered for missing DGEN_SCORE when it was created)
-long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ]
-
-# 1. Missing values check (will show zero missing given the filter above;
-#    kept for completeness)
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(DGEN_SCORE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME, DOMAIN, and INTERVAL:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    mean = mean(DGEN_SCORE),
-    sd = sd(DGEN_SCORE),
-    q1 = quantile(DGEN_SCORE, 0.25),
-    q3 = quantile(DGEN_SCORE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$DGEN_SCORE)$statistic,
-    ad_p_value = ad.test(.data$DGEN_SCORE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# Round only the numeric columns
-normality_results_rounded <- normality_results %>%
-  mutate(across(where(is.numeric), ~ round(.x, 5)))
-print(normality_results_rounded)
-
-# 4. Homogeneity of variance tests
-# Test homogeneity across TIME within each DOMAIN × INTERVAL combination
-# (passing the group's data explicitly via pick() so leveneTest does not rely
-# on environment lookup inside summarise)
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN, INTERVAL) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ TIME, data = pick(DGEN_SCORE, TIME))$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ TIME, data = pick(DGEN_SCORE, TIME))$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME × INTERVAL combination
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME, INTERVAL) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ DOMAIN, data = pick(DGEN_SCORE, DOMAIN))$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ DOMAIN, data = pick(DGEN_SCORE, DOMAIN))$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:")
-print(homogeneity_domain)
-
-# Test homogeneity across INTERVAL within each TIME × DOMAIN combination
-homogeneity_interval <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ INTERVAL, data = pick(DGEN_SCORE, INTERVAL))$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ INTERVAL, data = pick(DGEN_SCORE, INTERVAL))$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:")
-print(homogeneity_interval)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# More efficient bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get the unique group levels
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!rlang::sym(group_var)) %>%
-    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Extract each group's response values once, as a plain vector
-  # (column-first indexing is safe for both data.frames and tibbles)
-  group_data_list <- map(groups, ~ {
-    group_data <- data[[response_var]][data[[group_var]] == .x]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across between-subjects factors within each within-subjects combination
-cat("\n=== HARTLEY'S F-MAX TEST RESULTS ===\n")
-set.seed(123) # For reproducibility
-
-# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination
-print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:")
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination
-print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:")
-hartley_interval_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant)
-
-print(hartley_interval_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check if the design is balanced: every cell of the pID × TIME × DOMAIN ×
-# INTERVAL table must contain exactly one observation
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL)
-if(all(design_balance == 1)) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = DGEN_SCORE,
-                             wid = pID,
-                             between = .(temporal_DO, interval_DO),
-                             within = .(TIME, DOMAIN, INTERVAL),
-                             type = 3,
-                             detailed = TRUE)
-
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL # Reset row numbers to be sequential
-print(anova_output)
-
-print("Mauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print("Greenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  print("=== CORRECTED DEGREES OF FREEDOM ===")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect,
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") 
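-# Aside on the sphericity-corrected p-values reported above (a sketch;
-# `gg_p_check` is an illustrative name, not part of the original analysis):
-# a Greenhouse-Geisser p-value is just the original F statistic referred to
-# epsilon-scaled degrees of freedom, so it can be recomputed directly and
-# should closely match GG_p in corrected_results.
-if (exists("corrected_results")) {
-  gg_p_check <- pf(corrected_results$Original_F,
-                   corrected_results$GG_DFn,
-                   corrected_results$GG_DFd,
-                   lower.tail = FALSE)
-  print(round(gg_p_check, 5))
-}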
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == 
level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# 
=============================================================================
-# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (Emmeans only)
-# =============================================================================
-
-print("=== INTERACTION PLOT ===")
-
-# Define color palette for TIME
-time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F")
-
-# Create emmeans for temporal_DO × TIME × INTERVAL
-emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL)
-
-# Prepare emmeans data frame
-emmeans_3way_plot <- emm_3way_plot %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")),
-    TIME = factor(TIME, levels = c("Past", "Future")),
-    INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")),
-    x_pos = as.numeric(temporal_DO),
-    time_offset = (as.numeric(TIME) - 1.5) * 0.2,
-    x_dodged = x_pos + time_offset
-  )
-
-# Create 3-way interaction plot with facets
-interaction_plot_3way <- ggplot(emmeans_3way_plot) +
-  geom_errorbar(
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME),
-    width = 0.1,
-    linewidth = 1,
-    alpha = 0.8
-  ) +
-  geom_point(
-    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
-    size = 5,
-    stroke = 1.2,
-    color = "black"
-  ) +
-  facet_wrap(~INTERVAL, nrow = 1) +
-  labs(
-    x = "Order",
-    y = "Mean absolute difference from present",
-    title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2),
-    labels = c("Past First", "Future First"),
-    limits = c(0.5, 2.5)
-  ) +
-  scale_color_manual(name = "Temporal Direction", values = time_colors) +
-  scale_fill_manual(name = "Temporal Direction", values = time_colors) +
-  scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5),
-    strip.text = element_text(size = 11, face = "bold")
-  )
-
-print(interaction_plot_3way)
\ No newline at end of file
diff --git a/.history/eohi2/mixed anova - DGEN_20251006195236.r b/.history/eohi2/mixed anova - DGEN_20251006195256.r deleted file mode 100644 index de27f62..0000000
diff --git a/.history/eohi2/mixed anova - DGEN_20251006195344.r b/.history/eohi2/mixed anova - DGEN_20251006195344.r deleted file mode 100644 index de27f62..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006195344.r +++ /dev/null @@ -1,751 +0,0 @@
-# Mixed ANOVA Analysis for DGEN Variables
-# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors
-# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val,
-#            DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex)       # For aov_ez (cleaner ANOVA output)
-library(nortest)    # For normality tests
-library(emmeans)    # For post-hoc comparisons
-library(purrr)      # For map functions
-library(effsize)    # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set
contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") 
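-# Crossing check for the between-subjects design (a sketch; `cell_counts` is an
-# illustrative name, not part of the original analysis): each temporal_DO ×
-# interval_DO cell should contain a roughly equal number of distinct participants.
-cell_counts <- long_data %>%
-  distinct(pID, temporal_DO, interval_DO) %>%
-  count(temporal_DO, interval_DO)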
-print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
-
-# 4. Homogeneity of variance tests
-# Test homogeneity across TIME within each DOMAIN × INTERVAL combination
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN, INTERVAL) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME × INTERVAL combination
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME, INTERVAL) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:")
-print(homogeneity_domain)
-
-# Test homogeneity across INTERVAL within each TIME × DOMAIN combination
-homogeneity_interval <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:")
-print(homogeneity_interval)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# Bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get the unique groups
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!rlang::sym(group_var)) %>%
-    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Replace invalid (zero or missing) variances with a tiny positive value
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Extract each group's non-missing responses once, as plain numeric vectors
-  group_data_list <- map(groups, ~ {
-    group_data <- data[[response_var]][data[[group_var]] == .x]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Resample each group independently, with replacement
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across between-subjects factors within each within-subjects combination
-cat("\n=== HARTLEY'S F-MAX TEST RESULTS ===\n")
-set.seed(123)  # For reproducibility
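Before bootstrap_hartley_critical() is applied to every design cell below, a minimal call on invented two-group data (toy_groups, grp, and y are all made up) shows its interface; the TRUE/FALSE comparison at the end mirrors the significant column computed below:

set.seed(42)
toy_groups <- data.frame(
  grp = rep(c("A", "B"), each = 40),
  y   = c(rnorm(40, sd = 1), rnorm(40, sd = 2))  # group B has ~4x the variance
)

res <- bootstrap_hartley_critical(toy_groups, group_var = "grp",
                                  response_var = "y", n_iter = 500)
res$observed_ratio                    # observed max/min variance ratio
res$critical_95                       # bootstrap 95th-percentile critical value
res$observed_ratio > res$critical_95  # the flag reported as 'significant' below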
-
-# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination
-print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:")
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination
-print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:")
-hartley_interval_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant)
-
-print(hartley_interval_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check that no participant contributes more than one observation per cell
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL)
-if(all(design_balance %in% c(0, 1))) {
-  print("Design check passed: at most one observation per participant in each TIME × DOMAIN × INTERVAL cell")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = DGEN_SCORE,
-                             wid = pID,
-                             between = .(temporal_DO, interval_DO),
-                             within = .(TIME, DOMAIN, INTERVAL),
-                             type = 3,
-                             detailed = TRUE)
-
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL  # Reset row numbers to be sequential
-print(anova_output)
-
-print("Mauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print("Greenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  print("=== CORRECTED DEGREES OF FREEDOM ===")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect,
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") 
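For reference on the corrected F-tests assembled above: a Greenhouse-Geisser (or Huynh-Feldt) p-value is simply the uncorrected F re-evaluated with both degrees of freedom multiplied by epsilon. A sketch with made-up numbers (F_obs, df1, df2, and eps stand in for one row of corrected_results); the TIME × INTERVAL contrasts then continue below:

# Hypothetical values for one within-subjects effect.
F_obs <- 5.2    # uncorrected F
df1   <- 2      # numerator df
df2   <- 398    # denominator df
eps   <- 0.85   # Greenhouse-Geisser epsilon (GGe)

p_gg <- pf(F_obs, df1 * eps, df2 * eps, lower.tail = FALSE)
p_gg  # corresponds to the p[GG] column that ezANOVA reports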
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == 
level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# 
============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - -print("=== INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for temporal_DO × TIME × INTERVAL -emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way_plot <- emm_3way_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006195408.r b/.history/eohi2/mixed anova - DGEN_20251006195408.r deleted file mode 100644 index 4f657eb..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006195408.r +++ /dev/null @@ -1,755 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, -# DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, -# DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val, -# X5_10DGEN_past_pref, X5_10DGEN_past_pers, X5_10DGEN_past_val, -# X5_10DGEN_fut_pref, X5_10DGEN_fut_pers, X5_10DGEN_fut_val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations 
-library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = 
round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, 
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") 
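The interaction blocks in each of these scripts decompose a two-way pattern with pairs(emmeans_object, by = ...), holding one factor fixed while comparing levels of the other under a Bonferroni adjustment. A self-contained sketch on invented data (toy, A, B, and y are made up); the TIME × INTERVAL contrasts of this copy continue below:

library(emmeans)

set.seed(7)
toy   <- expand.grid(A = c("a1", "a2"), B = c("b1", "b2", "b3"), rep = 1:20)
toy$y <- rnorm(nrow(toy)) + (toy$A == "a2") * (toy$B == "b3")  # effect of A only at b3

fit <- aov(y ~ A * B, data = toy)
emm <- emmeans(fit, ~ A * B)
pairs(emm, by = "B", adjust = "bonferroni")  # simple effects of A within each B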
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == 
level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# 
============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - -print("=== INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for temporal_DO × TIME × INTERVAL -emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way_plot <- emm_3way_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006195422.r b/.history/eohi2/mixed anova - DGEN_20251006195422.r deleted file mode 100644 index 4f657eb..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006195422.r +++ /dev/null @@ -1,755 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, -# DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, -# DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val, -# X5_10DGEN_past_pref, X5_10DGEN_past_pers, X5_10DGEN_past_val, -# X5_10DGEN_fut_pref, X5_10DGEN_fut_pers, X5_10DGEN_fut_val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations 
-library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = 
round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# More efficient bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!rlang::sym(group_var)) %>%
-    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once
-  group_data_list <- map(groups, ~ {
-    group_data <- data[data[[group_var]] == .x, response_var]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across between-subjects factors within each within-subjects combination
-print("\n=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123) # For reproducibility
-
-# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination
-print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:")
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination
-print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:")
-hartley_interval_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant)
-
-print(hartley_interval_results)
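For reference, a standalone call of the bootstrap_hartley_critical() helper defined above, on simulated rather than study data; the toy data frame and seed are assumptions for illustration only (purrr is loaded by the script):

# Two groups with deliberately unequal spread: the observed max/min
# variance ratio should tend to exceed the bootstrap 95% critical value.
set.seed(42)
toy <- data.frame(
  grp = rep(c("A", "B"), each = 50),
  y   = c(rnorm(50, sd = 1), rnorm(50, sd = 2))
)
toy_result <- bootstrap_hartley_critical(toy, "grp", "y", n_iter = 500)
print(toy_result$observed_ratio)   # observed F-max ratio
print(toy_result$critical_95)      # bootstrap critical value
print(toy_result$observed_ratio > toy_result$critical_95)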
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check if design is balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL)
-if(all(design_balance %in% c(0, 1))) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = DGEN_SCORE,
-                             wid = pID,
-                             between = .(temporal_DO, interval_DO),
-                             within = .(TIME, DOMAIN, INTERVAL),
-                             type = 3,
-                             detailed = TRUE)
-
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL # Reset row numbers to be sequential
-print(anova_output)
-
-print("Mauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print("Greenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  print("=== CORRECTED DEGREES OF FREEDOM ===")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  print("=== CORRECTED F-TESTS ===")
-  corrected_results <- data.frame(
-    Effect = corrected_df$Effect,
-    Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)],
-    Original_DFn = corrected_df$Original_DFn,
-    Original_DFd = corrected_df$Original_DFd,
-    GG_DFn = corrected_df$GG_DFn,
-    GG_DFd = corrected_df$GG_DFd,
-    HF_DFn = corrected_df$HF_DFn,
-    HF_DFd = corrected_df$HF_DFd,
-    GG_p = sphericity_corr$`p[GG]`,
-    HF_p = sphericity_corr$`p[HF]`
-  )
-  print(corrected_results)
-}
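Since afex is already loaded for aov_ez, the ezANOVA table can be cross-checked in a single call; this is a sketch under the same data and factor names, not part of the original script, reporting Greenhouse-Geisser-corrected p values and generalized eta squared in one table:

# Hedged cross-check of the ezANOVA results with afex::aov_ez
# (GG correction and "ges" effect sizes in a single ANOVA table).
afex_model <- afex::aov_ez(
  id = "pID",
  dv = "DGEN_SCORE",
  data = long_data_clean,
  between = c("temporal_DO", "interval_DO"),
  within = c("TIME", "DOMAIN", "INTERVAL"),
  anova_table = list(correction = "GG", es = "ges")
)
print(afex_model)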
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Extract generalized eta squared from ezANOVA (already calculated)
-effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")]
-effect_sizes$ges <- round(effect_sizes$ges, 5)
-print(effect_sizes)
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-print("=== POST-HOC COMPARISONS ===")
-
-# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output)
-aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)),
-                 data = long_data_clean)
-
-# Main effects
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print(time_emmeans)
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-print("Main Effect of DOMAIN:")
-domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
-print(domain_emmeans)
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-print("Main Effect of INTERVAL:")
-interval_emmeans <- emmeans(aov_model, ~ INTERVAL)
-print(interval_emmeans)
-interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni")
-print(interval_contrasts)
-
-print("Main Effect of temporal_DO:")
-temporal_emmeans <- emmeans(aov_model, ~ temporal_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-print("Main Effect of interval_DO:")
-interval_do_emmeans <- emmeans(aov_model, ~ interval_DO)
-interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni")
-print(interval_do_contrasts)
-
-# =============================================================================
-# TWO-WAY INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TIME × DOMAIN Interaction
-print("=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN)
-print(time_domain_emmeans)
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(time_domain_simple2)
-
-# TIME × INTERVAL Interaction
-print("=== TIME × INTERVAL INTERACTION ===")
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL)
-print(time_interval_emmeans)
-time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_interval_simple)
-time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni")
-print(time_interval_simple2)
-
-# DOMAIN × INTERVAL Interaction
-print("=== DOMAIN × INTERVAL INTERACTION ===")
-domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL)
-print(domain_interval_emmeans)
-domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(domain_interval_simple)
-domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni")
-print(domain_interval_simple2)
-
-# Between-subjects interactions
-print("=== temporal_DO × interval_DO INTERACTION ===")
-temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO)
-print(temporal_interval_do_emmeans)
-temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni")
-print(temporal_interval_do_simple)
-temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni")
-print(temporal_interval_do_simple2)
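Bonferroni is the most conservative of the standard family-wise adjustments; as a quick sensitivity check, the same contrasts can be re-adjusted without refitting anything. A sketch using two of the emmeans objects created above:

# Holm controls the same family-wise error rate as Bonferroni but is
# uniformly more powerful; diverging conclusions flag borderline effects.
print(pairs(domain_emmeans, adjust = "holm"))
print(pairs(interval_emmeans, adjust = "holm"))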
-# =============================================================================
-# THREE-WAY INTERACTION ANALYSES
-# =============================================================================
-
-# TIME × DOMAIN × INTERVAL Interaction
-print("=== TIME × DOMAIN × INTERVAL INTERACTION ===")
-three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL)
-print(three_way_emmeans)
-three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni")
-print(three_way_contrasts)
-
-# Between-subjects × within-subjects interactions
-print("=== temporal_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME)
-print(temporal_time_emmeans)
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# =============================================================================
-# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS
-# =============================================================================
-
-print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===")
-
-# Function to calculate Cohen's d for pairwise comparisons
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if(nrow(significant_pairs) > 0) {
-    print(significant_pairs)
-
-    # Calculate Cohen's d for all significant pairs
-    cohens_d_results <- data.frame()
-
-    for(i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- as.character(comparison$contrast)
-
-      # Parse the contrast
-      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-      if(length(contrast_parts) == 2) {
-        level1 <- trimws(contrast_parts[1])
-        level2 <- trimws(contrast_parts[2])
-
-        # Get raw data for both conditions
-        if(!is.null(group2_var) && group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][
-            data[[group1_var]] == level1 &
-            data[[group2_var]] == group2_level]
-
-          data2 <- data[[response_var]][
-            data[[group1_var]] == level2 &
-            data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if(length(data1) > 0 && length(data2) > 0) {
-          # Calculate Cohen's d using effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          result_row <- data.frame(
-            Comparison = contrast_name,
-            n1 = length(data1),
-            n2 = length(data2),
-            Cohens_d = round(cohens_d_result$estimate, 5),
-            Effect_size = cohens_d_result$magnitude,
-            p_value = round(comparison$p.value, 5)
-          )
-
-          if(!is.null(group2_var) && group2_var %in% colnames(comparison)) {
-            result_row$Group_level <- group2_level
-          }
-
-          cohens_d_results <- rbind(cohens_d_results, result_row)
-        }
-      }
-    }
-
-    if(nrow(cohens_d_results) > 0) {
-      print(cohens_d_results)
-    }
-  } else {
-    cat("No significant pairwise comparisons found.\n")
-  }
-}
-
-# Calculate Cohen's d for main effects
-time_contrasts_df <- as.data.frame(time_contrasts)
-calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE")
-
-domain_contrasts_df <- as.data.frame(domain_contrasts)
-calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE")
-
-interval_contrasts_df <- as.data.frame(interval_contrasts)
-calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE")
-
-# Calculate Cohen's d for two-way interactions
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE")
-
-time_domain_simple2_df <- as.data.frame(time_domain_simple2)
-calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE")
-
-# Calculate Cohen's d for three-way interaction
-three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ]
-
-if(nrow(significant_three_way) > 0) {
-  three_way_cohens_d <- data.frame()
-
-  for(i in seq_len(nrow(significant_three_way))) {
-    comparison <- significant_three_way[i, ]
-
-    # Extract the grouping variables
-    domain_level <- as.character(comparison$DOMAIN)
-    interval_level <- as.character(comparison$INTERVAL)
-
-    # Get data for Past and Future within this DOMAIN × INTERVAL combination
-    past_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$INTERVAL == interval_level &
-      long_data_clean$TIME == "Past"
-    ]
-
-    future_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$INTERVAL == interval_level &
-      long_data_clean$TIME == "Future"
-    ]
-
-    if(length(past_data) > 0 && length(future_data) > 0) {
-      # Calculate Cohen's d using effsize package
-      cohens_d_result <- cohen.d(past_data, future_data)
-
-      result_row <- data.frame(
-        DOMAIN = domain_level,
-        INTERVAL = interval_level,
-        Comparison = "Past vs Future",
-        n_Past = length(past_data),
-        n_Future = length(future_data),
-        Cohens_d = round(cohens_d_result$estimate, 5),
-        Effect_size = cohens_d_result$magnitude,
-        p_value = round(comparison$p.value, 5),
-        Estimated_difference = round(comparison$estimate, 5)
-      )
-
-      three_way_cohens_d <- rbind(three_way_cohens_d, result_row)
-    }
-  }
-
-  if(nrow(three_way_cohens_d) > 0) {
-    print(three_way_cohens_d)
-  }
-} else {
-  cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n")
-}
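The effectsize package the script loads offers a near drop-in for effsize::cohen.d() that also reports a confidence interval; a sketch for one illustrative cell (the choice of Preferences/5 is arbitrary), mirroring the unpaired usage above:

# Hedged alternative: effectsize::cohens_d() returns the estimate plus
# a 95% CI, which the effsize-based helper above does not report.
cell <- subset(long_data_clean, DOMAIN == "Preferences" & INTERVAL == "5")
print(effectsize::cohens_d(
  cell$DGEN_SCORE[cell$TIME == "Past"],
  cell$DGEN_SCORE[cell$TIME == "Future"]
))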
-
-# =============================================================================
-# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (Emmeans only)
-# =============================================================================
-
-print("=== INTERACTION PLOT ===")
-
-# Define color palette for TIME
-time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F")
-
-# Create emmeans for temporal_DO × TIME × INTERVAL
-emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL)
-
-# Prepare emmeans data frame
-emmeans_3way_plot <- emm_3way_plot %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")),
-    TIME = factor(TIME, levels = c("Past", "Future")),
-    INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")),
-    x_pos = as.numeric(temporal_DO),
-    time_offset = (as.numeric(TIME) - 1.5) * 0.2,
-    x_dodged = x_pos + time_offset
-  )
-
-# Create 3-way interaction plot with facets
-interaction_plot_3way <- ggplot(emmeans_3way_plot) +
-  geom_errorbar(
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME),
-    width = 0.1,
-    linewidth = 1,
-    alpha = 0.8
-  ) +
-  geom_point(
-    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
-    size = 5,
-    stroke = 1.2,
-    color = "black"
-  ) +
-  facet_wrap(~INTERVAL, nrow = 1) +
-  labs(
-    x = "Order",
-    y = "Mean absolute difference from present",
-    title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2),
-    labels = c("Past First", "Future First"),
-    limits = c(0.5, 2.5)
-  ) +
-  scale_color_manual(name = "Temporal Direction", values = time_colors) +
-  scale_fill_manual(name = "Temporal Direction", values = time_colors) +
-  scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5),
-    strip.text = element_text(size = 11, face = "bold")
-  )
-
-print(interaction_plot_3way)
\ No newline at end of file
diff --git a/.history/eohi2/mixed anova - DGEN_20251006195559.r b/.history/eohi2/mixed anova - DGEN_20251006195559.r
deleted file mode 100644
index 4f657eb..0000000
diff --git a/.history/eohi2/mixed anova - DGEN_20251006200505.r b/.history/eohi2/mixed anova - DGEN_20251006200505.r
deleted file mode 100644
index 9ff5786..0000000
--- a/.history/eohi2/mixed anova - DGEN_20251006200505.r
+++ /dev/null
@@ -1,755 +0,0 @@
-# Mixed ANOVA Analysis for DGEN Variables
-# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors
-# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val,
-#            DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val,
-#            DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val,
-#            DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val,
-#            X5_10DGEN_past_pref, X5_10DGEN_past_pers, X5_10DGEN_past_val,
-#            X5_10DGEN_fut_pref, X5_10DGEN_fut_pers, X5_10DGEN_fut_val
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex)       # For aov_ez (cleaner ANOVA output)
-library(nortest)    # For normality tests
-library(emmeans)    # For post-hoc comparisons
-library(purrr)      # For map functions
-library(effsize)    # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-# Read the data
-data <- read.csv("eohi2.csv")
-
-# Verify the specific variables we need
-required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val",
-                   "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val",
-                   "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val",
-                   "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val",
-                   "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val",
-                   "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-}
-
-# Define variable mapping for the three within-subjects factors
-variable_mapping <- data.frame(
-  variable = required_vars,
-  TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3),
-           rep("Past", 3), rep("Future", 3)),
-  DOMAIN = rep(c("Preferences", "Personality", "Values"), 6),
-  INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3),
-               rep("5_10", 3), rep("5_10", 3)),
-  stringsAsFactors = FALSE
-)
-
-print("Variable mapping:")
-print(variable_mapping)
-
-# Efficient data pivoting using pivot_longer
-long_data <- data %>%
-  select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>%
-  pivot_longer(
-    cols = all_of(required_vars),
-    names_to = "variable",
-    values_to = "DGEN_SCORE"
-  ) %>%
-  left_join(variable_mapping, by = "variable") %>%
-  # Convert to factors with proper levels
-  mutate(
-    TIME = factor(TIME, levels = c("Past", "Future")),
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")),
-    INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")),
-    pID = as.factor(ResponseId), # Use ResponseId as participant ID
-    temporal_DO = as.factor(TEMPORAL_DO),
-    interval_DO = as.factor(INTERVAL_DO)
-  ) %>%
-  # Select final columns and remove any rows with missing values
-  select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>%
-  filter(!is.na(DGEN_SCORE))
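Because the 18 variable names follow two different conventions (DGEN_* versus X5_10DGEN_*), the hand-built lookup table is safer than a regex split; a small sketch that guards the join above against silent mismatches (the checks themselves are an addition, not part of the original script):

# Sanity checks on variable_mapping: every required variable maps to
# exactly one row, and the factors cover all 2 x 3 x 3 = 18 cells once.
stopifnot(
  nrow(variable_mapping) == length(required_vars),
  anyDuplicated(variable_mapping$variable) == 0,
  all(required_vars %in% variable_mapping$variable)
)
print(table(variable_mapping$TIME, variable_mapping$DOMAIN, variable_mapping$INTERVAL))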
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5),
-    variance = round(var(DGEN_SCORE, na.rm = TRUE), 5),
-    sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5),
-    median = round(median(DGEN_SCORE, na.rm = TRUE), 5),
-    q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5),
-    min = round(min(DGEN_SCORE, na.rm = TRUE), 5),
-    max = round(max(DGEN_SCORE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:")
-print(desc_stats)
-
-# Descriptive statistics by between-subjects factors
-desc_stats_by_between <- long_data %>%
-  group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5),
-    variance = round(var(DGEN_SCORE, na.rm = TRUE), 5),
-    sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by between-subjects factors:")
-print(desc_stats_by_between)
-
-# Calculate mean differences for key comparisons
-print("\n=== KEY MEAN DIFFERENCES ===")
-
-# Past vs Future differences for each DOMAIN × INTERVAL combination
-past_future_diffs <- long_data %>%
-  group_by(DOMAIN, INTERVAL, pID) %>%
-  summarise(
-    past_score = DGEN_SCORE[TIME == "Past"],
-    future_score = DGEN_SCORE[TIME == "Future"],
-    difference = past_score - future_score,
-    .groups = 'drop'
-  ) %>%
-  group_by(DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    mean_diff = round(mean(difference, na.rm = TRUE), 5),
-    sd_diff = round(sd(difference, na.rm = TRUE), 5),
-    se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5),
-    .groups = 'drop'
-  )
-
-print("Past vs Future differences by DOMAIN × INTERVAL:")
-print(past_future_diffs)
-
-# 5 vs 10 interval differences for each TIME × DOMAIN combination
-interval_diffs <- long_data %>%
-  group_by(TIME, DOMAIN, pID) %>%
-  summarise(
-    interval_5_score = DGEN_SCORE[INTERVAL == "5"],
-    interval_10_score = DGEN_SCORE[INTERVAL == "10"],
-    difference = interval_5_score - interval_10_score,
-    .groups = 'drop'
-  ) %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean_diff = round(mean(difference, na.rm = TRUE), 5),
-    sd_diff = round(sd(difference, na.rm = TRUE), 5),
-    se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5),
-    .groups = 'drop'
-  )
-
-print("\n5 vs 10 interval differences by TIME × DOMAIN:")
-print(interval_diffs)
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing
-long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ]
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(DGEN_SCORE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME, DOMAIN, and INTERVAL:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    mean = mean(DGEN_SCORE),
-    sd = sd(DGEN_SCORE),
-    q1 = quantile(DGEN_SCORE, 0.25),
-    q3 = quantile(DGEN_SCORE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$DGEN_SCORE)$statistic,
-    ad_p_value = ad.test(.data$DGEN_SCORE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# Round only the numeric columns
-normality_results_rounded <- normality_results %>%
-  mutate(across(where(is.numeric), ~ round(.x, 5)))
-print(normality_results_rounded)
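In large cells, ad.test() will flag even trivial departures from normality, so a visual check is a useful complement; a sketch of per-cell normal QQ plots with ggplot2 (loaded via tidyverse), again an addition rather than part of the original script:

# Hedged visual complement to the Anderson-Darling tests: one QQ
# panel per TIME x DOMAIN x INTERVAL cell.
qq_plot <- ggplot(long_data_clean, aes(sample = DGEN_SCORE)) +
  stat_qq(size = 0.6, alpha = 0.5) +
  stat_qq_line(color = "red") +
  facet_grid(TIME ~ DOMAIN + INTERVAL) +
  theme_minimal()
print(qq_plot)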
-
-# 4. Homogeneity of variance tests
-# Test homogeneity across TIME within each DOMAIN × INTERVAL combination
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN, INTERVAL) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME × INTERVAL combination
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME, INTERVAL) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:")
-print(homogeneity_domain)
-
-# Test homogeneity across INTERVAL within each TIME × DOMAIN combination
-homogeneity_interval <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:")
-print(homogeneity_interval)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# More efficient bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!rlang::sym(group_var)) %>%
-    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once
-  group_data_list <- map(groups, ~ {
-    group_data <- data[data[[group_var]] == .x, response_var]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across between-subjects factors within each within-subjects combination
-print("\n=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123) # For reproducibility
-
-# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination
-print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:")
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination
-print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:")
-hartley_interval_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant)
-
-print(hartley_interval_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check if design is balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL)
-if(all(design_balance %in% c(0, 1))) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = DGEN_SCORE,
-                             wid = pID,
-                             between = .(temporal_DO, interval_DO),
-                             within = .(TIME, DOMAIN, INTERVAL),
-                             type = 3,
-                             detailed = TRUE)
-
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL # Reset row numbers to be sequential
-print(anova_output)
-
-print("Mauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print("Greenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  print("=== CORRECTED DEGREES OF FREEDOM ===")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  print("=== CORRECTED F-TESTS ===")
-  corrected_results <- data.frame(
-    Effect = corrected_df$Effect,
-    Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)],
-    Original_DFn = corrected_df$Original_DFn,
-    Original_DFd = corrected_df$Original_DFd,
-    GG_DFn = corrected_df$GG_DFn,
-    GG_DFd = corrected_df$GG_DFd,
-    HF_DFn = corrected_df$HF_DFn,
-    HF_DFd = corrected_df$HF_DFd,
-    GG_p = sphericity_corr$`p[GG]`,
-    HF_p = sphericity_corr$`p[HF]`
-  )
-  print(corrected_results)
-}
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") 
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == 
level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# 
-# =============================================================================
-# INTERACTION PLOT: temporal_DO × TIME × DOMAIN (Emmeans only)
-# =============================================================================
-
-print("=== INTERACTION PLOT ===")
-
-# Define color palette for TIME
-time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F")
-
-# Create emmeans for temporal_DO × TIME × DOMAIN
-emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * DOMAIN)
-
-# Prepare emmeans data frame
-emmeans_3way_plot <- emm_3way_plot %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")),
-    TIME = factor(TIME, levels = c("Past", "Future")),
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")),
-    x_pos = as.numeric(temporal_DO),
-    time_offset = (as.numeric(TIME) - 1.5) * 0.2,
-    x_dodged = x_pos + time_offset
-  )
-
-# Create 3-way interaction plot with facets
-interaction_plot_3way <- ggplot(emmeans_3way_plot) +
-  geom_errorbar(
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME),
-    width = 0.1,
-    linewidth = 1,
-    alpha = 0.8
-  ) +
-  geom_point(
-    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
-    size = 5,
-    stroke = 1.2,
-    color = "black"
-  ) +
-  facet_wrap(~DOMAIN, nrow = 1) +
-  labs(
-    x = "Order",
-    y = "DGEN Score",
-    title = "temporal_DO × TIME × DOMAIN Interaction (Estimated Marginal Means)"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2),
-    labels = c("Past First", "Future First"),
-    limits = c(0.5, 2.5)
-  ) +
-  scale_color_manual(name = "Temporal Direction", values = time_colors) +
-  scale_fill_manual(name = "Temporal Direction", values = time_colors) +
-  scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5),
-    strip.text = element_text(size = 11, face = "bold")
-  )
-
-print(interaction_plot_3way)
\ No newline at end of file
diff --git a/.history/eohi2/mixed anova - DGEN_20251006200514.r b/.history/eohi2/mixed anova - DGEN_20251006200514.r
deleted file mode 100644
index 9ff5786..0000000
--- a/.history/eohi2/mixed anova - DGEN_20251006200514.r
+++ /dev/null
@@ -1,755 +0,0 @@
diff --git a/.history/eohi2/mixed anova - DGEN_20251006200516.r b/.history/eohi2/mixed anova - DGEN_20251006200516.r
deleted file mode 100644
index 9ff5786..0000000
--- a/.history/eohi2/mixed anova - DGEN_20251006200516.r
+++ /dev/null
@@ -1,755 +0,0 @@
============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × DOMAIN (Emmeans only) -# ============================================================================= - -print("=== INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for temporal_DO × TIME × DOMAIN -emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_3way_plot <- emm_3way_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~DOMAIN, nrow = 1) + - labs( - x = "Order", - y = "DGEN Score", - title = "temporal_DO × TIME × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006201016.r b/.history/eohi2/mixed anova - DGEN_20251006201016.r deleted file mode 100644 index da25662..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006201016.r +++ /dev/null @@ -1,755 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, -# DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, -# DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val, -# X5_10DGEN_past_pref, X5_10DGEN_past_pers, X5_10DGEN_past_val, -# X5_10DGEN_fut_pref, X5_10DGEN_fut_pers, X5_10DGEN_fut_val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For 
effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = 
TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, 
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") 
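# The corrected degrees of freedom above are just the original df scaled by
# the GG/HF epsilon; the corrected p-values then follow from the unchanged F
# statistic evaluated against those scaled df. A sketch of that arithmetic,
# not in the original script, assuming corrected_df and anova_table exist as
# built above; the results should match ezANOVA's p[GG] and p[HF], up to HF
# epsilon sometimes being capped at 1:
gg_hf_check <- within(corrected_df, {
  F_obs <- anova_table$F[match(Effect, anova_table$Effect)]
  p_GG_manual <- pf(F_obs, GG_DFn, GG_DFd, lower.tail = FALSE)
  p_HF_manual <- pf(F_obs, HF_DFn, HF_DFd, lower.tail = FALSE)
})
print(gg_hf_check[, c("Effect", "p_GG_manual", "p_HF_manual")])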
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == 
level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# 
============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × DOMAIN (Emmeans only) -# ============================================================================= - -print("=== INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for temporal_DO × TIME × DOMAIN -emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_3way_plot <- emm_3way_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~DOMAIN, nrow = 1) + - labs( - x = "Order", - y = "Absolute difference from the present", - title = "temporal_DO × TIME × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006201021.r b/.history/eohi2/mixed anova - DGEN_20251006201021.r deleted file mode 100644 index da25662..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006201021.r +++ /dev/null @@ -1,755 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, -# DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, -# DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val, -# X5_10DGEN_past_pref, X5_10DGEN_past_pers, X5_10DGEN_past_val, -# X5_10DGEN_fut_pref, X5_10DGEN_fut_pers, X5_10DGEN_fut_val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations 
-library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = 
round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, 
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") 
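# afex is loaded above for aov_ez, but the model itself is fit with ezANOVA.
# For reference, a sketch of the roughly equivalent afex call (not in the
# original script): aov_ez also runs Type-III tests, applies the
# Greenhouse-Geisser correction by default, and reports ges effect sizes.
afex_model <- aov_ez(id = "pID", dv = "DGEN_SCORE", data = long_data_clean,
                     between = c("temporal_DO", "interval_DO"),
                     within = c("TIME", "DOMAIN", "INTERVAL"),
                     type = 3)
print(afex_model)
# anova(afex_model, correction = "HF") would report Huynh-Feldt instead.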
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == 
level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# 
============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × DOMAIN (Emmeans only) -# ============================================================================= - -print("=== INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for temporal_DO × TIME × DOMAIN -emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_3way_plot <- emm_3way_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~DOMAIN, nrow = 1) + - labs( - x = "Order", - y = "Absolute difference from the present", - title = "temporal_DO × TIME × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006201023.r b/.history/eohi2/mixed anova - DGEN_20251006201023.r deleted file mode 100644 index da25662..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006201023.r +++ /dev/null @@ -1,755 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, -# DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, -# DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val, -# X5_10DGEN_past_pref, X5_10DGEN_past_pers, X5_10DGEN_past_val, -# X5_10DGEN_fut_pref, X5_10DGEN_fut_pers, X5_10DGEN_fut_val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations 
-library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = 
round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, 
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") 
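# A minimal sketch (assumed usage, not part of the original script): instead
# of pairwise contrasts, each simple effect can be summarised as a single
# family-wise F test. joint_tests() works from the same aov_model; note that
# with a multi-stratum Error() model the degrees of freedom may be pooled
# differently than in ezANOVA, so treat this as exploratory output only.
print(joint_tests(aov_model))               # one F test per model term
print(joint_tests(aov_model, by = "TIME"))  # simple-effect F tests within each TIME level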
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == 
level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# 
============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × DOMAIN (Emmeans only) -# ============================================================================= - -print("=== INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for temporal_DO × TIME × DOMAIN -emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_3way_plot <- emm_3way_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~DOMAIN, nrow = 1) + - labs( - x = "Order", - y = "Absolute difference from the present", - title = "temporal_DO × TIME × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006201057.r b/.history/eohi2/mixed anova - DGEN_20251006201057.r deleted file mode 100644 index da25662..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006201057.r +++ /dev/null
============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × DOMAIN (Emmeans only) -# ============================================================================= - -print("=== INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for temporal_DO × TIME × DOMAIN -emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_3way_plot <- emm_3way_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~DOMAIN, nrow = 1) + - labs( - x = "Order", - y = "Absolute difference from the present", - title = "temporal_DO × TIME × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006230411.r b/.history/eohi2/mixed anova - DGEN_20251006230411.r deleted file mode 100644 index da25662..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006230411.r +++ /dev/null @@ -1,755 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, -# DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, -# DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val, -# X5_10DGEN_past_pref, X5_10DGEN_past_pers, X5_10DGEN_past_val, -# X5_10DGEN_fut_pref, X5_10DGEN_fut_pers, X5_10DGEN_fut_val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations 
-library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = 
round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, 
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") 
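# ---------------------------------------------------------------------------
# A minimal standalone sketch (toy data; every name below is hypothetical and
# not part of this analysis) of how pairs(..., by = ...) defines the
# multiplicity family: the Bonferroni adjustment is applied within each
# by-group separately, not across all contrasts at once.
library(emmeans)
toy <- expand.grid(TIME = c("Past", "Future"),
                   DOMAIN = c("Pref", "Pers", "Val"),
                   rep = 1:10)
set.seed(1)
toy$y <- rnorm(nrow(toy))
toy_fit <- lm(y ~ TIME * DOMAIN, data = toy)
toy_emm <- emmeans(toy_fit, ~ TIME * DOMAIN)
# Three DOMAIN contrasts per TIME level; each family of three is
# Bonferroni-adjusted on its own (p-values multiplied by 3, capped at 1).
pairs(toy_emm, by = "TIME", adjust = "bonferroni")
# ---------------------------------------------------------------------------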
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == 
level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# 
============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × DOMAIN (Emmeans only) -# ============================================================================= - -print("=== INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for temporal_DO × TIME × DOMAIN -emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_3way_plot <- emm_3way_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~DOMAIN, nrow = 1) + - labs( - x = "Order", - y = "Absolute difference from the present", - title = "temporal_DO × TIME × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006231257.r b/.history/eohi2/mixed anova - DGEN_20251006231257.r deleted file mode 100644 index 1eba15c..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006231257.r +++ /dev/null @@ -1,775 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, -# DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, -# DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val, -# X5_10DGEN_past_pref, X5_10DGEN_past_pers, X5_10DGEN_past_val, -# X5_10DGEN_fut_pref, X5_10DGEN_fut_pers, X5_10DGEN_fut_val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations 
-library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = 
round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, 
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") 
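# ---------------------------------------------------------------------------
# A short self-contained sketch (illustrative numbers, not taken from this
# data set) of the Greenhouse-Geisser arithmetic computed above: both degrees
# of freedom are scaled by epsilon, and the corrected p-value re-evaluates
# the same observed F statistic on the scaled df.
F_obs <- 4.20    # hypothetical uncorrected F
DFn   <- 2       # hypothetical numerator df
DFd   <- 198     # hypothetical denominator df
GGe   <- 0.85    # hypothetical Greenhouse-Geisser epsilon
p_uncorrected <- pf(F_obs, DFn, DFd, lower.tail = FALSE)
p_GG          <- pf(F_obs, DFn * GGe, DFd * GGe, lower.tail = FALSE)
c(p_uncorrected = p_uncorrected, p_GG = p_GG)
# Because epsilon < 1 shrinks the df, the corrected p-value is never smaller
# than the uncorrected one; the Huynh-Feldt correction works the same way
# with HFe in place of GGe.
# ---------------------------------------------------------------------------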
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# temporal_DO × TIME × INTERVAL three-way interaction -print("=== temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) -print(temporal_time_interval_emmeans) - -# Simple effects of TIME within each temporal_DO × INTERVAL combination -temporal_time_interval_simple1 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of TIME within each temporal_DO × INTERVAL:") -print(temporal_time_interval_simple1) - -# Simple effects of temporal_DO within each TIME × INTERVAL combination -temporal_time_interval_simple2 <- pairs(temporal_time_interval_emmeans, by = c("TIME", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of temporal_DO within each TIME × INTERVAL:") -print(temporal_time_interval_simple2) - -# Simple effects of INTERVAL within each temporal_DO × TIME combination -temporal_time_interval_simple3 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "TIME"), adjust = "bonferroni") -print("Simple effects of INTERVAL within each temporal_DO × TIME:") -print(temporal_time_interval_simple3) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT 
EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - 
past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# ============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × DOMAIN (Emmeans only) -# ============================================================================= - -print("=== INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for temporal_DO × TIME × DOMAIN -emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_3way_plot <- emm_3way_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~DOMAIN, nrow = 1) + - labs( - x = "Order", - y = "Absolute difference from the present", - title = "temporal_DO × TIME × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = 
"bold") - ) - -print(interaction_plot_3way) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006231308.r b/.history/eohi2/mixed anova - DGEN_20251006231308.r deleted file mode 100644 index af178eb..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006231308.r +++ /dev/null @@ -1,830 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, -# DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, -# DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val, -# X5_10DGEN_past_pref, X5_10DGEN_past_pers, X5_10DGEN_past_val, -# X5_10DGEN_fut_pref, X5_10DGEN_fut_pers, X5_10DGEN_fut_val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# 
============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently 
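-    # Each draw resamples every group from its own observations (with
-    # replacement), so the resulting F-max ratios approximate the sampling
-    # variability of max(var)/min(var) around the observed group variances;
-    # critical_95 is then simply the empirical 0.95 quantile of those ratios.
-    # For example, with n_iter = 1000 valid draws it is (up to quantile
-    # interpolation) roughly the 950th smallest ratio:
-    # sort(bootstrap_ratios)[950]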
- sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, 
interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- 
emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# temporal_DO × TIME × INTERVAL three-way interaction -print("=== temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) -print(temporal_time_interval_emmeans) - -# Simple effects of TIME within each temporal_DO × 
INTERVAL combination -temporal_time_interval_simple1 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of TIME within each temporal_DO × INTERVAL:") -print(temporal_time_interval_simple1) - -# Simple effects of temporal_DO within each TIME × INTERVAL combination -temporal_time_interval_simple2 <- pairs(temporal_time_interval_emmeans, by = c("TIME", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of temporal_DO within each TIME × INTERVAL:") -print(temporal_time_interval_simple2) - -# Simple effects of INTERVAL within each temporal_DO × TIME combination -temporal_time_interval_simple3 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "TIME"), adjust = "bonferroni") -print("Simple effects of INTERVAL within each temporal_DO × TIME:") -print(temporal_time_interval_simple3) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, 
"INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for temporal_DO × TIME × INTERVAL interaction -print("=== COHEN'S D FOR temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_simple1_df <- as.data.frame(temporal_time_interval_simple1) -significant_temporal_time_interval <- temporal_time_interval_simple1_df[temporal_time_interval_simple1_df$p.value < 0.05, ] - -if(nrow(significant_temporal_time_interval) > 0) { - temporal_time_interval_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_temporal_time_interval))) { - comparison <- significant_temporal_time_interval[i, ] - - # Extract the grouping variables - temporal_level <- as.character(comparison$temporal_DO) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this temporal_DO × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - temporal_DO = temporal_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - temporal_time_interval_cohens_d <- rbind(temporal_time_interval_cohens_d, result_row) - } - } - - if(nrow(temporal_time_interval_cohens_d) > 0) { - print(temporal_time_interval_cohens_d) - } -} else { - cat("No significant TIME effects found within any temporal_DO × INTERVAL combination.\n") -} - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - 
DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# ============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × DOMAIN (Emmeans only) -# ============================================================================= - -print("=== INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for temporal_DO × TIME × DOMAIN -emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_3way_plot <- emm_3way_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~DOMAIN, nrow = 1) + - labs( - x = "Order", - y = "Absolute difference from the present", - title = "temporal_DO × TIME × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006231314.r b/.history/eohi2/mixed anova - DGEN_20251006231314.r deleted file mode 100644 index af178eb..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006231314.r +++ /dev/null @@ -1,830 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, -# 
DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, -# DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val, -# X5_10DGEN_past_pref, X5_10DGEN_past_pers, X5_10DGEN_past_val, -# X5_10DGEN_fut_pref, X5_10DGEN_fut_pers, X5_10DGEN_fut_val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 
0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 
<- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - 
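-  # Both corrections rescale the within-subjects df by an epsilon estimate
-  # (GGe or HFe) before the p-value is looked up; e.g. an effect tested on
-  # F(2, 398) with GGe = 0.85 would be evaluated as
-  # F(2 * 0.85, 398 * 0.85) = F(1.7, 338.3) (illustrative numbers only --
-  # the actual scaled df are assembled in the corrected_df table below).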
print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# 
============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# temporal_DO × TIME × INTERVAL three-way interaction -print("=== temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) -print(temporal_time_interval_emmeans) - -# Simple effects of TIME within each temporal_DO × INTERVAL combination -temporal_time_interval_simple1 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of TIME within each temporal_DO × INTERVAL:") -print(temporal_time_interval_simple1) - -# Simple effects of temporal_DO within each TIME × INTERVAL combination -temporal_time_interval_simple2 <- pairs(temporal_time_interval_emmeans, by = c("TIME", "INTERVAL"), adjust = 
"bonferroni") -print("Simple effects of temporal_DO within each TIME × INTERVAL:") -print(temporal_time_interval_simple2) - -# Simple effects of INTERVAL within each temporal_DO × TIME combination -temporal_time_interval_simple3 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "TIME"), adjust = "bonferroni") -print("Simple effects of INTERVAL within each temporal_DO × TIME:") -print(temporal_time_interval_simple3) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for temporal_DO × 
TIME × INTERVAL interaction -print("=== COHEN'S D FOR temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_simple1_df <- as.data.frame(temporal_time_interval_simple1) -significant_temporal_time_interval <- temporal_time_interval_simple1_df[temporal_time_interval_simple1_df$p.value < 0.05, ] - -if(nrow(significant_temporal_time_interval) > 0) { - temporal_time_interval_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_temporal_time_interval))) { - comparison <- significant_temporal_time_interval[i, ] - - # Extract the grouping variables - temporal_level <- as.character(comparison$temporal_DO) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this temporal_DO × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - temporal_DO = temporal_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - temporal_time_interval_cohens_d <- rbind(temporal_time_interval_cohens_d, result_row) - } - } - - if(nrow(temporal_time_interval_cohens_d) > 0) { - print(temporal_time_interval_cohens_d) - } -} else { - cat("No significant TIME effects found within any temporal_DO × INTERVAL combination.\n") -} - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - 
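-    # The Effect_size labels come from effsize::cohen.d, which buckets |d|
-    # using Cohen's conventional cutoffs: < 0.2 negligible, < 0.5 small,
-    # < 0.8 medium, otherwise large. If a correlation-metric effect size is
-    # ever needed instead, a common equal-n approximation is
-    # r <- d / sqrt(d^2 + 4)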
print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# ============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × DOMAIN (Emmeans only) -# ============================================================================= - -print("=== INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for temporal_DO × TIME × DOMAIN -emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_3way_plot <- emm_3way_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~DOMAIN, nrow = 1) + - labs( - x = "Order", - y = "Absolute difference from the present", - title = "temporal_DO × TIME × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006231325.r b/.history/eohi2/mixed anova - DGEN_20251006231325.r deleted file mode 100644 index af178eb..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006231325.r +++ /dev/null @@ -1,830 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, -# DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, -# DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val, -# X5_10DGEN_past_pref, X5_10DGEN_past_pers, X5_10DGEN_past_val, -# X5_10DGEN_fut_pref, X5_10DGEN_fut_pers, X5_10DGEN_fut_val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality 
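# NOTE (editorial sketch, not part of the deleted script): the absolute
# setwd() path above is machine-specific. A project-rooted read is more
# portable; here::here() is one common option (assumes the `here` package
# and a project root marker):
# data <- read.csv(here::here("eohi2", "eohi2.csv"))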
-
-# Verify the specific variables we need
-required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val",
-                   "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val",
-                   "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val",
-                   "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val",
-                   "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val",
-                   "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-}
-
-# Define variable mapping for the three within-subjects factors
-variable_mapping <- data.frame(
-  variable = required_vars,
-  TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3),
-           rep("Past", 3), rep("Future", 3)),
-  DOMAIN = rep(c("Preferences", "Personality", "Values"), 6),
-  INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3),
-               rep("5_10", 3), rep("5_10", 3)),
-  stringsAsFactors = FALSE
-)
-
-print("Variable mapping:")
-print(variable_mapping)
-
-# Efficient data pivoting using pivot_longer
-long_data <- data %>%
-  select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>%
-  pivot_longer(
-    cols = all_of(required_vars),
-    names_to = "variable",
-    values_to = "DGEN_SCORE"
-  ) %>%
-  left_join(variable_mapping, by = "variable") %>%
-  # Convert to factors with proper levels
-  mutate(
-    TIME = factor(TIME, levels = c("Past", "Future")),
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")),
-    INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")),
-    pID = as.factor(ResponseId),  # Use ResponseId as participant ID
-    temporal_DO = as.factor(TEMPORAL_DO),
-    interval_DO = as.factor(INTERVAL_DO)
-  ) %>%
-  # Select final columns and remove any rows with missing values
-  select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>%
-  filter(!is.na(DGEN_SCORE))
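# NOTE (editorial sketch, not part of the deleted script): two quick sanity
# checks on the reshape above. First, each TIME × DOMAIN × INTERVAL cell
# should hold one row per participant:
with(long_data, table(TIME, DOMAIN, INTERVAL))
# Second, because this pipeline ends with filter(!is.na(DGEN_SCORE)), the
# missing-values summary computed later in this script will necessarily
# report 0% missing; to audit missingness, compute it before that filter or
# from the wide data.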
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5),
-    variance = round(var(DGEN_SCORE, na.rm = TRUE), 5),
-    sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5),
-    median = round(median(DGEN_SCORE, na.rm = TRUE), 5),
-    q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5),
-    min = round(min(DGEN_SCORE, na.rm = TRUE), 5),
-    max = round(max(DGEN_SCORE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:")
-print(desc_stats)
-
-# Descriptive statistics by between-subjects factors
-desc_stats_by_between <- long_data %>%
-  group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5),
-    variance = round(var(DGEN_SCORE, na.rm = TRUE), 5),
-    sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by between-subjects factors:")
-print(desc_stats_by_between)
-
-# Calculate mean differences for key comparisons
-print("\n=== KEY MEAN DIFFERENCES ===")
-
-# Past vs Future differences for each DOMAIN × INTERVAL combination
-past_future_diffs <- long_data %>%
-  group_by(DOMAIN, INTERVAL, pID) %>%
-  summarise(
-    past_score = DGEN_SCORE[TIME == "Past"],
-    future_score = DGEN_SCORE[TIME == "Future"],
-    difference = past_score - future_score,
-    .groups = 'drop'
-  ) %>%
-  group_by(DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    mean_diff = round(mean(difference, na.rm = TRUE), 5),
-    sd_diff = round(sd(difference, na.rm = TRUE), 5),
-    se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5),
-    .groups = 'drop'
-  )
-
-print("Past vs Future differences by DOMAIN × INTERVAL:")
-print(past_future_diffs)
-
-# 5 vs 10 interval differences for each TIME × DOMAIN combination
-interval_diffs <- long_data %>%
-  group_by(TIME, DOMAIN, pID) %>%
-  summarise(
-    interval_5_score = DGEN_SCORE[INTERVAL == "5"],
-    interval_10_score = DGEN_SCORE[INTERVAL == "10"],
-    difference = interval_5_score - interval_10_score,
-    .groups = 'drop'
-  ) %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean_diff = round(mean(difference, na.rm = TRUE), 5),
-    sd_diff = round(sd(difference, na.rm = TRUE), 5),
-    se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5),
-    .groups = 'drop'
-  )
-
-print("\n5 vs 10 interval differences by TIME × DOMAIN:")
-print(interval_diffs)
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing
-long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ]
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(DGEN_SCORE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME, DOMAIN, and INTERVAL:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    mean = mean(DGEN_SCORE),
-    sd = sd(DGEN_SCORE),
-    q1 = quantile(DGEN_SCORE, 0.25),
-    q3 = quantile(DGEN_SCORE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$DGEN_SCORE)$statistic,
-    ad_p_value = ad.test(.data$DGEN_SCORE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# Round only the numeric columns
-normality_results_rounded <- normality_results %>%
-  mutate(across(where(is.numeric), ~ round(.x, 5)))
-print(normality_results_rounded)
-
-# 4. Homogeneity of variance tests
-# Test homogeneity across TIME within each DOMAIN × INTERVAL combination
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN, INTERVAL) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME × INTERVAL combination
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME, INTERVAL) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:")
-print(homogeneity_domain)
-
-# Test homogeneity across INTERVAL within each TIME × DOMAIN combination
-homogeneity_interval <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:")
-print(homogeneity_interval)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# More efficient bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!rlang::sym(group_var)) %>%
-    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once
-  group_data_list <- map(groups, ~ {
-    group_data <- data[data[[group_var]] == .x, response_var]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
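# NOTE (editorial sketch, not part of the deleted script): an example call of
# the bootstrap helper on a single design cell, using the column names
# defined above:
example_cell <- subset(long_data_clean,
                       TIME == "Past" & DOMAIN == "Values" & INTERVAL == "5")
bootstrap_hartley_critical(example_cell, "temporal_DO", "DGEN_SCORE", n_iter = 1000)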
-
-# Hartley's F-max test across between-subjects factors within each within-subjects combination
-print("\n=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123)  # For reproducibility
-
-# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination
-print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:")
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination
-print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:")
-hartley_interval_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant)
-
-print(hartley_interval_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check if design is balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL)
-if(all(design_balance %in% c(0, 1))) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = DGEN_SCORE,
-                             wid = pID,
-                             between = .(temporal_DO, interval_DO),
-                             within = .(TIME, DOMAIN, INTERVAL),
-                             type = 3,
-                             detailed = TRUE)
-
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL  # Reset row numbers to be sequential
-print(anova_output)
-
-print("Mauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
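# NOTE (editorial sketch, not part of the deleted script): afex is loaded
# above "for aov_ez" but never called. A cross-check of the ezANOVA table
# with afex (Type III sums of squares; afex applies the Greenhouse-Geisser
# correction in its ANOVA table by default) could look like this:
afex_model <- aov_ez(
  id = "pID",
  dv = "DGEN_SCORE",
  data = long_data_clean,
  between = c("temporal_DO", "interval_DO"),
  within = c("TIME", "DOMAIN", "INTERVAL")
)
print(afex_model)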
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print("Greenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  print("=== CORRECTED DEGREES OF FREEDOM ===")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  print("=== CORRECTED F-TESTS ===")
-  corrected_results <- data.frame(
-    Effect = corrected_df$Effect,
-    Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)],
-    Original_DFn = corrected_df$Original_DFn,
-    Original_DFd = corrected_df$Original_DFd,
-    GG_DFn = corrected_df$GG_DFn,
-    GG_DFd = corrected_df$GG_DFd,
-    HF_DFn = corrected_df$HF_DFn,
-    HF_DFd = corrected_df$HF_DFd,
-    GG_p = sphericity_corr$`p[GG]`,
-    HF_p = sphericity_corr$`p[HF]`
-  )
-  print(corrected_results)
-}
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Extract generalized eta squared from ezANOVA (already calculated)
-effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")]
-effect_sizes$ges <- round(effect_sizes$ges, 5)
-print(effect_sizes)
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-print("=== POST-HOC COMPARISONS ===")
-
-# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output)
-aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)),
-                 data = long_data_clean)
-
-# Main effects
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print(time_emmeans)
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-print("Main Effect of DOMAIN:")
-domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
-print(domain_emmeans)
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-print("Main Effect of INTERVAL:")
-interval_emmeans <- emmeans(aov_model, ~ INTERVAL)
-print(interval_emmeans)
-interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni")
-print(interval_contrasts)
-
-print("Main Effect of temporal_DO:")
-temporal_emmeans <- emmeans(aov_model, ~ temporal_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-print("Main Effect of interval_DO:")
-interval_do_emmeans <- emmeans(aov_model, ~ interval_DO)
-interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni")
-print(interval_do_contrasts)
-
-# =============================================================================
-# TWO-WAY INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TIME × DOMAIN Interaction
-print("=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN)
-print(time_domain_emmeans)
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(time_domain_simple2)
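# NOTE (editorial sketch, not part of the deleted script): Bonferroni is
# conservative with three or more levels; Holm's step-down procedure controls
# the same familywise error rate with more power and is a drop-in alternative
# in emmeans:
domain_contrasts_holm <- pairs(domain_emmeans, adjust = "holm")
print(domain_contrasts_holm)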
-
-# TIME × INTERVAL Interaction
-print("=== TIME × INTERVAL INTERACTION ===")
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL)
-print(time_interval_emmeans)
-time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_interval_simple)
-time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni")
-print(time_interval_simple2)
-
-# DOMAIN × INTERVAL Interaction
-print("=== DOMAIN × INTERVAL INTERACTION ===")
-domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL)
-print(domain_interval_emmeans)
-domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(domain_interval_simple)
-domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni")
-print(domain_interval_simple2)
-
-# Between-subjects interactions
-print("=== temporal_DO × interval_DO INTERACTION ===")
-temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO)
-print(temporal_interval_do_emmeans)
-temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni")
-print(temporal_interval_do_simple)
-temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni")
-print(temporal_interval_do_simple2)
-
-# =============================================================================
-# THREE-WAY INTERACTION ANALYSES
-# =============================================================================
-
-# TIME × DOMAIN × INTERVAL Interaction
-print("=== TIME × DOMAIN × INTERVAL INTERACTION ===")
-three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL)
-print(three_way_emmeans)
-three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni")
-print(three_way_contrasts)
-
-# Between-subjects × within-subjects interactions
-print("=== temporal_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME)
-print(temporal_time_emmeans)
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# temporal_DO × TIME × INTERVAL three-way interaction
-print("=== temporal_DO × TIME × INTERVAL INTERACTION ===")
-temporal_time_interval_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL)
-print(temporal_time_interval_emmeans)
-
-# Simple effects of TIME within each temporal_DO × INTERVAL combination
-temporal_time_interval_simple1 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "INTERVAL"), adjust = "bonferroni")
-print("Simple effects of TIME within each temporal_DO × INTERVAL:")
-print(temporal_time_interval_simple1)
-
-# Simple effects of temporal_DO within each TIME × INTERVAL combination
-temporal_time_interval_simple2 <- pairs(temporal_time_interval_emmeans, by = c("TIME", "INTERVAL"), adjust = "bonferroni")
-print("Simple effects of temporal_DO within each TIME × INTERVAL:")
-print(temporal_time_interval_simple2)
-
-# Simple effects of INTERVAL within each temporal_DO × TIME combination
-temporal_time_interval_simple3 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "TIME"), adjust = "bonferroni")
-print("Simple effects of INTERVAL within each temporal_DO × TIME:")
-print(temporal_time_interval_simple3)
-
-# =============================================================================
-# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS
-# =============================================================================
-
-print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===")
-
-# Function to calculate Cohen's d for pairwise comparisons
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if(nrow(significant_pairs) > 0) {
-    print(significant_pairs)
-
-    # Calculate Cohen's d for all significant pairs
-    cohens_d_results <- data.frame()
-
-    for(i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- as.character(comparison$contrast)
-
-      # Parse the contrast
-      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-      if(length(contrast_parts) == 2) {
-        level1 <- trimws(contrast_parts[1])
-        level2 <- trimws(contrast_parts[2])
-
-        # Get raw data for both conditions
-        if(!is.null(group2_var) && group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][
-            data[[group1_var]] == level1 &
-            data[[group2_var]] == group2_level]
-
-          data2 <- data[[response_var]][
-            data[[group1_var]] == level2 &
-            data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if(length(data1) > 0 && length(data2) > 0) {
-          # Calculate Cohen's d using effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          result_row <- data.frame(
-            Comparison = contrast_name,
-            n1 = length(data1),
-            n2 = length(data2),
-            Cohens_d = round(cohens_d_result$estimate, 5),
-            Effect_size = cohens_d_result$magnitude,
-            p_value = round(comparison$p.value, 5)
-          )
-
-          if(!is.null(group2_var) && group2_var %in% colnames(comparison)) {
-            result_row$Group_level <- group2_level
-          }
-
-          cohens_d_results <- rbind(cohens_d_results, result_row)
-        }
-      }
-    }
-
-    if(nrow(cohens_d_results) > 0) {
-      print(cohens_d_results)
-    }
-  } else {
-    cat("No significant pairwise comparisons found.\n")
-  }
-}
-
-# Calculate Cohen's d for main effects
-time_contrasts_df <- as.data.frame(time_contrasts)
-calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE")
-
-domain_contrasts_df <- as.data.frame(domain_contrasts)
-calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE")
-
-interval_contrasts_df <- as.data.frame(interval_contrasts)
-calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE")
-
-# Calculate Cohen's d for two-way interactions
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE")
-
-time_domain_simple2_df <- as.data.frame(time_domain_simple2)
-calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE")
-
-# Calculate Cohen's d for temporal_DO × TIME × INTERVAL interaction
-print("=== COHEN'S D FOR temporal_DO × TIME × INTERVAL INTERACTION ===")
-temporal_time_interval_simple1_df <- as.data.frame(temporal_time_interval_simple1)
-significant_temporal_time_interval <- temporal_time_interval_simple1_df[temporal_time_interval_simple1_df$p.value < 0.05, ]
-
-if(nrow(significant_temporal_time_interval) > 0) {
-  temporal_time_interval_cohens_d <- data.frame()
-
-  for(i in seq_len(nrow(significant_temporal_time_interval))) {
-    comparison <- significant_temporal_time_interval[i, ]
-
-    # Extract the grouping variables
-    temporal_level <- as.character(comparison$temporal_DO)
-    interval_level <- as.character(comparison$INTERVAL)
-
-    # Get data for Past and Future within this temporal_DO × INTERVAL combination
-    past_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$temporal_DO == temporal_level &
-      long_data_clean$INTERVAL == interval_level &
-      long_data_clean$TIME == "Past"
-    ]
-
-    future_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$temporal_DO == temporal_level &
-      long_data_clean$INTERVAL == interval_level &
-      long_data_clean$TIME == "Future"
-    ]
-
-    if(length(past_data) > 0 && length(future_data) > 0) {
-      # Calculate Cohen's d using effsize package
-      cohens_d_result <- cohen.d(past_data, future_data)
-
-      result_row <- data.frame(
-        temporal_DO = temporal_level,
-        INTERVAL = interval_level,
-        Comparison = "Past vs Future",
-        n_Past = length(past_data),
-        n_Future = length(future_data),
-        Cohens_d = round(cohens_d_result$estimate, 5),
-        Effect_size = cohens_d_result$magnitude,
-        p_value = round(comparison$p.value, 5),
-        Estimated_difference = round(comparison$estimate, 5)
-      )
-
-      temporal_time_interval_cohens_d <- rbind(temporal_time_interval_cohens_d, result_row)
-    }
-  }
-
-  if(nrow(temporal_time_interval_cohens_d) > 0) {
-    print(temporal_time_interval_cohens_d)
-  }
-} else {
-  cat("No significant TIME effects found within any temporal_DO × INTERVAL combination.\n")
-}
-
-# Calculate Cohen's d for three-way interaction
-three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ]
-
-if(nrow(significant_three_way) > 0) {
-  three_way_cohens_d <- data.frame()
-
-  for(i in seq_len(nrow(significant_three_way))) {
-    comparison <- significant_three_way[i, ]
-
-    # Extract the grouping variables
-    domain_level <- as.character(comparison$DOMAIN)
-    interval_level <- as.character(comparison$INTERVAL)
-
-    # Get data for Past and Future within this DOMAIN × INTERVAL combination
-    past_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$INTERVAL == interval_level &
-      long_data_clean$TIME == "Past"
-    ]
-
-    future_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$INTERVAL == interval_level &
-      long_data_clean$TIME == "Future"
-    ]
-
-    if(length(past_data) > 0 && length(future_data) > 0) {
-      # Calculate Cohen's d using effsize package
-      cohens_d_result <- cohen.d(past_data, future_data)
-
-      result_row <- data.frame(
-        DOMAIN = domain_level,
-        INTERVAL = interval_level,
-        Comparison = "Past vs Future",
-        n_Past = length(past_data),
-        n_Future = length(future_data),
-        Cohens_d = round(cohens_d_result$estimate, 5),
-        Effect_size = cohens_d_result$magnitude,
-        p_value = round(comparison$p.value, 5),
-        Estimated_difference = round(comparison$estimate, 5)
-      )
-
-      three_way_cohens_d <- rbind(three_way_cohens_d, result_row)
-    }
-  }
-
-  if(nrow(three_way_cohens_d) > 0) {
-    print(three_way_cohens_d)
-  }
-} else {
-  cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n")
-}
-
-# =============================================================================
-# INTERACTION PLOT: temporal_DO × TIME × DOMAIN (Emmeans only)
-# =============================================================================
-
-print("=== INTERACTION PLOT ===")
-
-# Define color palette for TIME
-time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F")
-
-# Create emmeans for temporal_DO × TIME × DOMAIN
-emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * DOMAIN)
-
-# Prepare emmeans data frame
-emmeans_3way_plot <- emm_3way_plot %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")),
-    TIME = factor(TIME, levels = c("Past", "Future")),
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")),
-    x_pos = as.numeric(temporal_DO),
-    time_offset = (as.numeric(TIME) - 1.5) * 0.2,
-    x_dodged = x_pos + time_offset
-  )
-
-# Create 3-way interaction plot with facets
-interaction_plot_3way <- ggplot(emmeans_3way_plot) +
-  geom_errorbar(
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME),
-    width = 0.1,
-    linewidth = 1,
-    alpha = 0.8
-  ) +
-  geom_point(
-    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
-    size = 5,
-    stroke = 1.2,
-    color = "black"
-  ) +
-  facet_wrap(~DOMAIN, nrow = 1) +
-  labs(
-    x = "Order",
-    y = "Absolute difference from the present",
-    title = "temporal_DO × TIME × DOMAIN Interaction (Estimated Marginal Means)"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2),
-    labels = c("Past First", "Future First"),
-    limits = c(0.5, 2.5)
-  ) +
-  scale_color_manual(name = "Temporal Direction", values = time_colors) +
-  scale_fill_manual(name = "Temporal Direction", values = time_colors) +
-  scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5),
-    strip.text = element_text(size = 11, face = "bold")
-  )
-
-print(interaction_plot_3way)
\ No newline at end of file
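# NOTE (editorial sketch, not part of the deleted script): to write the
# figure above to disk at a fixed size, ggplot2's ggsave() is the usual
# route (file name and dimensions illustrative):
# ggsave("interaction_plot_3way.png", interaction_plot_3way,
#        width = 10, height = 4.5, dpi = 300)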
diff --git a/.history/eohi2/mixed anova - DGEN_20251006231533.r b/.history/eohi2/mixed anova - DGEN_20251006231533.r
deleted file mode 100644
index 4b83e72..0000000
--- a/.history/eohi2/mixed anova - DGEN_20251006231533.r
+++ /dev/null
@@ -1,899 +0,0 @@
-# Mixed ANOVA Analysis for DGEN Variables
-# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors
-# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val,
-#            DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val,
-#            DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val,
-#            DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val,
-#            X5_10DGEN_past_pref, X5_10DGEN_past_pers, X5_10DGEN_past_val,
-#            X5_10DGEN_fut_pref, X5_10DGEN_fut_pers, X5_10DGEN_fut_val
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex)        # For aov_ez (cleaner ANOVA output)
-library(nortest)     # For normality tests
-library(emmeans)     # For post-hoc comparisons
-library(purrr)       # For map functions
-library(effsize)     # For Cohen's d calculations
-library(effectsize)  # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-# Read the data
-data <- read.csv("eohi2.csv")
-
-# Verify the specific variables we need
-required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val",
-                   "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val",
-                   "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val",
-                   "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val",
-                   "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val",
-                   "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-}
-
-# Define variable mapping for the three within-subjects factors
-variable_mapping <- data.frame(
-  variable = required_vars,
-  TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3),
-           rep("Past", 3), rep("Future", 3)),
-  DOMAIN = rep(c("Preferences", "Personality", "Values"), 6),
-  INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3),
-               rep("5_10", 3), rep("5_10", 3)),
-  stringsAsFactors = FALSE
-)
-
-print("Variable mapping:")
-print(variable_mapping)
-
-# Efficient data pivoting using pivot_longer
-long_data <- data %>%
-  select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>%
-  pivot_longer(
-    cols = all_of(required_vars),
-    names_to = "variable",
-    values_to = "DGEN_SCORE"
-  ) %>%
-  left_join(variable_mapping, by = "variable") %>%
-  # Convert to factors with proper levels
-  mutate(
-    TIME = factor(TIME, levels = c("Past", "Future")),
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")),
-    INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")),
-    pID = as.factor(ResponseId),  # Use ResponseId as participant ID
-    temporal_DO = as.factor(TEMPORAL_DO),
-    interval_DO = as.factor(INTERVAL_DO)
-  ) %>%
-  # Select final columns and remove any rows with missing values
-  select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>%
-  filter(!is.na(DGEN_SCORE))
-
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5),
-    variance = round(var(DGEN_SCORE, na.rm = TRUE), 5),
-    sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5),
-    median = round(median(DGEN_SCORE, na.rm = TRUE), 5),
-    q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5),
-    min = round(min(DGEN_SCORE, na.rm = TRUE), 5),
-    max = round(max(DGEN_SCORE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:")
-print(desc_stats)
-
-# Descriptive statistics by between-subjects factors
-desc_stats_by_between <- long_data %>%
-  group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5),
-    variance = round(var(DGEN_SCORE, na.rm = TRUE), 5),
-    sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by between-subjects factors:")
-print(desc_stats_by_between)
-
-# Calculate mean differences for key comparisons
-print("\n=== KEY MEAN DIFFERENCES ===")
-
-# Past vs Future differences for each DOMAIN × INTERVAL combination
-past_future_diffs <- long_data %>%
-  group_by(DOMAIN, INTERVAL, pID) %>%
-  summarise(
-    past_score = DGEN_SCORE[TIME == "Past"],
-    future_score = DGEN_SCORE[TIME == "Future"],
-    difference = past_score - future_score,
-    .groups = 'drop'
-  ) %>%
-  group_by(DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    mean_diff = round(mean(difference, na.rm = TRUE), 5),
-    sd_diff = round(sd(difference, na.rm = TRUE), 5),
-    se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5),
-    .groups = 'drop'
-  )
-
-print("Past vs Future differences by DOMAIN × INTERVAL:")
-print(past_future_diffs)
-
-# 5 vs 10 interval differences for each TIME × DOMAIN combination
-interval_diffs <- long_data %>%
-  group_by(TIME, DOMAIN, pID) %>%
-  summarise(
-    interval_5_score = DGEN_SCORE[INTERVAL == "5"],
-    interval_10_score = DGEN_SCORE[INTERVAL == "10"],
-    difference = interval_5_score - interval_10_score,
-    .groups = 'drop'
-  ) %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean_diff = round(mean(difference, na.rm = TRUE), 5),
-    sd_diff = round(sd(difference, na.rm = TRUE), 5),
-    se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5),
-    .groups = 'drop'
-  )
-
-print("\n5 vs 10 interval differences by TIME × DOMAIN:")
-print(interval_diffs)
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing
-long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ]
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(DGEN_SCORE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME, DOMAIN, and INTERVAL:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    mean = mean(DGEN_SCORE),
-    sd = sd(DGEN_SCORE),
-    q1 = quantile(DGEN_SCORE, 0.25),
-    q3 = quantile(DGEN_SCORE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$DGEN_SCORE)$statistic,
-    ad_p_value = ad.test(.data$DGEN_SCORE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# Round only the numeric columns
-normality_results_rounded <- normality_results %>%
-  mutate(across(where(is.numeric), ~ round(.x, 5)))
-print(normality_results_rounded)
-
-# 4. Homogeneity of variance tests
-# Test homogeneity across TIME within each DOMAIN × INTERVAL combination
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN, INTERVAL) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME × INTERVAL combination
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME, INTERVAL) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:")
-print(homogeneity_domain)
-
-# Test homogeneity across INTERVAL within each TIME × DOMAIN combination
-homogeneity_interval <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1],
-    levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:")
-print(homogeneity_interval)
-
-# =============================================================================
-# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES
-# =============================================================================
-
-# More efficient bootstrap function for Hartley's F-max test
-bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) {
-  # Get unique groups and their sample sizes
-  groups <- unique(data[[group_var]])
-
-  # Calculate observed variances for each group
-  observed_vars <- data %>%
-    dplyr::group_by(!!rlang::sym(group_var)) %>%
-    dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>%
-    dplyr::pull(var)
-
-  # Handle invalid variances
-  if(any(observed_vars <= 0 | is.na(observed_vars))) {
-    observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10
-  }
-
-  # Calculate observed F-max ratio
-  observed_ratio <- max(observed_vars) / min(observed_vars)
-
-  # Pre-allocate storage for bootstrap ratios
-  bootstrap_ratios <- numeric(n_iter)
-
-  # Get group data once
-  group_data_list <- map(groups, ~ {
-    group_data <- data[data[[group_var]] == .x, response_var]
-    group_data[!is.na(group_data)]
-  })
-
-  # Bootstrap with pre-allocated storage
-  for(i in 1:n_iter) {
-    # Bootstrap sample from each group independently
-    sample_vars <- map_dbl(group_data_list, ~ {
-      bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE)
-      var(bootstrap_sample, na.rm = TRUE)
-    })
-    bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars)
-  }
-
-  # Remove invalid ratios
-  valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)]
-
-  if(length(valid_ratios) == 0) {
-    stop("No valid bootstrap ratios generated")
-  }
-
-  # Calculate critical value (95th percentile)
-  critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE)
-
-  # Return only essential information
-  return(list(
-    observed_ratio = observed_ratio,
-    critical_95 = critical_95,
-    n_valid_iterations = length(valid_ratios)
-  ))
-}
-
-# Hartley's F-max test across between-subjects factors within each within-subjects combination
-print("\n=== HARTLEY'S F-MAX TEST RESULTS ===")
-set.seed(123)  # For reproducibility
-
-# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination
-print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:")
-hartley_temporal_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant)
-
-print(hartley_temporal_results)
-
-# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination
-print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:")
-hartley_interval_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio),
-    critical_95 = map_dbl(hartley_result, ~ .x$critical_95),
-    significant = observed_ratio > critical_95
-  ) %>%
-  select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant)
-
-print(hartley_interval_results)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check if design is balanced
-design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL)
-if(all(design_balance %in% c(0, 1))) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10)
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = DGEN_SCORE,
-                             wid = pID,
-                             between = .(temporal_DO, interval_DO),
-                             within = .(TIME, DOMAIN, INTERVAL),
-                             type = 3,
-                             detailed = TRUE)
-
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL  # Reset row numbers to be sequential
-print(anova_output)
-
-print("Mauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print("Greenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  print("=== CORRECTED DEGREES OF FREEDOM ===")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  print("=== CORRECTED F-TESTS ===")
-  corrected_results <- data.frame(
-    Effect = corrected_df$Effect,
-    Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)],
-    Original_DFn = corrected_df$Original_DFn,
-    Original_DFd = corrected_df$Original_DFd,
-    GG_DFn = corrected_df$GG_DFn,
-    GG_DFd = corrected_df$GG_DFd,
-    HF_DFn = corrected_df$HF_DFn,
-    HF_DFd = corrected_df$HF_DFd,
-    GG_p = sphericity_corr$`p[GG]`,
-    HF_p = sphericity_corr$`p[HF]`
-  )
-  print(corrected_results)
-}
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Extract generalized eta squared from ezANOVA (already calculated)
-effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")]
-effect_sizes$ges <- round(effect_sizes$ges, 5)
-print(effect_sizes)
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-print("=== POST-HOC COMPARISONS ===")
-
-# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output)
-aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)),
-                 data = long_data_clean)
-
-# Main effects
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print(time_emmeans)
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-print("Main Effect of DOMAIN:")
-domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
-print(domain_emmeans)
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-print("Main Effect of INTERVAL:")
-interval_emmeans <- emmeans(aov_model, ~ INTERVAL)
-print(interval_emmeans)
-interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni")
-print(interval_contrasts)
-
-print("Main Effect of temporal_DO:")
-temporal_emmeans <- emmeans(aov_model, ~ temporal_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-print("Main Effect of interval_DO:")
-interval_do_emmeans <- emmeans(aov_model, ~ interval_DO)
-interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni")
-print(interval_do_contrasts)
-
-# =============================================================================
-# TWO-WAY INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TIME × DOMAIN Interaction
-print("=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN)
-print(time_domain_emmeans)
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(time_domain_simple2)
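# NOTE (editorial sketch, not part of the deleted script): emmeans also
# offers joint_tests(), which summarises an F test per model term from the
# same aov object and is a convenient cross-check on the ezANOVA table above:
print(joint_tests(aov_model))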
-
-# TIME × INTERVAL Interaction
-print("=== TIME × INTERVAL INTERACTION ===")
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL)
-print(time_interval_emmeans)
-time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_interval_simple)
-time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni")
-print(time_interval_simple2)
-
-# DOMAIN × INTERVAL Interaction
-print("=== DOMAIN × INTERVAL INTERACTION ===")
-domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL)
-print(domain_interval_emmeans)
-domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(domain_interval_simple)
-domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni")
-print(domain_interval_simple2)
-
-# Between-subjects interactions
-print("=== temporal_DO × interval_DO INTERACTION ===")
-temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO)
-print(temporal_interval_do_emmeans)
-temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni")
-print(temporal_interval_do_simple)
-temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni")
-print(temporal_interval_do_simple2)
-
-# =============================================================================
-# THREE-WAY INTERACTION ANALYSES
-# =============================================================================
-
-# TIME × DOMAIN × INTERVAL Interaction
-print("=== TIME × DOMAIN × INTERVAL INTERACTION ===")
-three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL)
-print(three_way_emmeans)
-three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni")
-print(three_way_contrasts)
-
-# Between-subjects × within-subjects interactions
-print("=== temporal_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME)
-print(temporal_time_emmeans)
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# temporal_DO × TIME × INTERVAL three-way interaction
-print("=== temporal_DO × TIME × INTERVAL INTERACTION ===")
-temporal_time_interval_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL)
-print(temporal_time_interval_emmeans)
-
-# Simple effects of TIME within each temporal_DO × INTERVAL combination
-temporal_time_interval_simple1 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "INTERVAL"), adjust = "bonferroni")
-print("Simple effects of TIME within each temporal_DO × INTERVAL:")
-print(temporal_time_interval_simple1)
-
-# Simple effects of temporal_DO within each TIME × INTERVAL combination
-temporal_time_interval_simple2 <- pairs(temporal_time_interval_emmeans, by = c("TIME", "INTERVAL"), adjust = "bonferroni")
-print("Simple effects of temporal_DO within each TIME × INTERVAL:")
-print(temporal_time_interval_simple2)
-
-# Simple effects of INTERVAL within each temporal_DO × TIME combination
-temporal_time_interval_simple3 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "TIME"), adjust = "bonferroni")
-print("Simple effects of INTERVAL within each temporal_DO × TIME:")
-print(temporal_time_interval_simple3)
-
-# =============================================================================
-# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS
-# =============================================================================
-
-print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===")
-
-# Function to calculate Cohen's d for pairwise comparisons
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if(nrow(significant_pairs) > 0) {
-    print(significant_pairs)
-
-    # Calculate Cohen's d for all significant pairs
-    cohens_d_results <- data.frame()
-
-    for(i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- as.character(comparison$contrast)
-
-      # Parse the contrast
-      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-      if(length(contrast_parts) == 2) {
-        level1 <- trimws(contrast_parts[1])
-        level2 <- trimws(contrast_parts[2])
-
-        # Get raw data for both conditions
-        if(!is.null(group2_var) && group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][
-            data[[group1_var]] == level1 &
-            data[[group2_var]] == group2_level]
-
-          data2 <- data[[response_var]][
-            data[[group1_var]] == level2 &
-            data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if(length(data1) > 0 && length(data2) > 0) {
-          # Calculate Cohen's d using effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          result_row <- data.frame(
-            Comparison = contrast_name,
-            n1 = length(data1),
-            n2 = length(data2),
-            Cohens_d = round(cohens_d_result$estimate, 5),
-            Effect_size = cohens_d_result$magnitude,
-            p_value = round(comparison$p.value, 5)
-          )
-
-          if(!is.null(group2_var) && group2_var %in% colnames(comparison)) {
-            result_row$Group_level <- group2_level
-          }
-
-          cohens_d_results <- rbind(cohens_d_results, result_row)
-        }
-      }
-    }
-
-    if(nrow(cohens_d_results) > 0) {
-      print(cohens_d_results)
-    }
-  } else {
-    cat("No significant pairwise comparisons found.\n")
-  }
-}
-
-# Calculate Cohen's d for main effects
-time_contrasts_df <- as.data.frame(time_contrasts)
-calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE")
-
-domain_contrasts_df <- as.data.frame(domain_contrasts)
-calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE")
-
-interval_contrasts_df <- as.data.frame(interval_contrasts)
-calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE")
-
-# Calculate Cohen's d for two-way interactions
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE")
-
-time_domain_simple2_df <- as.data.frame(time_domain_simple2)
-calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE")
-
-# Calculate Cohen's d for temporal_DO × TIME × INTERVAL interaction
-print("=== COHEN'S D FOR temporal_DO × TIME × INTERVAL INTERACTION ===")
-temporal_time_interval_simple1_df <- as.data.frame(temporal_time_interval_simple1)
-significant_temporal_time_interval <- temporal_time_interval_simple1_df[temporal_time_interval_simple1_df$p.value < 0.05, ]
-
-if(nrow(significant_temporal_time_interval) > 0) {
-  temporal_time_interval_cohens_d <- data.frame()
-
-  for(i in seq_len(nrow(significant_temporal_time_interval))) {
-    comparison <- significant_temporal_time_interval[i, ]
-
-    # Extract the grouping variables
-    temporal_level <- as.character(comparison$temporal_DO)
-    interval_level <- as.character(comparison$INTERVAL)
-
-    # Get data for Past and Future within this temporal_DO × INTERVAL combination
-    past_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$temporal_DO == temporal_level &
-      long_data_clean$INTERVAL == interval_level &
-      long_data_clean$TIME == "Past"
-    ]
-
-    future_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$temporal_DO == temporal_level &
-      long_data_clean$INTERVAL == interval_level &
-      long_data_clean$TIME == "Future"
-    ]
-
-    if(length(past_data) > 0 && length(future_data) > 0) {
-      # Calculate Cohen's d using effsize package
-      cohens_d_result <- cohen.d(past_data, future_data)
-
-      result_row <- data.frame(
-        temporal_DO = temporal_level,
-        INTERVAL = interval_level,
-        Comparison = "Past vs Future",
-        n_Past = length(past_data),
-        n_Future = length(future_data),
-        Cohens_d = round(cohens_d_result$estimate, 5),
-        Effect_size = cohens_d_result$magnitude,
-        p_value = round(comparison$p.value, 5),
-        Estimated_difference = round(comparison$estimate, 5)
-      )
-
-      temporal_time_interval_cohens_d <- rbind(temporal_time_interval_cohens_d, result_row)
-    }
-  }
-
-  if(nrow(temporal_time_interval_cohens_d) > 0) {
-    print(temporal_time_interval_cohens_d)
-  }
-} else {
-  cat("No significant TIME effects found within any temporal_DO × INTERVAL combination.\n")
-}
-
-# Calculate Cohen's d for three-way interaction
-three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ]
-
-if(nrow(significant_three_way) > 0) {
-  three_way_cohens_d <- data.frame()
-
-  for(i in seq_len(nrow(significant_three_way))) {
-    comparison <- significant_three_way[i, ]
-
-    # Extract the grouping variables
-    domain_level <- as.character(comparison$DOMAIN)
-    interval_level <- as.character(comparison$INTERVAL)
-
-    # Get data for Past and Future within this DOMAIN × INTERVAL combination
-    past_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$INTERVAL == interval_level &
-      long_data_clean$TIME == "Past"
-    ]
-
-    future_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$INTERVAL == interval_level &
-      long_data_clean$TIME == "Future"
-    ]
-
-    if(length(past_data) > 0 && length(future_data) > 0) {
-      # Calculate Cohen's d using effsize package
-      cohens_d_result <- cohen.d(past_data, future_data)
-
-      result_row <- data.frame(
-        DOMAIN = domain_level,
-        INTERVAL = interval_level,
-        Comparison = "Past vs Future",
-        n_Past = length(past_data),
-        n_Future = length(future_data),
-        Cohens_d = round(cohens_d_result$estimate, 5),
-        Effect_size = cohens_d_result$magnitude,
-        p_value = round(comparison$p.value, 5),
-        Estimated_difference = round(comparison$estimate, 5)
-      )
-
-      three_way_cohens_d <- rbind(three_way_cohens_d, result_row)
-    }
-  }
-
-  if(nrow(three_way_cohens_d) > 0) {
-    print(three_way_cohens_d)
-  }
-} else {
-  cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n")
-}
-
-# =============================================================================
-# INTERACTION PLOT: temporal_DO × TIME × DOMAIN (Emmeans only)
-# =============================================================================
-
-print("=== INTERACTION PLOT ===")
-
-# Define color palette for TIME
-time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F")
-
-# Create emmeans for temporal_DO × TIME × DOMAIN
-emm_3way_plot
<- emmeans(aov_model, ~ temporal_DO * TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_3way_plot <- emm_3way_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~DOMAIN, nrow = 1) + - labs( - x = "Order", - y = "Absolute difference from the present", - title = "temporal_DO × TIME × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) - -# ============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (temporal_DO on x-axis, TIME in legend, INTERVAL as facets) -# ============================================================================= - -# Create emmeans for temporal_DO × TIME × INTERVAL -emm_temporal_time_interval_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) - -# Prepare emmeans data frame for the plot -emmeans_temporal_time_interval_plot <- emm_temporal_time_interval_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create temporal_DO × TIME × INTERVAL interaction plot with facets -interaction_plot_temporal_time_interval <- ggplot(emmeans_temporal_time_interval_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = 
"Order", - y = "Absolute difference from the present", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_temporal_time_interval) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251006231538.r b/.history/eohi2/mixed anova - DGEN_20251006231538.r deleted file mode 100644 index 4b83e72..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251006231538.r +++ /dev/null @@ -1,899 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, -# DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, -# DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val, -# X5_10DGEN_past_pref, X5_10DGEN_past_pers, X5_10DGEN_past_val, -# X5_10DGEN_fut_pref, X5_10DGEN_fut_pers, X5_10DGEN_fut_val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable 
mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / 
sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - 
dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL 
combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO 
* TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") 
-print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# temporal_DO × TIME × INTERVAL three-way interaction -print("=== temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) -print(temporal_time_interval_emmeans) - -# Simple effects of TIME within each temporal_DO × INTERVAL combination -temporal_time_interval_simple1 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of TIME within each temporal_DO × INTERVAL:") -print(temporal_time_interval_simple1) - -# Simple effects of temporal_DO within each TIME × INTERVAL combination -temporal_time_interval_simple2 <- pairs(temporal_time_interval_emmeans, by = c("TIME", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of temporal_DO within each TIME × INTERVAL:") -print(temporal_time_interval_simple2) - -# Simple effects of INTERVAL within each temporal_DO × TIME combination -temporal_time_interval_simple3 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "TIME"), adjust = "bonferroni") -print("Simple effects of INTERVAL within each temporal_DO × TIME:") -print(temporal_time_interval_simple3) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && 
group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for temporal_DO × TIME × INTERVAL interaction -print("=== COHEN'S D FOR temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_simple1_df <- as.data.frame(temporal_time_interval_simple1) -significant_temporal_time_interval <- temporal_time_interval_simple1_df[temporal_time_interval_simple1_df$p.value < 0.05, ] - -if(nrow(significant_temporal_time_interval) > 0) { - temporal_time_interval_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_temporal_time_interval))) { - comparison <- significant_temporal_time_interval[i, ] - - # Extract the grouping variables - temporal_level <- as.character(comparison$temporal_DO) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this temporal_DO × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - temporal_DO = temporal_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - temporal_time_interval_cohens_d <- rbind(temporal_time_interval_cohens_d, result_row) - } - } - - if(nrow(temporal_time_interval_cohens_d) > 0) { - print(temporal_time_interval_cohens_d) - } -} else { - cat("No significant TIME effects found within any temporal_DO × INTERVAL combination.\n") -} - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- 
significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# ============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × DOMAIN (Emmeans only) -# ============================================================================= - -print("=== INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for temporal_DO × TIME × DOMAIN -emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_3way_plot <- emm_3way_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~DOMAIN, nrow = 1) + - labs( - x = "Order", - y = "Absolute difference from the present", - title = "temporal_DO × TIME × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 
11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5),
-    strip.text = element_text(size = 11, face = "bold")
-  )
-
-print(interaction_plot_3way)
-
-# =============================================================================
-# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (temporal_DO on x-axis, TIME in legend, INTERVAL as facets)
-# =============================================================================
-
-# Create emmeans for temporal_DO × TIME × INTERVAL
-emm_temporal_time_interval_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL)
-
-# Prepare emmeans data frame for the plot
-emmeans_temporal_time_interval_plot <- emm_temporal_time_interval_plot %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")),
-    TIME = factor(TIME, levels = c("Past", "Future")),
-    INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")),
-    x_pos = as.numeric(temporal_DO),
-    time_offset = (as.numeric(TIME) - 1.5) * 0.2,
-    x_dodged = x_pos + time_offset
-  )
-
-# Create temporal_DO × TIME × INTERVAL interaction plot with facets
-interaction_plot_temporal_time_interval <- ggplot(emmeans_temporal_time_interval_plot) +
-  geom_errorbar(
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME),
-    width = 0.1,
-    linewidth = 1,
-    alpha = 0.8
-  ) +
-  geom_point(
-    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
-    size = 5,
-    stroke = 1.2,
-    color = "black"
-  ) +
-  facet_wrap(~INTERVAL, nrow = 1) +
-  labs(
-    x = "Order",
-    y = "Absolute difference from the present",
-    title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2),
-    labels = c("Past First", "Future First"),
-    limits = c(0.5, 2.5)
-  ) +
-  scale_color_manual(name = "Temporal Direction", values = time_colors) +
-  scale_fill_manual(name = "Temporal Direction", values = time_colors) +
-  scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5),
-    strip.text = element_text(size = 11, face = "bold")
-  )
-
-print(interaction_plot_temporal_time_interval)
\ No newline at end of file
diff --git a/.history/eohi2/mixed anova - DGEN_20251006232531.r b/.history/eohi2/mixed anova - DGEN_20251006232531.r
deleted file mode 100644
index 26bd1f4..0000000
--- a/.history/eohi2/mixed anova - DGEN_20251006232531.r
+++ /dev/null
@@ -1,1035 +0,0 @@
-# Mixed ANOVA Analysis for DGEN Variables
-# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors
-# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val,
-#   DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val,
-#   DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val,
-#   DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val,
-#   X5_10DGEN_past_pref, X5_10DGEN_past_pers, X5_10DGEN_past_val,
-#
X5_10DGEN_fut_pref, X5_10DGEN_fut_pers, X5_10DGEN_fut_val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics 
by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, 
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") 
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Extract generalized eta squared from ezANOVA (already calculated)
-effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")]
-effect_sizes$ges <- round(effect_sizes$ges, 5)
-print(effect_sizes)
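# The effectsize package is attached at the top of this script but never
# called; it can serve as an independent check on ezANOVA's ges column.
# A sketch, commented out because aov_model is only created in the post-hoc
# section below and aovlist support should be verified for the installed
# effectsize version:
# ges_check <- effectsize::eta_squared(aov_model, generalized = TRUE)
# print(ges_check)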
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-print("=== POST-HOC COMPARISONS ===")
-
-# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output)
-aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)),
-                 data = long_data_clean)
-
-# Main effects
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print(time_emmeans)
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-print("Main Effect of DOMAIN:")
-domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
-print(domain_emmeans)
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-print("Main Effect of INTERVAL:")
-interval_emmeans <- emmeans(aov_model, ~ INTERVAL)
-print(interval_emmeans)
-interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni")
-print(interval_contrasts)
-
-print("Main Effect of temporal_DO:")
-temporal_emmeans <- emmeans(aov_model, ~ temporal_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-print("Main Effect of interval_DO:")
-interval_do_emmeans <- emmeans(aov_model, ~ interval_DO)
-interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni")
-print(interval_do_contrasts)
-
-# =============================================================================
-# TWO-WAY INTERACTION EXPLORATIONS
-# =============================================================================
-
-# TIME × DOMAIN Interaction
-print("=== TIME × DOMAIN INTERACTION ===")
-time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN)
-print(time_domain_emmeans)
-time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_domain_simple)
-time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(time_domain_simple2)
-
-# TIME × INTERVAL Interaction
-print("=== TIME × INTERVAL INTERACTION ===")
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL)
-print(time_interval_emmeans)
-time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni")
-print(time_interval_simple)
-time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni")
-print(time_interval_simple2)
-
-# DOMAIN × INTERVAL Interaction
-print("=== DOMAIN × INTERVAL INTERACTION ===")
-domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL)
-print(domain_interval_emmeans)
-domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni")
-print(domain_interval_simple)
-domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni")
-print(domain_interval_simple2)
-
-# Between-subjects interactions
-print("=== temporal_DO × interval_DO INTERACTION ===")
-temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO)
-print(temporal_interval_do_emmeans)
-temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni")
-print(temporal_interval_do_simple)
-temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni")
-print(temporal_interval_do_simple2)
-
-# =============================================================================
-# THREE-WAY INTERACTION ANALYSES
-# =============================================================================
-
-# TIME × DOMAIN × INTERVAL Interaction
-print("=== TIME × DOMAIN × INTERVAL INTERACTION ===")
-three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL)
-print(three_way_emmeans)
-three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni")
-print(three_way_contrasts)
-
-# Between-subjects × within-subjects interactions
-print("=== temporal_DO × TIME INTERACTION ===")
-temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME)
-print(temporal_time_emmeans)
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni")
-print(temporal_time_simple)
-temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni")
-print(temporal_time_simple2)
-
-# temporal_DO × TIME × INTERVAL three-way interaction
-print("=== temporal_DO × TIME × INTERVAL INTERACTION ===")
-temporal_time_interval_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL)
-print(temporal_time_interval_emmeans)
-
-# Simple effects of TIME within each temporal_DO × INTERVAL combination
-temporal_time_interval_simple1 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "INTERVAL"), adjust = "bonferroni")
-print("Simple effects of TIME within each temporal_DO × INTERVAL:")
-print(temporal_time_interval_simple1)
-
-# Simple effects of temporal_DO within each TIME × INTERVAL combination
-temporal_time_interval_simple2 <- pairs(temporal_time_interval_emmeans, by = c("TIME", "INTERVAL"), adjust = "bonferroni")
-print("Simple effects of temporal_DO within each TIME × INTERVAL:")
-print(temporal_time_interval_simple2)
-
-# Simple effects of INTERVAL within each temporal_DO × TIME combination
-temporal_time_interval_simple3 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "TIME"), adjust = "bonferroni")
-print("Simple effects of INTERVAL within each temporal_DO × TIME:")
-print(temporal_time_interval_simple3)
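# Note on multiplicity: each pairs() call above applies Bonferroni within its
# own family, which is conservative across this many simple-effects families.
# Holm's step-down correction is never less powerful and is a drop-in
# replacement in emmeans, e.g. for the TIME simple effects:
pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "INTERVAL"), adjust = "holm")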
-
-# =============================================================================
-# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS
-# =============================================================================
-
-print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===")
-
-# Function to calculate Cohen's d for pairwise comparisons
-calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) {
-  significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ]
-
-  if(nrow(significant_pairs) > 0) {
-    print(significant_pairs)
-
-    # Calculate Cohen's d for all significant pairs
-    cohens_d_results <- data.frame()
-
-    for(i in seq_len(nrow(significant_pairs))) {
-      comparison <- significant_pairs[i, ]
-      contrast_name <- as.character(comparison$contrast)
-
-      # Parse the contrast
-      contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-      if(length(contrast_parts) == 2) {
-        level1 <- trimws(contrast_parts[1])
-        level2 <- trimws(contrast_parts[2])
-
-        # Get raw data for both conditions
-        if(!is.null(group2_var) && group2_var %in% colnames(comparison)) {
-          group2_level <- as.character(comparison[[group2_var]])
-          data1 <- data[[response_var]][
-            data[[group1_var]] == level1 &
-            data[[group2_var]] == group2_level]
-
-          data2 <- data[[response_var]][
-            data[[group1_var]] == level2 &
-            data[[group2_var]] == group2_level]
-        } else {
-          data1 <- data[[response_var]][data[[group1_var]] == level1]
-          data2 <- data[[response_var]][data[[group1_var]] == level2]
-        }
-
-        if(length(data1) > 0 && length(data2) > 0) {
-          # Calculate Cohen's d using effsize package
-          cohens_d_result <- cohen.d(data1, data2)
-
-          result_row <- data.frame(
-            Comparison = contrast_name,
-            n1 = length(data1),
-            n2 = length(data2),
-            Cohens_d = round(cohens_d_result$estimate, 5),
-            Effect_size = cohens_d_result$magnitude,
-            p_value = round(comparison$p.value, 5)
-          )
-
-          if(!is.null(group2_var) && group2_var %in% colnames(comparison)) {
-            result_row$Group_level <- group2_level
-          }
-
-          cohens_d_results <- rbind(cohens_d_results, result_row)
-        }
-      }
-    }
-
-    if(nrow(cohens_d_results) > 0) {
-      print(cohens_d_results)
-    }
-  } else {
-    cat("No significant pairwise comparisons found.\n")
-  }
-}
-
-# Calculate Cohen's d for main effects
-time_contrasts_df <- as.data.frame(time_contrasts)
-calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE")
-
-domain_contrasts_df <- as.data.frame(domain_contrasts)
-calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE")
-
-interval_contrasts_df <- as.data.frame(interval_contrasts)
-calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE")
-
-# Calculate Cohen's d for two-way interactions
-time_domain_simple_df <- as.data.frame(time_domain_simple)
-calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE")
-
-time_domain_simple2_df <- as.data.frame(time_domain_simple2)
-calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE")
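# Caveat: cohen.d(data1, data2) in the helper above treats the two cells as
# independent samples, although TIME, DOMAIN and INTERVAL are within-subject
# factors. A paired d on participant-matched means is one alternative
# (sketch; paired_wide is an illustrative name, not part of the analysis):
paired_wide <- long_data_clean %>%
  group_by(pID, TIME) %>%
  summarise(m = mean(DGEN_SCORE), .groups = "drop") %>%
  tidyr::pivot_wider(names_from = TIME, values_from = m)
cohen.d(paired_wide$Past, paired_wide$Future, paired = TRUE)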
-
-# Calculate Cohen's d for temporal_DO × TIME × INTERVAL interaction
-print("=== COHEN'S D FOR temporal_DO × TIME × INTERVAL INTERACTION ===")
-temporal_time_interval_simple1_df <- as.data.frame(temporal_time_interval_simple1)
-significant_temporal_time_interval <- temporal_time_interval_simple1_df[temporal_time_interval_simple1_df$p.value < 0.05, ]
-
-if(nrow(significant_temporal_time_interval) > 0) {
-  temporal_time_interval_cohens_d <- data.frame()
-
-  for(i in seq_len(nrow(significant_temporal_time_interval))) {
-    comparison <- significant_temporal_time_interval[i, ]
-
-    # Extract the grouping variables
-    temporal_level <- as.character(comparison$temporal_DO)
-    interval_level <- as.character(comparison$INTERVAL)
-
-    # Get data for Past and Future within this temporal_DO × INTERVAL combination
-    past_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$temporal_DO == temporal_level &
-      long_data_clean$INTERVAL == interval_level &
-      long_data_clean$TIME == "Past"
-    ]
-
-    future_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$temporal_DO == temporal_level &
-      long_data_clean$INTERVAL == interval_level &
-      long_data_clean$TIME == "Future"
-    ]
-
-    if(length(past_data) > 0 && length(future_data) > 0) {
-      # Calculate Cohen's d using effsize package
-      cohens_d_result <- cohen.d(past_data, future_data)
-
-      result_row <- data.frame(
-        temporal_DO = temporal_level,
-        INTERVAL = interval_level,
-        Comparison = "Past vs Future",
-        n_Past = length(past_data),
-        n_Future = length(future_data),
-        Cohens_d = round(cohens_d_result$estimate, 5),
-        Effect_size = cohens_d_result$magnitude,
-        p_value = round(comparison$p.value, 5),
-        Estimated_difference = round(comparison$estimate, 5)
-      )
-
-      temporal_time_interval_cohens_d <- rbind(temporal_time_interval_cohens_d, result_row)
-    }
-  }
-
-  if(nrow(temporal_time_interval_cohens_d) > 0) {
-    print(temporal_time_interval_cohens_d)
-  }
-} else {
-  cat("No significant TIME effects found within any temporal_DO × INTERVAL combination.\n")
-}
-
-# Calculate Cohen's d for three-way interaction
-three_way_contrasts_df <- as.data.frame(three_way_contrasts)
-significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ]
-
-if(nrow(significant_three_way) > 0) {
-  three_way_cohens_d <- data.frame()
-
-  for(i in seq_len(nrow(significant_three_way))) {
-    comparison <- significant_three_way[i, ]
-
-    # Extract the grouping variables
-    domain_level <- as.character(comparison$DOMAIN)
-    interval_level <- as.character(comparison$INTERVAL)
-
-    # Get data for Past and Future within this DOMAIN × INTERVAL combination
-    past_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$INTERVAL == interval_level &
-      long_data_clean$TIME == "Past"
-    ]
-
-    future_data <- long_data_clean$DGEN_SCORE[
-      long_data_clean$DOMAIN == domain_level &
-      long_data_clean$INTERVAL == interval_level &
-      long_data_clean$TIME == "Future"
-    ]
-
-    if(length(past_data) > 0 && length(future_data) > 0) {
-      # Calculate Cohen's d using effsize package
-      cohens_d_result <- cohen.d(past_data, future_data)
-
-      result_row <- data.frame(
-        DOMAIN = domain_level,
-        INTERVAL = interval_level,
-        Comparison = "Past vs Future",
-        n_Past = length(past_data),
-        n_Future = length(future_data),
-        Cohens_d = round(cohens_d_result$estimate, 5),
-        Effect_size = cohens_d_result$magnitude,
-        p_value = round(comparison$p.value, 5),
-        Estimated_difference = round(comparison$estimate, 5)
-      )
-
-      three_way_cohens_d <- rbind(three_way_cohens_d, result_row)
-    }
-  }
-
-  if(nrow(three_way_cohens_d) > 0) {
-    print(three_way_cohens_d)
-  }
-} else {
-  cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n")
-}
-
-# =============================================================================
-# INTERACTION PLOT: temporal_DO × TIME × DOMAIN (Emmeans only)
-# =============================================================================
-
-print("=== INTERACTION PLOT ===")
-
-# Define color palette for TIME
-time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F")
-
-# Create emmeans for temporal_DO × TIME × DOMAIN
-emm_3way_plot <- emmeans(aov_model, ~ temporal_DO * TIME * DOMAIN)
-
-# Prepare emmeans data frame
-emmeans_3way_plot <- emm_3way_plot %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")),
-    TIME = factor(TIME, levels = c("Past", "Future")),
-    DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")),
-    x_pos = as.numeric(temporal_DO),
-    time_offset = (as.numeric(TIME) - 1.5) * 0.2,
-    x_dodged = x_pos + time_offset
-  )
-
-# Create 3-way interaction plot with facets
-interaction_plot_3way <- ggplot(emmeans_3way_plot) +
-  geom_errorbar(
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME),
-    width = 0.1,
-    linewidth = 1,
-    alpha = 0.8
-  ) +
-  geom_point(
-    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
-    size = 5,
-    stroke = 1.2,
-    color = "black"
-  ) +
-  facet_wrap(~DOMAIN, nrow = 1) +
-  labs(
-    x = "Order",
-    y = "Absolute difference from the present",
-    title = "temporal_DO × TIME × DOMAIN Interaction (Estimated Marginal Means)"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2),
-    labels = c("Past First", "Future First"),
-    limits = c(0.5, 2.5)
-  ) +
-  scale_color_manual(name = "Temporal Direction", values = time_colors) +
-  scale_fill_manual(name = "Temporal Direction", values = time_colors) +
-  scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5),
-    strip.text = element_text(size = 11, face = "bold")
-  )
-
-print(interaction_plot_3way)
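# To export the figure at a reproducible size instead of relying on the
# device window (sketch; the file name and dimensions are arbitrary choices):
ggsave("interaction_plot_3way.png", interaction_plot_3way,
       width = 10, height = 4.5, dpi = 300)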
"Order", - y = "Absolute difference from the present", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_temporal_time_interval) - -# ============================================================================= -# TWO-WAY INTERACTION PLOTS -# ============================================================================= - -# ============================================================================= -# PLOT 1: TIME × INTERVAL INTERACTION (INTERVAL on x-axis) -# ============================================================================= - -# Create emmeans for TIME × INTERVAL -emm_time_interval_plot <- emmeans(aov_model, ~ TIME * INTERVAL) - -# Prepare emmeans data frame for TIME × INTERVAL plot -emmeans_time_interval_plot <- emm_time_interval_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - x_pos = as.numeric(INTERVAL), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create TIME × INTERVAL interaction plot -interaction_plot_time_interval <- ggplot(emmeans_time_interval_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Interval", - y = "Absolute difference from the present", - title = "TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2, 3), - labels = c("5", "10", "5_10"), - limits = c(0.5, 3.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_interval) - -# ============================================================================= -# PLOT 2: TIME × temporal_DO INTERACTION 
-
-# =============================================================================
-# PLOT 2: TIME × temporal_DO INTERACTION (temporal_DO on x-axis)
-# =============================================================================
-
-# Create emmeans for TIME × temporal_DO
-emm_time_temporal_plot <- emmeans(aov_model, ~ TIME * temporal_DO)
-
-# Prepare emmeans data frame for TIME × temporal_DO plot
-emmeans_time_temporal_plot <- emm_time_temporal_plot %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    TIME = factor(TIME, levels = c("Past", "Future")),
-    temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")),
-    x_pos = as.numeric(temporal_DO),
-    time_offset = (as.numeric(TIME) - 1.5) * 0.2,
-    x_dodged = x_pos + time_offset
-  )
-
-# Create TIME × temporal_DO interaction plot
-interaction_plot_time_temporal <- ggplot(emmeans_time_temporal_plot) +
-  geom_errorbar(
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME),
-    width = 0.1,
-    linewidth = 1,
-    alpha = 0.8
-  ) +
-  geom_point(
-    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
-    size = 5,
-    stroke = 1.2,
-    color = "black"
-  ) +
-  labs(
-    x = "Order",
-    y = "Absolute difference from the present",
-    title = "TIME × temporal_DO Interaction (Estimated Marginal Means)"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2),
-    labels = c("Past First", "Future First"),
-    limits = c(0.5, 2.5)
-  ) +
-  scale_color_manual(name = "Temporal Direction", values = time_colors) +
-  scale_fill_manual(name = "Temporal Direction", values = time_colors) +
-  scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
-  )
-
-print(interaction_plot_time_temporal)
\ No newline at end of file
diff --git a/.history/eohi2/mixed anova - DGEN_20251006232538.r b/.history/eohi2/mixed anova - DGEN_20251006232538.r
deleted file mode 100644
index 26bd1f4..0000000
--- a/.history/eohi2/mixed anova - DGEN_20251006232538.r
+++ /dev/null
diff --git a/.history/eohi2/mixed anova - DGEN_20251006232540.r b/.history/eohi2/mixed anova - DGEN_20251006232540.r
deleted file mode 100644
index 26bd1f4..0000000
--- a/.history/eohi2/mixed anova - DGEN_20251006232540.r
+++ /dev/null
@@ -1,1035 +0,0 @@
Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - 
group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, 
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") 
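# The pairs(..., by = ...) calls below test simple effects within each level
# of the conditioning factor, with the Bonferroni adjustment applied
# separately inside each slice. A minimal equivalent sketch using emmeans'
# conditional-formula interface (assuming the aov_model defined above; the
# object name simple_ti is illustrative, not part of the original analysis):
simple_ti <- emmeans(aov_model, pairwise ~ INTERVAL | TIME, adjust = "bonferroni")
print(simple_ti$contrasts)  # pairwise INTERVAL contrasts within each TIME level
# joint_tests() gives the corresponding omnibus F-test for each slice:
print(joint_tests(aov_model, by = "TIME"))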
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# temporal_DO × TIME × INTERVAL three-way interaction -print("=== temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) -print(temporal_time_interval_emmeans) - -# Simple effects of TIME within each temporal_DO × INTERVAL combination -temporal_time_interval_simple1 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of TIME within each temporal_DO × INTERVAL:") -print(temporal_time_interval_simple1) - -# Simple effects of temporal_DO within each TIME × INTERVAL combination -temporal_time_interval_simple2 <- pairs(temporal_time_interval_emmeans, by = c("TIME", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of temporal_DO within each TIME × INTERVAL:") -print(temporal_time_interval_simple2) - -# Simple effects of INTERVAL within each temporal_DO × TIME combination -temporal_time_interval_simple3 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "TIME"), adjust = "bonferroni") -print("Simple effects of INTERVAL within each temporal_DO × TIME:") -print(temporal_time_interval_simple3) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT 
EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for temporal_DO × TIME × INTERVAL interaction -print("=== COHEN'S D FOR temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_simple1_df <- as.data.frame(temporal_time_interval_simple1) -significant_temporal_time_interval <- temporal_time_interval_simple1_df[temporal_time_interval_simple1_df$p.value < 0.05, ] - -if(nrow(significant_temporal_time_interval) > 0) { - temporal_time_interval_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_temporal_time_interval))) { - comparison <- significant_temporal_time_interval[i, ] - - # Extract the 
grouping variables - temporal_level <- as.character(comparison$temporal_DO) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this temporal_DO × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - temporal_DO = temporal_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - temporal_time_interval_cohens_d <- rbind(temporal_time_interval_cohens_d, result_row) - } - } - - if(nrow(temporal_time_interval_cohens_d) > 0) { - print(temporal_time_interval_cohens_d) - } -} else { - cat("No significant TIME effects found within any temporal_DO × INTERVAL combination.\n") -} - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# ============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × DOMAIN (Emmeans only) -# ============================================================================= - -print("=== INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for temporal_DO × TIME × DOMAIN -emm_3way_plot 
<- emmeans(aov_model, ~ temporal_DO * TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_3way_plot <- emm_3way_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~DOMAIN, nrow = 1) + - labs( - x = "Order", - y = "Absolute difference from the present", - title = "temporal_DO × TIME × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) - -# ============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (temporal_DO on x-axis, TIME in legend, INTERVAL as facets) -# ============================================================================= - -# Create emmeans for temporal_DO × TIME × INTERVAL -emm_temporal_time_interval_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) - -# Prepare emmeans data frame for the plot -emmeans_temporal_time_interval_plot <- emm_temporal_time_interval_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create temporal_DO × TIME × INTERVAL interaction plot with facets -interaction_plot_temporal_time_interval <- ggplot(emmeans_temporal_time_interval_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = 
"Order", - y = "Absolute difference from the present", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_temporal_time_interval) - -# ============================================================================= -# TWO-WAY INTERACTION PLOTS -# ============================================================================= - -# ============================================================================= -# PLOT 1: TIME × INTERVAL INTERACTION (INTERVAL on x-axis) -# ============================================================================= - -# Create emmeans for TIME × INTERVAL -emm_time_interval_plot <- emmeans(aov_model, ~ TIME * INTERVAL) - -# Prepare emmeans data frame for TIME × INTERVAL plot -emmeans_time_interval_plot <- emm_time_interval_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - x_pos = as.numeric(INTERVAL), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create TIME × INTERVAL interaction plot -interaction_plot_time_interval <- ggplot(emmeans_time_interval_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Interval", - y = "Absolute difference from the present", - title = "TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2, 3), - labels = c("5", "10", "5_10"), - limits = c(0.5, 3.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_interval) - -# ============================================================================= -# PLOT 2: TIME × temporal_DO INTERACTION 
(temporal_DO on x-axis) -# ============================================================================= - -# Create emmeans for TIME × temporal_DO -emm_time_temporal_plot <- emmeans(aov_model, ~ TIME * temporal_DO) - -# Prepare emmeans data frame for TIME × temporal_DO plot -emmeans_time_temporal_plot <- emm_time_temporal_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create TIME × temporal_DO interaction plot -interaction_plot_time_temporal <- ggplot(emmeans_time_temporal_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Absolute difference from the present", - title = "TIME × temporal_DO Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_temporal) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251007103206.r b/.history/eohi2/mixed anova - DGEN_20251007103206.r deleted file mode 100644 index 6d3ddcd..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251007103206.r +++ /dev/null @@ -1,1093 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, -# DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, -# DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val, -# X5_10DGEN_past_pref, X5_10DGEN_past_pers, X5_10DGEN_past_val, -# X5_10DGEN_fut_pref, X5_10DGEN_fut_pers, X5_10DGEN_fut_val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") -cat("anova - dgen", getwd(), "\n") - -# Read the data 
-data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination 
-past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, 
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") 
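# Complementary sketch (assuming the aov_model defined above; emm_ti_grid is
# an illustrative name): an interaction contrast tests the difference of
# differences directly, e.g. whether the Past - Future gap itself changes
# across INTERVAL levels, rather than slicing the grid into simple effects
# as the pairs(..., by = ...) calls below do.
emm_ti_grid <- emmeans(aov_model, ~ TIME * INTERVAL)
print(contrast(emm_ti_grid, interaction = c("pairwise", "pairwise"), adjust = "bonferroni"))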
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# temporal_DO × TIME × INTERVAL three-way interaction -print("=== temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) -print(temporal_time_interval_emmeans) - -# Simple effects of TIME within each temporal_DO × INTERVAL combination -temporal_time_interval_simple1 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of TIME within each temporal_DO × INTERVAL:") -print(temporal_time_interval_simple1) - -# Simple effects of temporal_DO within each TIME × INTERVAL combination -temporal_time_interval_simple2 <- pairs(temporal_time_interval_emmeans, by = c("TIME", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of temporal_DO within each TIME × INTERVAL:") -print(temporal_time_interval_simple2) - -# Simple effects of INTERVAL within each temporal_DO × TIME combination -temporal_time_interval_simple3 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "TIME"), adjust = "bonferroni") -print("Simple effects of INTERVAL within each temporal_DO × TIME:") -print(temporal_time_interval_simple3) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT 
EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for temporal_DO × TIME × INTERVAL interaction -print("=== COHEN'S D FOR temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_simple1_df <- as.data.frame(temporal_time_interval_simple1) -significant_temporal_time_interval <- temporal_time_interval_simple1_df[temporal_time_interval_simple1_df$p.value < 0.05, ] - -if(nrow(significant_temporal_time_interval) > 0) { - temporal_time_interval_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_temporal_time_interval))) { - comparison <- significant_temporal_time_interval[i, ] - - # Extract the 
grouping variables - temporal_level <- as.character(comparison$temporal_DO) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this temporal_DO × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - temporal_DO = temporal_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - temporal_time_interval_cohens_d <- rbind(temporal_time_interval_cohens_d, result_row) - } - } - - if(nrow(temporal_time_interval_cohens_d) > 0) { - print(temporal_time_interval_cohens_d) - } -} else { - cat("No significant TIME effects found within any temporal_DO × INTERVAL combination.\n") -} - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# ============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × DOMAIN (Emmeans only) -# ============================================================================= - -print("=== INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for temporal_DO × TIME × DOMAIN -emm_3way_plot 
<- emmeans(aov_model, ~ temporal_DO * TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_3way_plot <- emm_3way_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~DOMAIN, nrow = 1) + - labs( - x = "Order", - y = "Absolute difference from the present", - title = "temporal_DO × TIME × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) - -# ============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (temporal_DO on x-axis, TIME in legend, INTERVAL as facets) -# ============================================================================= - -# Create emmeans for temporal_DO × TIME × INTERVAL -emm_temporal_time_interval_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) - -# Prepare emmeans data frame for the plot -emmeans_temporal_time_interval_plot <- emm_temporal_time_interval_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create temporal_DO × TIME × INTERVAL interaction plot with facets -interaction_plot_temporal_time_interval <- ggplot(emmeans_temporal_time_interval_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = 
"Order", - y = "Absolute difference from the present", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_temporal_time_interval) - -# ============================================================================= -# TWO-WAY INTERACTION PLOTS -# ============================================================================= - -# ============================================================================= -# PLOT 1: TIME × INTERVAL INTERACTION (INTERVAL on x-axis) -# ============================================================================= - -# Create emmeans for TIME × INTERVAL -emm_time_interval_plot <- emmeans(aov_model, ~ TIME * INTERVAL) - -# Prepare emmeans data frame for TIME × INTERVAL plot -emmeans_time_interval_plot <- emm_time_interval_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - x_pos = as.numeric(INTERVAL), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create TIME × INTERVAL interaction plot -interaction_plot_time_interval <- ggplot(emmeans_time_interval_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Interval", - y = "Absolute difference from the present", - title = "TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2, 3), - labels = c("5", "10", "5_10"), - limits = c(0.5, 3.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_interval) - -# ============================================================================= -# PLOT 2: TIME × temporal_DO INTERACTION 
(temporal_DO on x-axis) -# ============================================================================= - -# Create emmeans for TIME × temporal_DO -emm_time_temporal_plot <- emmeans(aov_model, ~ TIME * temporal_DO) - -# Prepare emmeans data frame for TIME × temporal_DO plot -emmeans_time_temporal_plot <- emm_time_temporal_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create TIME × temporal_DO interaction plot -interaction_plot_time_temporal <- ggplot(emmeans_time_temporal_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Absolute difference from the present", - title = "TIME × temporal_DO Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_temporal) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Estimated Marginal Means only) -# ============================================================================= - -# Create emmeans for TIME main effect -emm_time_main <- emmeans(aov_model, ~ TIME) - -# Prepare emmeans data frame -emmeans_time_main <- emm_time_main %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Plot TIME main effect (only emmeans + error bars) -plot_time_main_effect <- ggplot(emmeans_time_main) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.9 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Temporal Direction", - y = "Absolute difference from the present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - 
axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(plot_time_main_effect) \ No newline at end of file
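A minimal afex sketch, as an aside: the deleted snapshots load afex "for aov_ez" yet fit the model through ez::ezANOVA and then rebuild the sphericity-corrected tests by hand. Assuming the long_data_clean layout and the factor names used in these scripts, the same five-factor mixed ANOVA, with the Greenhouse-Geisser correction and generalized eta squared applied automatically, would look roughly like this:

# Sketch only -- object and column names (long_data_clean, pID, DGEN_SCORE,
# temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) are taken from the script
# text; this has not been run against the actual data.
library(afex)
fit_afex <- aov_ez(
  id = "pID", dv = "DGEN_SCORE", data = long_data_clean,
  between = c("temporal_DO", "interval_DO"),
  within = c("TIME", "DOMAIN", "INTERVAL"),
  anova_table = list(correction = "GG", es = "ges")
)
print(fit_afex) # one table: GG-corrected tests plus generalized eta squared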
diff --git a/.history/eohi2/mixed anova - DGEN_20251007103213.r b/.history/eohi2/mixed anova - DGEN_20251007103213.r deleted file mode 100644 index 6d3ddcd..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251007103213.r +++ /dev/null @@ -1,1093 +0,0 @@
diff --git a/.history/eohi2/mixed anova - DGEN_20251007103739.r b/.history/eohi2/mixed anova - DGEN_20251007103739.r deleted file mode 100644 index 6d3ddcd..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251007103739.r +++ /dev/null @@ -1,1093 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, -# DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, -# DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val, -# X5_10DGEN_past_pref, X5_10DGEN_past_pers, X5_10DGEN_past_val, -# X5_10DGEN_fut_pref, X5_10DGEN_fut_pers, X5_10DGEN_fut_val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") -cat("anova - dgen", getwd(), "\n") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", -
"DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - 
summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
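# --- Illustrative sketch (not part of the original script) ------------------
# The outlier screen above uses Tukey's 1.5 * IQR fences, and normality is
# checked per cell with the Anderson-Darling test from the nortest package.
# On a single vector the two checks look like this; x is hypothetical data,
# not from eohi2.csv.
library(nortest)

set.seed(1)
x <- rnorm(100)

q1  <- quantile(x, 0.25)
q3  <- quantile(x, 0.75)
iqr <- q3 - q1
fences <- c(lower = q1 - 1.5 * iqr, upper = q3 + 1.5 * iqr)
n_outliers <- sum(x < fences["lower"] | x > fences["upper"])

ad_res <- ad.test(x)   # H0: the sample is drawn from a normal distribution
print(fences)
print(n_outliers)
print(ad_res$p.value)  # p < .05 would suggest a departure from normality
# -----------------------------------------------------------------------------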
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, 
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") 
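# --- Illustrative sketch (not part of the original script) ------------------
# In the interaction follow-ups here, pairs(emm, by = "TIME") tests simple
# effects: all pairwise contrasts of the other factor separately within each
# level of the `by` factor, with the Bonferroni adjustment applied within
# each `by` group. A minimal self-contained example on built-in data
# (warpbreaks); the object names fit and emm are hypothetical.
library(emmeans)

fit <- aov(breaks ~ wool * tension, data = warpbreaks)
emm <- emmeans(fit, ~ wool * tension)

# Tension contrasts within each wool type: 3 comparisons per wool level, so
# under Bonferroni each p-value is multiplied by 3 (capped at 1).
print(pairs(emm, by = "wool", adjust = "bonferroni"))
# -----------------------------------------------------------------------------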
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# temporal_DO × TIME × INTERVAL three-way interaction -print("=== temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) -print(temporal_time_interval_emmeans) - -# Simple effects of TIME within each temporal_DO × INTERVAL combination -temporal_time_interval_simple1 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of TIME within each temporal_DO × INTERVAL:") -print(temporal_time_interval_simple1) - -# Simple effects of temporal_DO within each TIME × INTERVAL combination -temporal_time_interval_simple2 <- pairs(temporal_time_interval_emmeans, by = c("TIME", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of temporal_DO within each TIME × INTERVAL:") -print(temporal_time_interval_simple2) - -# Simple effects of INTERVAL within each temporal_DO × TIME combination -temporal_time_interval_simple3 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "TIME"), adjust = "bonferroni") -print("Simple effects of INTERVAL within each temporal_DO × TIME:") -print(temporal_time_interval_simple3) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT 
EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for temporal_DO × TIME × INTERVAL interaction -print("=== COHEN'S D FOR temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_simple1_df <- as.data.frame(temporal_time_interval_simple1) -significant_temporal_time_interval <- temporal_time_interval_simple1_df[temporal_time_interval_simple1_df$p.value < 0.05, ] - -if(nrow(significant_temporal_time_interval) > 0) { - temporal_time_interval_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_temporal_time_interval))) { - comparison <- significant_temporal_time_interval[i, ] - - # Extract the 
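# --- Illustrative sketch (not part of the original script) ------------------
# The loops in this section pull the raw scores for the two cells of each
# significant contrast and pass them to effsize::cohen.d(), which returns the
# standardized mean difference ($estimate) plus a qualitative label
# ($magnitude: negligible/small/medium/large). A minimal call on
# hypothetical vectors g1 and g2:
library(effsize)

set.seed(42)
g1 <- rnorm(50, mean = 0.0, sd = 1)   # e.g. Past scores in one cell
g2 <- rnorm(50, mean = 0.5, sd = 1)   # e.g. Future scores in the same cell

d_res <- cohen.d(g1, g2)              # pooled-SD Cohen's d by default
print(round(d_res$estimate, 3))
print(d_res$magnitude)
# -----------------------------------------------------------------------------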
grouping variables - temporal_level <- as.character(comparison$temporal_DO) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this temporal_DO × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - temporal_DO = temporal_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - temporal_time_interval_cohens_d <- rbind(temporal_time_interval_cohens_d, result_row) - } - } - - if(nrow(temporal_time_interval_cohens_d) > 0) { - print(temporal_time_interval_cohens_d) - } -} else { - cat("No significant TIME effects found within any temporal_DO × INTERVAL combination.\n") -} - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# ============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × DOMAIN (Emmeans only) -# ============================================================================= - -print("=== INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for temporal_DO × TIME × DOMAIN -emm_3way_plot 
<- emmeans(aov_model, ~ temporal_DO * TIME * DOMAIN) - -# Prepare emmeans data frame -emmeans_3way_plot <- emm_3way_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~DOMAIN, nrow = 1) + - labs( - x = "Order", - y = "Absolute difference from the present", - title = "temporal_DO × TIME × DOMAIN Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) - -# ============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (temporal_DO on x-axis, TIME in legend, INTERVAL as facets) -# ============================================================================= - -# Create emmeans for temporal_DO × TIME × INTERVAL -emm_temporal_time_interval_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) - -# Prepare emmeans data frame for the plot -emmeans_temporal_time_interval_plot <- emm_temporal_time_interval_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create temporal_DO × TIME × INTERVAL interaction plot with facets -interaction_plot_temporal_time_interval <- ggplot(emmeans_temporal_time_interval_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = 
"Order", - y = "Absolute difference from the present", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_temporal_time_interval) - -# ============================================================================= -# TWO-WAY INTERACTION PLOTS -# ============================================================================= - -# ============================================================================= -# PLOT 1: TIME × INTERVAL INTERACTION (INTERVAL on x-axis) -# ============================================================================= - -# Create emmeans for TIME × INTERVAL -emm_time_interval_plot <- emmeans(aov_model, ~ TIME * INTERVAL) - -# Prepare emmeans data frame for TIME × INTERVAL plot -emmeans_time_interval_plot <- emm_time_interval_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - x_pos = as.numeric(INTERVAL), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create TIME × INTERVAL interaction plot -interaction_plot_time_interval <- ggplot(emmeans_time_interval_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Interval", - y = "Absolute difference from the present", - title = "TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2, 3), - labels = c("5", "10", "5_10"), - limits = c(0.5, 3.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_interval) - -# ============================================================================= -# PLOT 2: TIME × temporal_DO INTERACTION 
(temporal_DO on x-axis) -# ============================================================================= - -# Create emmeans for TIME × temporal_DO -emm_time_temporal_plot <- emmeans(aov_model, ~ TIME * temporal_DO) - -# Prepare emmeans data frame for TIME × temporal_DO plot -emmeans_time_temporal_plot <- emm_time_temporal_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create TIME × temporal_DO interaction plot -interaction_plot_time_temporal <- ggplot(emmeans_time_temporal_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Order", - y = "Absolute difference from the present", - title = "TIME × temporal_DO Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(interaction_plot_time_temporal) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Estimated Marginal Means only) -# ============================================================================= - -# Create emmeans for TIME main effect -emm_time_main <- emmeans(aov_model, ~ TIME) - -# Prepare emmeans data frame -emmeans_time_main <- emm_time_main %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Plot TIME main effect (only emmeans + error bars) -plot_time_main_effect <- ggplot(emmeans_time_main) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.9 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Temporal Direction", - y = "Absolute difference from the present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - 
axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(plot_time_main_effect) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251007104105.r b/.history/eohi2/mixed anova - DGEN_20251007104105.r deleted file mode 100644 index 5bbbeda..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251007104105.r +++ /dev/null @@ -1,892 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, -# DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, -# DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val, -# X5_10DGEN_past_pref, X5_10DGEN_past_pers, X5_10DGEN_past_val, -# X5_10DGEN_fut_pref, X5_10DGEN_fut_pers, X5_10DGEN_fut_val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") -cat("anova - dgen", getwd(), "\n") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = 
as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. 
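# --- Illustrative sketch (not part of the original script) ------------------
# The reshaping idiom used earlier in this script: the wide DGEN columns are
# stacked with pivot_longer(), then a lookup table assigns each original
# column its factor levels via left_join(). Toy names and values below
# (wide, mapping, long) are hypothetical.
library(tidyverse)

wide <- tibble(ResponseId = c("R1", "R2"),
               past_5 = c(1.2, 0.8),
               fut_5  = c(1.5, 1.1))

mapping <- tibble(variable = c("past_5", "fut_5"),
                  TIME     = c("Past", "Future"),
                  INTERVAL = c("5", "5"))

long <- wide %>%
  pivot_longer(cols = c(past_5, fut_5),
               names_to = "variable", values_to = "score") %>%
  left_join(mapping, by = "variable")

print(long)
# -----------------------------------------------------------------------------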
Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for 
bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity 
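# --- Illustrative sketch (not part of the original script) ------------------
# The F-max idea implemented above, reduced to two groups: the observed
# statistic is max(group variances) / min(group variances), and instead of
# Hartley's tables the script compares it with the 95th percentile of the
# same ratio over within-group bootstrap resamples. Names (g1, g2, fmax_obs)
# are hypothetical.
set.seed(123)
g1 <- rnorm(40, sd = 1.0)
g2 <- rnorm(40, sd = 1.5)

fmax_obs <- max(var(g1), var(g2)) / min(var(g1), var(g2))

boot_ratios <- replicate(1000, {
  v1 <- var(sample(g1, replace = TRUE))   # resample each group independently
  v2 <- var(sample(g2, replace = TRUE))
  max(v1, v2) / min(v1, v2)
})

crit_95 <- quantile(boot_ratios, 0.95)
print(c(observed = fmax_obs, critical_95 = unname(crit_95)))
print(fmax_obs > crit_95)   # TRUE would flag heterogeneous variances
# -----------------------------------------------------------------------------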
corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, 
adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") 
-print(temporal_time_simple2) - -# temporal_DO × TIME × INTERVAL three-way interaction -print("=== temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) -print(temporal_time_interval_emmeans) - -# Simple effects of TIME within each temporal_DO × INTERVAL combination -temporal_time_interval_simple1 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of TIME within each temporal_DO × INTERVAL:") -print(temporal_time_interval_simple1) - -# Simple effects of temporal_DO within each TIME × INTERVAL combination -temporal_time_interval_simple2 <- pairs(temporal_time_interval_emmeans, by = c("TIME", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of temporal_DO within each TIME × INTERVAL:") -print(temporal_time_interval_simple2) - -# Simple effects of INTERVAL within each temporal_DO × TIME combination -temporal_time_interval_simple3 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "TIME"), adjust = "bonferroni") -print("Simple effects of INTERVAL within each temporal_DO × TIME:") -print(temporal_time_interval_simple3) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, 
long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for temporal_DO × TIME × INTERVAL interaction -print("=== COHEN'S D FOR temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_simple1_df <- as.data.frame(temporal_time_interval_simple1) -significant_temporal_time_interval <- temporal_time_interval_simple1_df[temporal_time_interval_simple1_df$p.value < 0.05, ] - -if(nrow(significant_temporal_time_interval) > 0) { - temporal_time_interval_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_temporal_time_interval))) { - comparison <- significant_temporal_time_interval[i, ] - - # Extract the grouping variables - temporal_level <- as.character(comparison$temporal_DO) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this temporal_DO × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - temporal_DO = temporal_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - temporal_time_interval_cohens_d <- rbind(temporal_time_interval_cohens_d, result_row) - } - } - - if(nrow(temporal_time_interval_cohens_d) > 0) { - print(temporal_time_interval_cohens_d) - } -} else { - cat("No significant TIME effects found within any temporal_DO × INTERVAL combination.\n") -} - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- 
long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# ============================================================================= -# COLOR PALETTE FOR PLOTS -# ============================================================================= - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# ============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (temporal_DO on x-axis, TIME in legend, INTERVAL as facets) -# ============================================================================= - -# Create emmeans for temporal_DO × TIME × INTERVAL -emm_temporal_time_interval_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) - -# Prepare emmeans data frame for the plot -emmeans_temporal_time_interval_plot <- emm_temporal_time_interval_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create temporal_DO × TIME × INTERVAL interaction plot with facets -interaction_plot_temporal_time_interval <- ggplot(emmeans_temporal_time_interval_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Absolute difference from the present", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = 
element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_temporal_time_interval) - -# (Two-way interaction plots removed as requested) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Estimated Marginal Means only) -# ============================================================================= - -# Create emmeans for TIME main effect -emm_time_main <- emmeans(aov_model, ~ TIME) - -# Prepare emmeans data frame -emmeans_time_main <- emm_time_main %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Plot TIME main effect (only emmeans + error bars) -plot_time_main_effect <- ggplot(emmeans_time_main) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.9 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Temporal Direction", - y = "Absolute difference from the present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(plot_time_main_effect) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251007104111.r b/.history/eohi2/mixed anova - DGEN_20251007104111.r deleted file mode 100644 index 5bbbeda..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251007104111.r +++ /dev/null @@ -1,892 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, -# DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, -# DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val, -# X5_10DGEN_past_pref, X5_10DGEN_past_pers, X5_10DGEN_past_val, -# X5_10DGEN_fut_pref, X5_10DGEN_fut_pers, X5_10DGEN_fut_val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") -cat("anova - dgen", getwd(), "\n") - -# Read the data -data <- 
read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs 
<- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check whether the within-subjects design is complete and balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance == 1)) { # every participant must contribute exactly one observation per cell - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced or incomplete") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, 
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") 
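# Beyond the cell-wise simple effects computed below, the TIME × INTERVAL
# interaction can also be tested directly as difference-of-differences
# contrasts. A brief sketch using emmeans' interaction-contrast syntax (the
# object name ti_interaction_contrasts is illustrative, not original output):
ti_interaction_contrasts <- contrast(
  emmeans(aov_model, ~ TIME * INTERVAL),
  interaction = c("pairwise", "pairwise")  # (Past - Future) differences compared between INTERVAL pairs
)
print(ti_interaction_contrasts)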
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# temporal_DO × TIME × INTERVAL three-way interaction -print("=== temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) -print(temporal_time_interval_emmeans) - -# Simple effects of TIME within each temporal_DO × INTERVAL combination -temporal_time_interval_simple1 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of TIME within each temporal_DO × INTERVAL:") -print(temporal_time_interval_simple1) - -# Simple effects of temporal_DO within each TIME × INTERVAL combination -temporal_time_interval_simple2 <- pairs(temporal_time_interval_emmeans, by = c("TIME", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of temporal_DO within each TIME × INTERVAL:") -print(temporal_time_interval_simple2) - -# Simple effects of INTERVAL within each temporal_DO × TIME combination -temporal_time_interval_simple3 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "TIME"), adjust = "bonferroni") -print("Simple effects of INTERVAL within each temporal_DO × TIME:") -print(temporal_time_interval_simple3) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT 
EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for temporal_DO × TIME × INTERVAL interaction -print("=== COHEN'S D FOR temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_simple1_df <- as.data.frame(temporal_time_interval_simple1) -significant_temporal_time_interval <- temporal_time_interval_simple1_df[temporal_time_interval_simple1_df$p.value < 0.05, ] - -if(nrow(significant_temporal_time_interval) > 0) { - temporal_time_interval_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_temporal_time_interval))) { - comparison <- significant_temporal_time_interval[i, ] - - # Extract the 
grouping variables - temporal_level <- as.character(comparison$temporal_DO) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this temporal_DO × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - temporal_DO = temporal_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - temporal_time_interval_cohens_d <- rbind(temporal_time_interval_cohens_d, result_row) - } - } - - if(nrow(temporal_time_interval_cohens_d) > 0) { - print(temporal_time_interval_cohens_d) - } -} else { - cat("No significant TIME effects found within any temporal_DO × INTERVAL combination.\n") -} - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# ============================================================================= -# COLOR PALETTE FOR PLOTS -# ============================================================================= - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# ============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × INTERVAL 
(temporal_DO on x-axis, TIME in legend, INTERVAL as facets) -# ============================================================================= - -# Create emmeans for temporal_DO × TIME × INTERVAL -emm_temporal_time_interval_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) - -# Prepare emmeans data frame for the plot -emmeans_temporal_time_interval_plot <- emm_temporal_time_interval_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create temporal_DO × TIME × INTERVAL interaction plot with facets -interaction_plot_temporal_time_interval <- ggplot(emmeans_temporal_time_interval_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Absolute difference from the present", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_temporal_time_interval) - -# (Two-way interaction plots removed as requested) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Estimated Marginal Means only) -# ============================================================================= - -# Create emmeans for TIME main effect -emm_time_main <- emmeans(aov_model, ~ TIME) - -# Prepare emmeans data frame -emmeans_time_main <- emm_time_main %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Plot TIME main effect (only emmeans + error bars) -plot_time_main_effect <- ggplot(emmeans_time_main) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.9 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Temporal Direction", - y = "Absolute difference from the present", - title = "Main Effect of TIME 
(Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(plot_time_main_effect) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251007104119.r b/.history/eohi2/mixed anova - DGEN_20251007104119.r deleted file mode 100644 index 5bbbeda..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251007104119.r +++ /dev/null @@ -1,892 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, -# DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, -# DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val, -# X5_10DGEN_past_pref, X5_10DGEN_past_pers, X5_10DGEN_past_val, -# X5_10DGEN_fut_pref, X5_10DGEN_fut_pers, X5_10DGEN_fut_val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") -cat("anova - dgen", getwd(), "\n") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - 
values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# 
============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - 
observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - # Extract the response column as a plain vector (robust whether `data` is a data.frame or a tibble) - group_data <- data[[response_var]][data[[group_var]] == .x] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check whether the within-subjects design is complete and balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance == 1)) { # every participant must contribute exactly one observation per cell - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced or incomplete") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# 
-# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <-
pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) 
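-# Sketch: the pairs() tables above report estimates and Bonferroni-adjusted
-# tests only; emmeans can also attach adjusted confidence intervals through
-# confint() or summary(..., infer = c(TRUE, TRUE)) on the contrast objects
-# already created, e.g.:
-print(confint(time_contrasts, adjust = "bonferroni"))
-print(summary(domain_contrasts, infer = c(TRUE, TRUE)))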
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# temporal_DO × TIME × INTERVAL three-way interaction -print("=== temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) -print(temporal_time_interval_emmeans) - -# Simple effects of TIME within each temporal_DO × INTERVAL combination -temporal_time_interval_simple1 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of TIME within each temporal_DO × INTERVAL:") -print(temporal_time_interval_simple1) - -# Simple effects of temporal_DO within each TIME × INTERVAL combination -temporal_time_interval_simple2 <- pairs(temporal_time_interval_emmeans, by = c("TIME", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of temporal_DO within each TIME × INTERVAL:") -print(temporal_time_interval_simple2) - -# Simple effects of INTERVAL within each temporal_DO × TIME combination -temporal_time_interval_simple3 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "TIME"), adjust = "bonferroni") -print("Simple effects of INTERVAL within each temporal_DO × TIME:") -print(temporal_time_interval_simple3) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - 
cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for temporal_DO × TIME × INTERVAL interaction -print("=== COHEN'S D FOR temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_simple1_df <- as.data.frame(temporal_time_interval_simple1) -significant_temporal_time_interval <- temporal_time_interval_simple1_df[temporal_time_interval_simple1_df$p.value < 0.05, ] - -if(nrow(significant_temporal_time_interval) > 0) { - temporal_time_interval_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_temporal_time_interval))) { - comparison <- significant_temporal_time_interval[i, ] - - # Extract the grouping variables - temporal_level <- as.character(comparison$temporal_DO) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this temporal_DO × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - temporal_DO = temporal_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - temporal_time_interval_cohens_d <- rbind(temporal_time_interval_cohens_d, result_row) - } - } - - if(nrow(temporal_time_interval_cohens_d) > 0) { - print(temporal_time_interval_cohens_d) - } -} else { - cat("No significant TIME effects found within any temporal_DO × INTERVAL combination.\n") -} - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - 
past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# ============================================================================= -# COLOR PALETTE FOR PLOTS -# ============================================================================= - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# ============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (temporal_DO on x-axis, TIME in legend, INTERVAL as facets) -# ============================================================================= - -# Create emmeans for temporal_DO × TIME × INTERVAL -emm_temporal_time_interval_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) - -# Prepare emmeans data frame for the plot -emmeans_temporal_time_interval_plot <- emm_temporal_time_interval_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create temporal_DO × TIME × INTERVAL interaction plot with facets -interaction_plot_temporal_time_interval <- ggplot(emmeans_temporal_time_interval_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Absolute difference from the present", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 
0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_temporal_time_interval) - -# (Two-way interaction plots removed as requested) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Estimated Marginal Means only) -# ============================================================================= - -# Create emmeans for TIME main effect -emm_time_main <- emmeans(aov_model, ~ TIME) - -# Prepare emmeans data frame -emmeans_time_main <- emm_time_main %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Plot TIME main effect (only emmeans + error bars) -plot_time_main_effect <- ggplot(emmeans_time_main) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.9 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Temporal Direction", - y = "Absolute difference from the present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(plot_time_main_effect) \ No newline at end of file
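# Aside (editorial sketch, not part of the deleted snapshots themselves): the
# scripts report generalized eta squared ("ges") from ezANOVA; because the
# models are run with detailed = TRUE, the returned ANOVA table also carries
# SSn and SSd columns, so partial eta squared can be derived per effect:
anova_output$partial_eta_sq <- with(anova_output, SSn / (SSn + SSd))
print(anova_output[, c("Effect", "ges", "partial_eta_sq")])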
(Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(plot_time_main_effect) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251007185029.r b/.history/eohi2/mixed anova - DGEN_20251007185029.r deleted file mode 100644 index 5bbbeda..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251007185029.r +++ /dev/null @@ -1,892 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, -# DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, -# DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val, -# X5_10DGEN_past_pref, X5_10DGEN_past_pers, X5_10DGEN_past_val, -# X5_10DGEN_fut_pref, X5_10DGEN_fut_pers, X5_10DGEN_fut_val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") -cat("anova - dgen", getwd(), "\n") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - 
values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# 
============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - 
observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced: every participant should contribute exactly one -# observation per TIME × DOMAIN × INTERVAL cell (a cell count of 0 means missing data) -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance == 1)) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# 
============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- 
pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) 
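# Sketch for illustration only (assumes the aov_model object defined above):
# the `by` argument in the pairs() calls below is equivalent to conditioning
# the emmeans reference grid with `|`, and the Bonferroni adjustment is then
# applied within each `by` group rather than across the full family of contrasts.
emm_time_by_temporal <- emmeans(aov_model, ~ TIME | temporal_DO)
pairs(emm_time_by_temporal, adjust = "bonferroni")  # same tests as pairs(..., by = "temporal_DO")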
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# temporal_DO × TIME × INTERVAL three-way interaction -print("=== temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) -print(temporal_time_interval_emmeans) - -# Simple effects of TIME within each temporal_DO × INTERVAL combination -temporal_time_interval_simple1 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of TIME within each temporal_DO × INTERVAL:") -print(temporal_time_interval_simple1) - -# Simple effects of temporal_DO within each TIME × INTERVAL combination -temporal_time_interval_simple2 <- pairs(temporal_time_interval_emmeans, by = c("TIME", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of temporal_DO within each TIME × INTERVAL:") -print(temporal_time_interval_simple2) - -# Simple effects of INTERVAL within each temporal_DO × TIME combination -temporal_time_interval_simple3 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "TIME"), adjust = "bonferroni") -print("Simple effects of INTERVAL within each temporal_DO × TIME:") -print(temporal_time_interval_simple3) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - 
cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for temporal_DO × TIME × INTERVAL interaction -print("=== COHEN'S D FOR temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_simple1_df <- as.data.frame(temporal_time_interval_simple1) -significant_temporal_time_interval <- temporal_time_interval_simple1_df[temporal_time_interval_simple1_df$p.value < 0.05, ] - -if(nrow(significant_temporal_time_interval) > 0) { - temporal_time_interval_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_temporal_time_interval))) { - comparison <- significant_temporal_time_interval[i, ] - - # Extract the grouping variables - temporal_level <- as.character(comparison$temporal_DO) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this temporal_DO × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - temporal_DO = temporal_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - temporal_time_interval_cohens_d <- rbind(temporal_time_interval_cohens_d, result_row) - } - } - - if(nrow(temporal_time_interval_cohens_d) > 0) { - print(temporal_time_interval_cohens_d) - } -} else { - cat("No significant TIME effects found within any temporal_DO × INTERVAL combination.\n") -} - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - 
past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# ============================================================================= -# COLOR PALETTE FOR PLOTS -# ============================================================================= - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# ============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (temporal_DO on x-axis, TIME in legend, INTERVAL as facets) -# ============================================================================= - -# Create emmeans for temporal_DO × TIME × INTERVAL -emm_temporal_time_interval_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) - -# Prepare emmeans data frame for the plot -emmeans_temporal_time_interval_plot <- emm_temporal_time_interval_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create temporal_DO × TIME × INTERVAL interaction plot with facets -interaction_plot_temporal_time_interval <- ggplot(emmeans_temporal_time_interval_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Absolute difference from the present", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 
0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_temporal_time_interval) - -# (Two-way interaction plots removed as requested) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Estimated Marginal Means only) -# ============================================================================= - -# Create emmeans for TIME main effect -emm_time_main <- emmeans(aov_model, ~ TIME) - -# Prepare emmeans data frame -emmeans_time_main <- emm_time_main %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Plot TIME main effect (only emmeans + error bars) -plot_time_main_effect <- ggplot(emmeans_time_main) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.9 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Temporal Direction", - y = "Absolute difference from the present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(plot_time_main_effect) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251007185541.r b/.history/eohi2/mixed anova - DGEN_20251007185541.r deleted file mode 100644 index 5bbbeda..0000000
diff --git a/.history/eohi2/mixed anova - DGEN_20251007192720.r b/.history/eohi2/mixed anova - DGEN_20251007192720.r deleted file mode 100644 index 5bbbeda..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251007192720.r +++ /dev/null @@ -1,892 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, -# DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, -# DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val, -# X5_10DGEN_past_pref, X5_10DGEN_past_pers, X5_10DGEN_past_val, -# X5_10DGEN_fut_pref, X5_10DGEN_fut_pers, X5_10DGEN_fut_val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") -cat("anova - dgen", getwd(), "\n") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - 
values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# 
============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - 
observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once, extracting the response as a plain vector - # (pick() passes a tibble, and data[rows, col] on a tibble keeps the tibble class, which would break sample() below) - group_data_list <- map(groups, ~ { - group_data <- data[[response_var]][data[[group_var]] == .x] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced (exactly one observation per participant per cell) -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance == 1)) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# 
============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- 
pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) 
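-# (A hedged sketch, not part of the original script: emmeans::eff_size() can
-# express pairwise contrasts such as these in standardized units, one row per
-# comparison, which complements the cohen.d() loops run later on raw scores.
-# sigma_w and df_w below are placeholders, not values from this data; for an
-# Error() model they would come from the relevant within-subject error stratum.)
-sigma_w <- 1 # placeholder residual SD
-df_w <- 100 # placeholder residual df
-print(eff_size(temporal_time_emmeans, sigma = sigma_w, edf = df_w))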
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# temporal_DO × TIME × INTERVAL three-way interaction -print("=== temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) -print(temporal_time_interval_emmeans) - -# Simple effects of TIME within each temporal_DO × INTERVAL combination -temporal_time_interval_simple1 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of TIME within each temporal_DO × INTERVAL:") -print(temporal_time_interval_simple1) - -# Simple effects of temporal_DO within each TIME × INTERVAL combination -temporal_time_interval_simple2 <- pairs(temporal_time_interval_emmeans, by = c("TIME", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of temporal_DO within each TIME × INTERVAL:") -print(temporal_time_interval_simple2) - -# Simple effects of INTERVAL within each temporal_DO × TIME combination -temporal_time_interval_simple3 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "TIME"), adjust = "bonferroni") -print("Simple effects of INTERVAL within each temporal_DO × TIME:") -print(temporal_time_interval_simple3) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - 
cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for temporal_DO × TIME × INTERVAL interaction -print("=== COHEN'S D FOR temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_simple1_df <- as.data.frame(temporal_time_interval_simple1) -significant_temporal_time_interval <- temporal_time_interval_simple1_df[temporal_time_interval_simple1_df$p.value < 0.05, ] - -if(nrow(significant_temporal_time_interval) > 0) { - temporal_time_interval_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_temporal_time_interval))) { - comparison <- significant_temporal_time_interval[i, ] - - # Extract the grouping variables - temporal_level <- as.character(comparison$temporal_DO) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this temporal_DO × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - temporal_DO = temporal_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - temporal_time_interval_cohens_d <- rbind(temporal_time_interval_cohens_d, result_row) - } - } - - if(nrow(temporal_time_interval_cohens_d) > 0) { - print(temporal_time_interval_cohens_d) - } -} else { - cat("No significant TIME effects found within any temporal_DO × INTERVAL combination.\n") -} - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - 
past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# ============================================================================= -# COLOR PALETTE FOR PLOTS -# ============================================================================= - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# ============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (temporal_DO on x-axis, TIME in legend, INTERVAL as facets) -# ============================================================================= - -# Create emmeans for temporal_DO × TIME × INTERVAL -emm_temporal_time_interval_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) - -# Prepare emmeans data frame for the plot -emmeans_temporal_time_interval_plot <- emm_temporal_time_interval_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create temporal_DO × TIME × INTERVAL interaction plot with facets -interaction_plot_temporal_time_interval <- ggplot(emmeans_temporal_time_interval_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Absolute difference from the present", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 
0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_temporal_time_interval) - -# (Two-way interaction plots removed as requested) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Estimated Marginal Means only) -# ============================================================================= - -# Create emmeans for TIME main effect -emm_time_main <- emmeans(aov_model, ~ TIME) - -# Prepare emmeans data frame -emmeans_time_main <- emm_time_main %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Plot TIME main effect (only emmeans + error bars) -plot_time_main_effect <- ggplot(emmeans_time_main) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.9 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Temporal Direction", - y = "Absolute difference from the present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(plot_time_main_effect) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251008190007.r b/.history/eohi2/mixed anova - DGEN_20251008190007.r deleted file mode 100644 index 5bbbeda..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251008190007.r +++ /dev/null @@ -1,892 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, -# DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, -# DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val, -# X5_10DGEN_past_pref, X5_10DGEN_past_pers, X5_10DGEN_past_val, -# X5_10DGEN_fut_pref, X5_10DGEN_fut_pers, X5_10DGEN_fut_val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) 
-options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") -cat("anova - dgen", getwd(), "\n") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate 
mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test 
across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, 
anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") 
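-# (A hedged aside, not part of the original script: emmeans::joint_tests() is a
-# compact complement to the pairwise contrasts computed below, giving one F test
-# per simple effect, e.g. the effect of INTERVAL at each level of TIME.)
-joint_tests(aov_model, by = "TIME")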
-time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# temporal_DO × TIME × INTERVAL three-way interaction -print("=== temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) -print(temporal_time_interval_emmeans) - -# Simple effects of TIME within each temporal_DO × INTERVAL combination -temporal_time_interval_simple1 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of TIME within each temporal_DO × INTERVAL:") -print(temporal_time_interval_simple1) - -# Simple effects of temporal_DO within each TIME × INTERVAL combination -temporal_time_interval_simple2 <- pairs(temporal_time_interval_emmeans, by = c("TIME", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of temporal_DO within each TIME × INTERVAL:") -print(temporal_time_interval_simple2) - -# Simple effects of INTERVAL within each temporal_DO × TIME combination -temporal_time_interval_simple3 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "TIME"), adjust = "bonferroni") -print("Simple effects of INTERVAL within each temporal_DO × TIME:") -print(temporal_time_interval_simple3) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT 
EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for temporal_DO × TIME × INTERVAL interaction -print("=== COHEN'S D FOR temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_simple1_df <- as.data.frame(temporal_time_interval_simple1) -significant_temporal_time_interval <- temporal_time_interval_simple1_df[temporal_time_interval_simple1_df$p.value < 0.05, ] - -if(nrow(significant_temporal_time_interval) > 0) { - temporal_time_interval_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_temporal_time_interval))) { - comparison <- significant_temporal_time_interval[i, ] - - # Extract the 
grouping variables - temporal_level <- as.character(comparison$temporal_DO) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this temporal_DO × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - temporal_DO = temporal_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - temporal_time_interval_cohens_d <- rbind(temporal_time_interval_cohens_d, result_row) - } - } - - if(nrow(temporal_time_interval_cohens_d) > 0) { - print(temporal_time_interval_cohens_d) - } -} else { - cat("No significant TIME effects found within any temporal_DO × INTERVAL combination.\n") -} - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# ============================================================================= -# COLOR PALETTE FOR PLOTS -# ============================================================================= - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# ============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × INTERVAL 
(temporal_DO on x-axis, TIME in legend, INTERVAL as facets) -# ============================================================================= - -# Create emmeans for temporal_DO × TIME × INTERVAL -emm_temporal_time_interval_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) - -# Prepare emmeans data frame for the plot -emmeans_temporal_time_interval_plot <- emm_temporal_time_interval_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create temporal_DO × TIME × INTERVAL interaction plot with facets -interaction_plot_temporal_time_interval <- ggplot(emmeans_temporal_time_interval_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Absolute difference from the present", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_temporal_time_interval) - -# (Two-way interaction plots removed as requested) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Estimated Marginal Means only) -# ============================================================================= - -# Create emmeans for TIME main effect -emm_time_main <- emmeans(aov_model, ~ TIME) - -# Prepare emmeans data frame -emmeans_time_main <- emm_time_main %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Plot TIME main effect (only emmeans + error bars) -plot_time_main_effect <- ggplot(emmeans_time_main) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.9 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Temporal Direction", - y = "Absolute difference from the present", - title = "Main Effect of TIME 
(Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(plot_time_main_effect) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251008190301.r b/.history/eohi2/mixed anova - DGEN_20251008190301.r deleted file mode 100644 index 4b40414..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251008190301.r +++ /dev/null @@ -1,896 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, -# DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, -# DGEN_fut_10_Pref, DGEN_fut_10_Pers, DGEN_fut_10_Val, -# X5_10DGEN_past_pref, X5_10DGEN_past_pers, X5_10DGEN_past_val, -# X5_10DGEN_fut_pref, X5_10DGEN_fut_pers, X5_10DGEN_fut_val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") -cat("anova - dgen", getwd(), "\n") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - 
values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# 
============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - 
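    # A group whose scores are all identical yields a zero variance, which
    # would make the max/min F-max ratio below infinite; clamping such
    # variances to a tiny positive value keeps the observed statistic finite.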
observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 <- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# 
============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- 
pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# ============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) 
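# Aside on the adjustment used throughout these post-hoc calls: Bonferroni
# multiplies each unadjusted p-value by the number of comparisons in the
# family (capped at 1), and when pairs() is given a `by` argument the family
# is each `by` subgroup separately. A minimal base-R sketch of the same
# correction, using made-up p-values purely for illustration:
#   raw_p <- c(0.010, 0.020, 0.400)
#   p.adjust(raw_p, method = "bonferroni")   # 0.03, 0.06, 1.00 (3 tests)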
-temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# temporal_DO × TIME × INTERVAL three-way interaction -print("=== temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) -print(temporal_time_interval_emmeans) - -# Simple effects of TIME within each temporal_DO × INTERVAL combination -temporal_time_interval_simple1 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of TIME within each temporal_DO × INTERVAL:") -print(temporal_time_interval_simple1) - -# Simple effects of temporal_DO within each TIME × INTERVAL combination -temporal_time_interval_simple2 <- pairs(temporal_time_interval_emmeans, by = c("TIME", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of temporal_DO within each TIME × INTERVAL:") -print(temporal_time_interval_simple2) - -# Simple effects of INTERVAL within each temporal_DO × TIME combination -temporal_time_interval_simple3 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "TIME"), adjust = "bonferroni") -print("Simple effects of INTERVAL within each temporal_DO × TIME:") -print(temporal_time_interval_simple3) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - 
cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for temporal_DO × TIME × INTERVAL interaction -print("=== COHEN'S D FOR temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_simple1_df <- as.data.frame(temporal_time_interval_simple1) -significant_temporal_time_interval <- temporal_time_interval_simple1_df[temporal_time_interval_simple1_df$p.value < 0.05, ] - -if(nrow(significant_temporal_time_interval) > 0) { - temporal_time_interval_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_temporal_time_interval))) { - comparison <- significant_temporal_time_interval[i, ] - - # Extract the grouping variables - temporal_level <- as.character(comparison$temporal_DO) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this temporal_DO × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - temporal_DO = temporal_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - temporal_time_interval_cohens_d <- rbind(temporal_time_interval_cohens_d, result_row) - } - } - - if(nrow(temporal_time_interval_cohens_d) > 0) { - print(temporal_time_interval_cohens_d) - } -} else { - cat("No significant TIME effects found within any temporal_DO × INTERVAL combination.\n") -} - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - 
past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# ============================================================================= -# COLOR PALETTE FOR PLOTS -# ============================================================================= - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# ============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (temporal_DO on x-axis, TIME in legend, INTERVAL as facets) -# ============================================================================= - -# Create emmeans for temporal_DO × TIME × INTERVAL -emm_temporal_time_interval_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) - -# Prepare emmeans data frame for the plot -emmeans_temporal_time_interval_plot <- emm_temporal_time_interval_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create temporal_DO × TIME × INTERVAL interaction plot with facets -interaction_plot_temporal_time_interval <- ggplot(emmeans_temporal_time_interval_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1, labeller = labeller(INTERVAL = c( - "5" = "Present v. 5 Years", - "10" = "Present v. 10 Years", - "5_10" = "5 Years v. 
10 Years" - ))) + - labs( - x = "Order", - y = "Mean absolute deviation", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_temporal_time_interval) - -# (Two-way interaction plots removed as requested) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Estimated Marginal Means only) -# ============================================================================= - -# Create emmeans for TIME main effect -emm_time_main <- emmeans(aov_model, ~ TIME) - -# Prepare emmeans data frame -emmeans_time_main <- emm_time_main %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Plot TIME main effect (only emmeans + error bars) -plot_time_main_effect <- ggplot(emmeans_time_main) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.9 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Temporal Direction", - y = "Absolute difference from the present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(plot_time_main_effect) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251008190307.r b/.history/eohi2/mixed anova - DGEN_20251008190307.r deleted file mode 100644 index 4b40414..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251008190307.r +++ /dev/null @@ -1,896 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, -# DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, -# DGEN_fut_10_Pref, 
DGEN_fut_10_Pers, DGEN_fut_10_Val, -# X5_10DGEN_past_pref, X5_10DGEN_past_pers, X5_10DGEN_past_val, -# X5_10DGEN_fut_pref, X5_10DGEN_fut_pers, X5_10DGEN_fut_val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") -cat("anova - dgen", getwd(), "\n") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = 
round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 
<- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - 
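  # The GGe and HFe columns printed below are the Greenhouse-Geisser and
  # Huynh-Feldt epsilon estimates; the corrected tests computed next rescale
  # both degrees of freedom by epsilon (corrected df = original df * epsilon).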
print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# 
============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# temporal_DO × TIME × INTERVAL three-way interaction -print("=== temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) -print(temporal_time_interval_emmeans) - -# Simple effects of TIME within each temporal_DO × INTERVAL combination -temporal_time_interval_simple1 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of TIME within each temporal_DO × INTERVAL:") -print(temporal_time_interval_simple1) - -# Simple effects of temporal_DO within each TIME × INTERVAL combination -temporal_time_interval_simple2 <- pairs(temporal_time_interval_emmeans, by = c("TIME", "INTERVAL"), adjust = 
"bonferroni") -print("Simple effects of temporal_DO within each TIME × INTERVAL:") -print(temporal_time_interval_simple2) - -# Simple effects of INTERVAL within each temporal_DO × TIME combination -temporal_time_interval_simple3 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "TIME"), adjust = "bonferroni") -print("Simple effects of INTERVAL within each temporal_DO × TIME:") -print(temporal_time_interval_simple3) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for temporal_DO × 
TIME × INTERVAL interaction -print("=== COHEN'S D FOR temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_simple1_df <- as.data.frame(temporal_time_interval_simple1) -significant_temporal_time_interval <- temporal_time_interval_simple1_df[temporal_time_interval_simple1_df$p.value < 0.05, ] - -if(nrow(significant_temporal_time_interval) > 0) { - temporal_time_interval_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_temporal_time_interval))) { - comparison <- significant_temporal_time_interval[i, ] - - # Extract the grouping variables - temporal_level <- as.character(comparison$temporal_DO) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this temporal_DO × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - temporal_DO = temporal_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - temporal_time_interval_cohens_d <- rbind(temporal_time_interval_cohens_d, result_row) - } - } - - if(nrow(temporal_time_interval_cohens_d) > 0) { - print(temporal_time_interval_cohens_d) - } -} else { - cat("No significant TIME effects found within any temporal_DO × INTERVAL combination.\n") -} - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - 
print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# ============================================================================= -# COLOR PALETTE FOR PLOTS -# ============================================================================= - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# ============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (temporal_DO on x-axis, TIME in legend, INTERVAL as facets) -# ============================================================================= - -# Create emmeans for temporal_DO × TIME × INTERVAL -emm_temporal_time_interval_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) - -# Prepare emmeans data frame for the plot -emmeans_temporal_time_interval_plot <- emm_temporal_time_interval_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create temporal_DO × TIME × INTERVAL interaction plot with facets -interaction_plot_temporal_time_interval <- ggplot(emmeans_temporal_time_interval_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1, labeller = labeller(INTERVAL = c( - "5" = "Present v. 5 Years", - "10" = "Present v. 10 Years", - "5_10" = "5 Years v. 
10 Years" - ))) + - labs( - x = "Order", - y = "Mean absolute deviation", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_temporal_time_interval) - -# (Two-way interaction plots removed as requested) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Estimated Marginal Means only) -# ============================================================================= - -# Create emmeans for TIME main effect -emm_time_main <- emmeans(aov_model, ~ TIME) - -# Prepare emmeans data frame -emmeans_time_main <- emm_time_main %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Plot TIME main effect (only emmeans + error bars) -plot_time_main_effect <- ggplot(emmeans_time_main) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.9 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Temporal Direction", - y = "Absolute difference from the present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(plot_time_main_effect) \ No newline at end of file
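These snapshot scripts all hinge on the same resampling idea for Hartley's F-max: resample each group with replacement, recompute max(variance)/min(variance), and take the 95th percentile of the resampled ratios as the critical value. A minimal self-contained R sketch of that idea, using simulated data (the two groups, their sizes, and n_iter = 1000 are illustrative assumptions, not values taken from these files):

# Sketch: bootstrap critical value for Hartley's F-max (illustrative toy data)
set.seed(123)
groups <- list(g1 = rnorm(50, sd = 1), g2 = rnorm(50, sd = 1.3))  # assumed groups

fmax <- function(gs) {
  vars <- vapply(gs, var, numeric(1))  # per-group variances
  max(vars) / min(vars)                # Hartley's F-max ratio
}

observed <- fmax(groups)

n_iter <- 1000
boot_ratios <- replicate(n_iter, {
  # resample within each group independently, then recompute the ratio
  fmax(lapply(groups, function(g) sample(g, length(g), replace = TRUE)))
})

critical_95 <- quantile(boot_ratios, 0.95)
observed > critical_95  # TRUE flags heterogeneous variances at the 5% level

This final comparison is how the significant column in the snapshots' hartley_temporal_results and hartley_interval_results tables is derived (observed_ratio > critical_95).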
diff --git a/.history/eohi2/mixed anova - DGEN_20251010160100.r b/.history/eohi2/mixed anova - DGEN_20251010160100.r deleted file mode 100644 index 4b40414..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251010160100.r +++ /dev/null @@ -1,896 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, -# DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, -# DGEN_fut_10_Pref, 
DGEN_fut_10_Pers, DGEN_fut_10_Val, -# X5_10DGEN_past_pref, X5_10DGEN_past_pers, X5_10DGEN_past_val, -# X5_10DGEN_fut_pref, X5_10DGEN_fut_pers, X5_10DGEN_fut_val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") -cat("anova - dgen", getwd(), "\n") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = 
round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 
<- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - 
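# --- Illustrative aside: a minimal sketch of how the corrections below work,
# with hypothetical values (not part of the analysis). Both degrees of freedom
# are multiplied by the epsilon estimate and the p-value is recomputed from the
# unchanged F statistic, exactly as in the corrected-df table that follows.
gg_demo_F   <- 4.20   # hypothetical uncorrected F
gg_demo_dfn <- 2      # hypothetical numerator df
gg_demo_dfd <- 198    # hypothetical denominator df
gg_demo_eps <- 0.85   # hypothetical Greenhouse-Geisser epsilon
gg_demo_p   <- pf(gg_demo_F,
                  gg_demo_dfn * gg_demo_eps,
                  gg_demo_dfd * gg_demo_eps,
                  lower.tail = FALSE)  # sphericity-corrected p for the same F
# --- End aside ---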
print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# 
============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# temporal_DO × TIME × INTERVAL three-way interaction -print("=== temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) -print(temporal_time_interval_emmeans) - -# Simple effects of TIME within each temporal_DO × INTERVAL combination -temporal_time_interval_simple1 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of TIME within each temporal_DO × INTERVAL:") -print(temporal_time_interval_simple1) - -# Simple effects of temporal_DO within each TIME × INTERVAL combination -temporal_time_interval_simple2 <- pairs(temporal_time_interval_emmeans, by = c("TIME", "INTERVAL"), adjust = 
"bonferroni") -print("Simple effects of temporal_DO within each TIME × INTERVAL:") -print(temporal_time_interval_simple2) - -# Simple effects of INTERVAL within each temporal_DO × TIME combination -temporal_time_interval_simple3 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "TIME"), adjust = "bonferroni") -print("Simple effects of INTERVAL within each temporal_DO × TIME:") -print(temporal_time_interval_simple3) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for temporal_DO × 
TIME × INTERVAL interaction -print("=== COHEN'S D FOR temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_simple1_df <- as.data.frame(temporal_time_interval_simple1) -significant_temporal_time_interval <- temporal_time_interval_simple1_df[temporal_time_interval_simple1_df$p.value < 0.05, ] - -if(nrow(significant_temporal_time_interval) > 0) { - temporal_time_interval_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_temporal_time_interval))) { - comparison <- significant_temporal_time_interval[i, ] - - # Extract the grouping variables - temporal_level <- as.character(comparison$temporal_DO) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this temporal_DO × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - temporal_DO = temporal_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - temporal_time_interval_cohens_d <- rbind(temporal_time_interval_cohens_d, result_row) - } - } - - if(nrow(temporal_time_interval_cohens_d) > 0) { - print(temporal_time_interval_cohens_d) - } -} else { - cat("No significant TIME effects found within any temporal_DO × INTERVAL combination.\n") -} - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - 
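# --- Illustrative aside: a minimal sketch of what cohen.d() computes, using
# made-up vectors (all d_demo_* names and values are hypothetical, not from the
# data). effsize::cohen.d() with two numeric vectors divides the mean
# difference by the pooled standard deviation:
d_demo_x  <- c(1.2, 0.8, 1.5, 1.1)
d_demo_y  <- c(0.9, 0.6, 1.0, 0.7)
d_demo_sp <- sqrt(((length(d_demo_x) - 1) * var(d_demo_x) +
                   (length(d_demo_y) - 1) * var(d_demo_y)) /
                  (length(d_demo_x) + length(d_demo_y) - 2))  # pooled SD
d_demo <- (mean(d_demo_x) - mean(d_demo_y)) / d_demo_sp
# This should agree with cohen.d(d_demo_x, d_demo_y)$estimate. Note that the
# Past/Future scores above are paired within participants, so a paired d
# (cohen.d(..., paired = TRUE)) could arguably be more appropriate than
# pooling across observations.
# --- End aside ---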
print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# ============================================================================= -# COLOR PALETTE FOR PLOTS -# ============================================================================= - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# ============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (temporal_DO on x-axis, TIME in legend, INTERVAL as facets) -# ============================================================================= - -# Create emmeans for temporal_DO × TIME × INTERVAL -emm_temporal_time_interval_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) - -# Prepare emmeans data frame for the plot -emmeans_temporal_time_interval_plot <- emm_temporal_time_interval_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create temporal_DO × TIME × INTERVAL interaction plot with facets -interaction_plot_temporal_time_interval <- ggplot(emmeans_temporal_time_interval_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1, labeller = labeller(INTERVAL = c( - "5" = "Present v. 5 Years", - "10" = "Present v. 10 Years", - "5_10" = "5 Years v. 
10 Years" - ))) + - labs( - x = "Order", - y = "Mean absolute deviation", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_temporal_time_interval) - -# (Two-way interaction plots removed as requested) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Estimated Marginal Means only) -# ============================================================================= - -# Create emmeans for TIME main effect -emm_time_main <- emmeans(aov_model, ~ TIME) - -# Prepare emmeans data frame -emmeans_time_main <- emm_time_main %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Plot TIME main effect (only emmeans + error bars) -plot_time_main_effect <- ggplot(emmeans_time_main) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.9 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Temporal Direction", - y = "Absolute difference from the present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(plot_time_main_effect) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251010165028.r b/.history/eohi2/mixed anova - DGEN_20251010165028.r deleted file mode 100644 index 4b40414..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251010165028.r +++ /dev/null @@ -1,896 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, -# DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, -# DGEN_fut_10_Pref, 
DGEN_fut_10_Pers, DGEN_fut_10_Val, -# X5_10DGEN_past_pref, X5_10DGEN_past_pers, X5_10DGEN_past_val, -# X5_10DGEN_fut_pref, X5_10DGEN_fut_pers, X5_10DGEN_fut_val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") -cat("anova - dgen", getwd(), "\n") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = 
round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once, extracted as plain vectors so tibble input (e.g. from pick()) also works - group_data_list <- map(groups, ~ { - group_data <- data[[response_var]][data[[group_var]] == .x] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95
<- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - 
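# --- Illustrative aside: where the epsilon estimates come from, as a minimal
# sketch on a made-up covariance matrix (all eps_demo_* names and values are
# hypothetical). Greenhouse-Geisser epsilon is computed from the
# double-centered covariance matrix of the repeated measures:
# eps = tr(A)^2 / ((k - 1) * tr(A^2)), with A = D S D.
eps_demo_S <- matrix(c(1.0, 0.5, 0.3,
                       0.5, 1.0, 0.4,
                       0.3, 0.4, 1.0), nrow = 3)    # hypothetical 3 x 3 covariance
eps_demo_k <- nrow(eps_demo_S)
eps_demo_D <- diag(eps_demo_k) - 1 / eps_demo_k     # centering matrix I - J/k
eps_demo_A <- eps_demo_D %*% eps_demo_S %*% eps_demo_D  # double-centered covariance
eps_demo_gg <- sum(diag(eps_demo_A))^2 /
  ((eps_demo_k - 1) * sum(eps_demo_A^2))            # epsilon = 1 means perfect sphericity
# --- End aside ---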
print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# 
============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# temporal_DO × TIME × INTERVAL three-way interaction -print("=== temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) -print(temporal_time_interval_emmeans) - -# Simple effects of TIME within each temporal_DO × INTERVAL combination -temporal_time_interval_simple1 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of TIME within each temporal_DO × INTERVAL:") -print(temporal_time_interval_simple1) - -# Simple effects of temporal_DO within each TIME × INTERVAL combination -temporal_time_interval_simple2 <- pairs(temporal_time_interval_emmeans, by = c("TIME", "INTERVAL"), adjust = 
"bonferroni") -print("Simple effects of temporal_DO within each TIME × INTERVAL:") -print(temporal_time_interval_simple2) - -# Simple effects of INTERVAL within each temporal_DO × TIME combination -temporal_time_interval_simple3 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "TIME"), adjust = "bonferroni") -print("Simple effects of INTERVAL within each temporal_DO × TIME:") -print(temporal_time_interval_simple3) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for temporal_DO × 
TIME × INTERVAL interaction -print("=== COHEN'S D FOR temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_simple1_df <- as.data.frame(temporal_time_interval_simple1) -significant_temporal_time_interval <- temporal_time_interval_simple1_df[temporal_time_interval_simple1_df$p.value < 0.05, ] - -if(nrow(significant_temporal_time_interval) > 0) { - temporal_time_interval_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_temporal_time_interval))) { - comparison <- significant_temporal_time_interval[i, ] - - # Extract the grouping variables - temporal_level <- as.character(comparison$temporal_DO) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this temporal_DO × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - temporal_DO = temporal_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - temporal_time_interval_cohens_d <- rbind(temporal_time_interval_cohens_d, result_row) - } - } - - if(nrow(temporal_time_interval_cohens_d) > 0) { - print(temporal_time_interval_cohens_d) - } -} else { - cat("No significant TIME effects found within any temporal_DO × INTERVAL combination.\n") -} - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - 
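# --- Illustrative aside: a minimal sketch of the adjust = "bonferroni" option
# used for every contrast above, with made-up p-values (bonf_demo_* names and
# values are hypothetical). Each raw p-value is simply multiplied by the
# number of comparisons in the family, capped at 1:
bonf_demo_p_raw <- c(0.010, 0.030, 0.200)   # hypothetical family of three contrasts
bonf_demo_p_adj <- pmin(bonf_demo_p_raw * length(bonf_demo_p_raw), 1)
# Equivalent to the built-in: p.adjust(bonf_demo_p_raw, method = "bonferroni")
# --- End aside ---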
print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# ============================================================================= -# COLOR PALETTE FOR PLOTS -# ============================================================================= - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# ============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (temporal_DO on x-axis, TIME in legend, INTERVAL as facets) -# ============================================================================= - -# Create emmeans for temporal_DO × TIME × INTERVAL -emm_temporal_time_interval_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) - -# Prepare emmeans data frame for the plot -emmeans_temporal_time_interval_plot <- emm_temporal_time_interval_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create temporal_DO × TIME × INTERVAL interaction plot with facets -interaction_plot_temporal_time_interval <- ggplot(emmeans_temporal_time_interval_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1, labeller = labeller(INTERVAL = c( - "5" = "Present v. 5 Years", - "10" = "Present v. 10 Years", - "5_10" = "5 Years v. 
10 Years" - ))) + - labs( - x = "Order", - y = "Mean absolute deviation", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_temporal_time_interval) - -# (Two-way interaction plots removed as requested) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Estimated Marginal Means only) -# ============================================================================= - -# Create emmeans for TIME main effect -emm_time_main <- emmeans(aov_model, ~ TIME) - -# Prepare emmeans data frame -emmeans_time_main <- emm_time_main %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Plot TIME main effect (only emmeans + error bars) -plot_time_main_effect <- ggplot(emmeans_time_main) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.9 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Temporal Direction", - y = "Absolute difference from the present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(plot_time_main_effect) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - DGEN_20251010165032.r b/.history/eohi2/mixed anova - DGEN_20251010165032.r deleted file mode 100644 index 4b40414..0000000 --- a/.history/eohi2/mixed anova - DGEN_20251010165032.r +++ /dev/null @@ -1,896 +0,0 @@ -# Mixed ANOVA Analysis for DGEN Variables -# EOHI Experiment 2 Data Analysis - DGEN Level Analysis with TIME, DOMAIN, and INTERVAL factors -# Variables: DGEN_past_5_Pref, DGEN_past_5_Pers, DGEN_past_5_Val, -# DGEN_past_10_Pref, DGEN_past_10_Pers, DGEN_past_10_Val, -# DGEN_fut_5_Pref, DGEN_fut_5_Pers, DGEN_fut_5_Val, -# DGEN_fut_10_Pref, 
DGEN_fut_10_Pers, DGEN_fut_10_Val, -# X5_10DGEN_past_pref, X5_10DGEN_past_pers, X5_10DGEN_past_val, -# X5_10DGEN_fut_pref, X5_10DGEN_fut_pers, X5_10DGEN_fut_val - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") -cat("anova - dgen", getwd(), "\n") - -# Read the data -data <- read.csv("eohi2.csv") - -# Verify the specific variables we need -required_vars <- c("DGEN_past_5_Pref", "DGEN_past_5_Pers", "DGEN_past_5_Val", - "DGEN_past_10_Pref", "DGEN_past_10_Pers", "DGEN_past_10_Val", - "DGEN_fut_5_Pref", "DGEN_fut_5_Pers", "DGEN_fut_5_Val", - "DGEN_fut_10_Pref", "DGEN_fut_10_Pers", "DGEN_fut_10_Val", - "X5_10DGEN_past_pref", "X5_10DGEN_past_pers", "X5_10DGEN_past_val", - "X5_10DGEN_fut_pref", "X5_10DGEN_fut_pers", "X5_10DGEN_fut_val") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} - -# Define variable mapping for the three within-subjects factors -variable_mapping <- data.frame( - variable = required_vars, - TIME = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - DOMAIN = rep(c("Preferences", "Personality", "Values"), 6), - INTERVAL = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Variable mapping:") -print(variable_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "DGEN_SCORE" - ) %>% - left_join(variable_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(TIME, levels = c("Past", "Future")), - DOMAIN = factor(DOMAIN, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - pID = as.factor(ResponseId), # Use ResponseId as participant ID - temporal_DO = as.factor(TEMPORAL_DO), - interval_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(pID, ResponseId, temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL, DGEN_SCORE) %>% - filter(!is.na(DGEN_SCORE)) - - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - median = round(median(DGEN_SCORE, na.rm = TRUE), 5), - q1 = round(quantile(DGEN_SCORE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(DGEN_SCORE, 0.75, na.rm = TRUE), 5), - min = 
round(min(DGEN_SCORE, na.rm = TRUE), 5), - max = round(max(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(temporal_DO, interval_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(DGEN_SCORE, na.rm = TRUE), 5), - variance = round(var(DGEN_SCORE, na.rm = TRUE), 5), - sd = round(sd(DGEN_SCORE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Calculate mean differences for key comparisons -print("\n=== KEY MEAN DIFFERENCES ===") - -# Past vs Future differences for each DOMAIN × INTERVAL combination -past_future_diffs <- long_data %>% - group_by(DOMAIN, INTERVAL, pID) %>% - summarise( - past_score = DGEN_SCORE[TIME == "Past"], - future_score = DGEN_SCORE[TIME == "Future"], - difference = past_score - future_score, - .groups = 'drop' - ) %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("Past vs Future differences by DOMAIN × INTERVAL:") -print(past_future_diffs) - -# 5 vs 10 interval differences for each TIME × DOMAIN combination -interval_diffs <- long_data %>% - group_by(TIME, DOMAIN, pID) %>% - summarise( - interval_5_score = DGEN_SCORE[INTERVAL == "5"], - interval_10_score = DGEN_SCORE[INTERVAL == "10"], - difference = interval_5_score - interval_10_score, - .groups = 'drop' - ) %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean_diff = round(mean(difference, na.rm = TRUE), 5), - sd_diff = round(sd(difference, na.rm = TRUE), 5), - se_diff = round(sd(difference, na.rm = TRUE) / sqrt(n()), 5), - .groups = 'drop' - ) - -print("\n5 vs 10 interval differences by TIME × DOMAIN:") -print(interval_diffs) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$DGEN_SCORE), ] - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(DGEN_SCORE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(DGEN_SCORE), - sd = sd(DGEN_SCORE), - q1 = quantile(DGEN_SCORE, 0.25), - q3 = quantile(DGEN_SCORE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(DGEN_SCORE < lower_bound | DGEN_SCORE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
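# --- Editorial aside (illustration only; not part of the original script) ---
# nortest::ad.test() requires more than 7 observations per cell and returns
# an A statistic with a p-value; a small p-value flags non-normality.
set.seed(1)
ad_p_normal <- nortest::ad.test(rnorm(30))$p.value # typically well above .05
ad_p_skewed <- nortest::ad.test(rexp(30))$p.value  # typically below .05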
Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$DGEN_SCORE)$statistic, - ad_p_value = ad.test(.data$DGEN_SCORE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance tests -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ TIME)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(DGEN_SCORE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(DGEN_SCORE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN:") -print(homogeneity_interval) - -# ============================================================================= -# HARTLEY'S F-MAX TEST WITH BOOTSTRAP CRITICAL VALUES -# ============================================================================= - -# More efficient bootstrap function for Hartley's F-max test -bootstrap_hartley_critical <- function(data, group_var, response_var, n_iter = 1000) { - # Get unique groups and their sample sizes - groups <- unique(data[[group_var]]) - - # Calculate observed variances for each group - observed_vars <- data %>% - dplyr::group_by(!!rlang::sym(group_var)) %>% - dplyr::summarise(var = var(!!rlang::sym(response_var), na.rm = TRUE), .groups = 'drop') %>% - dplyr::pull(var) - - # Handle invalid variances - if(any(observed_vars <= 0 | is.na(observed_vars))) { - observed_vars[observed_vars <= 0 | is.na(observed_vars)] <- 1e-10 - } - - # Calculate observed F-max ratio - observed_ratio <- max(observed_vars) / min(observed_vars) - - # Pre-allocate storage for bootstrap ratios - bootstrap_ratios <- numeric(n_iter) - - # Get group data once - group_data_list <- map(groups, ~ { - group_data <- data[data[[group_var]] == .x, response_var] - group_data[!is.na(group_data)] - }) - - # Bootstrap with pre-allocated storage - for(i in 1:n_iter) { - # Bootstrap sample from each group independently - sample_vars <- map_dbl(group_data_list, ~ { - bootstrap_sample <- sample(.x, size = length(.x), replace = TRUE) - var(bootstrap_sample, na.rm = TRUE) - }) - bootstrap_ratios[i] <- max(sample_vars) / min(sample_vars) - } - - # Remove invalid ratios - valid_ratios <- bootstrap_ratios[is.finite(bootstrap_ratios) & !is.na(bootstrap_ratios)] - - if(length(valid_ratios) == 0) { - stop("No valid bootstrap ratios generated") - } - - # Calculate critical value (95th percentile) - critical_95 
<- quantile(valid_ratios, 0.95, na.rm = TRUE) - - # Return only essential information - return(list( - observed_ratio = observed_ratio, - critical_95 = critical_95, - n_valid_iterations = length(valid_ratios) - )) -} - -# Hartley's F-max test across between-subjects factors within each within-subjects combination -print("\n=== HARTLEY'S F-MAX TEST RESULTS ===") -set.seed(123) # For reproducibility - -# Test across temporal_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across temporal_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_temporal_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(temporal_DO, DGEN_SCORE), "temporal_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_temporal_results) - -# Test across interval_DO within each TIME × DOMAIN × INTERVAL combination -print("F-max test across interval_DO within each TIME × DOMAIN × INTERVAL combination:") -hartley_interval_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - hartley_result = list(bootstrap_hartley_critical(pick(interval_DO, DGEN_SCORE), "interval_DO", "DGEN_SCORE")), - .groups = 'drop' - ) %>% - mutate( - observed_ratio = map_dbl(hartley_result, ~ .x$observed_ratio), - critical_95 = map_dbl(hartley_result, ~ .x$critical_95), - significant = observed_ratio > critical_95 - ) %>% - select(TIME, DOMAIN, INTERVAL, observed_ratio, critical_95, significant) - -print(hartley_interval_results) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check if design is balanced -design_balance <- table(long_data_clean$pID, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: temporal_DO (2 levels) × interval_DO (2 levels) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = DGEN_SCORE, - wid = pID, - between = .(temporal_DO, interval_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -print("Mauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("Greenhouse-Geisser and Huynh-Feldt Corrections:") - 
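# --- Editorial aside (sketch; the numbers in the example call are
# placeholders, not study results) ---
# A Greenhouse-Geisser correction rescales both degrees of freedom by the
# epsilon estimate and re-evaluates the same F statistic, which is what the
# corrected tables below report:
gg_corrected_p <- function(F_value, DFn, DFd, epsilon) {
  pf(F_value, df1 = DFn * epsilon, df2 = DFd * epsilon, lower.tail = FALSE)
}
# e.g. gg_corrected_p(F_value = 4.8, DFn = 2, DFd = 230, epsilon = 0.9)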
print(mixed_anova_model$`Sphericity Corrections`) - - print("=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("=== CORRECTED F-TESTS ===") - corrected_results <- data.frame( - Effect = corrected_df$Effect, - Original_F = anova_table$F[match(corrected_df$Effect, anova_table$Effect)], - Original_DFn = corrected_df$Original_DFn, - Original_DFd = corrected_df$Original_DFd, - GG_DFn = corrected_df$GG_DFn, - GG_DFd = corrected_df$GG_DFd, - HF_DFn = corrected_df$HF_DFn, - HF_DFd = corrected_df$HF_DFd, - GG_p = sphericity_corr$`p[GG]`, - HF_p = sphericity_corr$`p[HF]` - ) - print(corrected_results) -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -print("=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(DGEN_SCORE ~ temporal_DO * interval_DO * TIME * DOMAIN * INTERVAL + Error(pID/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effects -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print(time_emmeans) -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -print("Main Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print(domain_emmeans) -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -print("Main Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print(interval_emmeans) -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -print("Main Effect of temporal_DO:") -temporal_emmeans <- emmeans(aov_model, ~ temporal_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -print("Main Effect of interval_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ interval_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# TWO-WAY INTERACTION EXPLORATIONS -# 
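# --- Editorial aside (illustration only; not part of the original analysis) ---
# Every pairs(..., adjust = "bonferroni") call above multiplies each raw
# p-value by the number of comparisons in the family, capping at 1; base R's
# p.adjust() applies the same rule:
bonf_demo <- p.adjust(c(0.010, 0.020, 0.400), method = "bonferroni")
# bonf_demo == c(0.03, 0.06, 1.00) for this family of three comparisons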
============================================================================= - -# TIME × DOMAIN Interaction -print("=== TIME × DOMAIN INTERACTION ===") -time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -print(time_domain_emmeans) -time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -print(time_domain_simple) -time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(time_domain_simple2) - -# TIME × INTERVAL Interaction -print("=== TIME × INTERVAL INTERACTION ===") -time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) -print(time_interval_emmeans) -time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") -print(time_interval_simple) -time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(time_interval_simple2) - -# DOMAIN × INTERVAL Interaction -print("=== DOMAIN × INTERVAL INTERACTION ===") -domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) -print(domain_interval_emmeans) -domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") -print(domain_interval_simple) -domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") -print(domain_interval_simple2) - -# Between-subjects interactions -print("=== temporal_DO × interval_DO INTERACTION ===") -temporal_interval_do_emmeans <- emmeans(aov_model, ~ temporal_DO * interval_DO) -print(temporal_interval_do_emmeans) -temporal_interval_do_simple <- pairs(temporal_interval_do_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_interval_do_simple) -temporal_interval_do_simple2 <- pairs(temporal_interval_do_emmeans, by = "interval_DO", adjust = "bonferroni") -print(temporal_interval_do_simple2) - -# ============================================================================= -# THREE-WAY INTERACTION ANALYSES -# ============================================================================= - -# TIME × DOMAIN × INTERVAL Interaction -print("=== TIME × DOMAIN × INTERVAL INTERACTION ===") -three_way_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN * INTERVAL) -print(three_way_emmeans) -three_way_contrasts <- pairs(three_way_emmeans, by = c("DOMAIN", "INTERVAL"), adjust = "bonferroni") -print(three_way_contrasts) - -# Between-subjects × within-subjects interactions -print("=== temporal_DO × TIME INTERACTION ===") -temporal_time_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME) -print(temporal_time_emmeans) -temporal_time_simple <- pairs(temporal_time_emmeans, by = "temporal_DO", adjust = "bonferroni") -print(temporal_time_simple) -temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") -print(temporal_time_simple2) - -# temporal_DO × TIME × INTERVAL three-way interaction -print("=== temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_emmeans <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) -print(temporal_time_interval_emmeans) - -# Simple effects of TIME within each temporal_DO × INTERVAL combination -temporal_time_interval_simple1 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "INTERVAL"), adjust = "bonferroni") -print("Simple effects of TIME within each temporal_DO × INTERVAL:") -print(temporal_time_interval_simple1) - -# Simple effects of temporal_DO within each TIME × INTERVAL combination -temporal_time_interval_simple2 <- pairs(temporal_time_interval_emmeans, by = c("TIME", "INTERVAL"), adjust = 
"bonferroni") -print("Simple effects of temporal_DO within each TIME × INTERVAL:") -print(temporal_time_interval_simple2) - -# Simple effects of INTERVAL within each temporal_DO × TIME combination -temporal_time_interval_simple3 <- pairs(temporal_time_interval_emmeans, by = c("temporal_DO", "TIME"), adjust = "bonferroni") -print("Simple effects of INTERVAL within each temporal_DO × TIME:") -print(temporal_time_interval_simple3) - -# ============================================================================= -# COHEN'S D CALCULATIONS FOR SIGNIFICANT EFFECTS -# ============================================================================= - -print("=== COHEN'S D FOR SIGNIFICANT EFFECTS ===") - -# Function to calculate Cohen's d for pairwise comparisons -calculate_cohens_d_for_pairs <- function(pairs_df, data, group1_var, group2_var, response_var) { - significant_pairs <- pairs_df[pairs_df$p.value < 0.05, ] - - if(nrow(significant_pairs) > 0) { - print(significant_pairs) - - # Calculate Cohen's d for all significant pairs - cohens_d_results <- data.frame() - - for(i in seq_len(nrow(significant_pairs))) { - comparison <- significant_pairs[i, ] - contrast_name <- as.character(comparison$contrast) - - # Parse the contrast - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - # Get raw data for both conditions - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - group2_level <- as.character(comparison[[group2_var]]) - data1 <- data[[response_var]][ - data[[group1_var]] == level1 & - data[[group2_var]] == group2_level] - - data2 <- data[[response_var]][ - data[[group1_var]] == level2 & - data[[group2_var]] == group2_level] - } else { - data1 <- data[[response_var]][data[[group1_var]] == level1] - data2 <- data[[response_var]][data[[group1_var]] == level2] - } - - if(length(data1) > 0 && length(data2) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(data1, data2) - - result_row <- data.frame( - Comparison = contrast_name, - n1 = length(data1), - n2 = length(data2), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5) - ) - - if(!is.null(group2_var) && group2_var %in% colnames(comparison)) { - result_row$Group_level <- group2_level - } - - cohens_d_results <- rbind(cohens_d_results, result_row) - } - } - } - - if(nrow(cohens_d_results) > 0) { - print(cohens_d_results) - } - } else { - cat("No significant pairwise comparisons found.\n") - } -} - -# Calculate Cohen's d for main effects -time_contrasts_df <- as.data.frame(time_contrasts) -calculate_cohens_d_for_pairs(time_contrasts_df, long_data_clean, "TIME", NULL, "DGEN_SCORE") - -domain_contrasts_df <- as.data.frame(domain_contrasts) -calculate_cohens_d_for_pairs(domain_contrasts_df, long_data_clean, "DOMAIN", NULL, "DGEN_SCORE") - -interval_contrasts_df <- as.data.frame(interval_contrasts) -calculate_cohens_d_for_pairs(interval_contrasts_df, long_data_clean, "INTERVAL", NULL, "DGEN_SCORE") - -# Calculate Cohen's d for two-way interactions -time_domain_simple_df <- as.data.frame(time_domain_simple) -calculate_cohens_d_for_pairs(time_domain_simple_df, long_data_clean, "DOMAIN", "TIME", "DGEN_SCORE") - -time_domain_simple2_df <- as.data.frame(time_domain_simple2) -calculate_cohens_d_for_pairs(time_domain_simple2_df, long_data_clean, "TIME", "DOMAIN", "DGEN_SCORE") - -# Calculate Cohen's d for temporal_DO × 
TIME × INTERVAL interaction -print("=== COHEN'S D FOR temporal_DO × TIME × INTERVAL INTERACTION ===") -temporal_time_interval_simple1_df <- as.data.frame(temporal_time_interval_simple1) -significant_temporal_time_interval <- temporal_time_interval_simple1_df[temporal_time_interval_simple1_df$p.value < 0.05, ] - -if(nrow(significant_temporal_time_interval) > 0) { - temporal_time_interval_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_temporal_time_interval))) { - comparison <- significant_temporal_time_interval[i, ] - - # Extract the grouping variables - temporal_level <- as.character(comparison$temporal_DO) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this temporal_DO × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$temporal_DO == temporal_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - temporal_DO = temporal_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - temporal_time_interval_cohens_d <- rbind(temporal_time_interval_cohens_d, result_row) - } - } - - if(nrow(temporal_time_interval_cohens_d) > 0) { - print(temporal_time_interval_cohens_d) - } -} else { - cat("No significant TIME effects found within any temporal_DO × INTERVAL combination.\n") -} - -# Calculate Cohen's d for three-way interaction -three_way_contrasts_df <- as.data.frame(three_way_contrasts) -significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - -if(nrow(significant_three_way) > 0) { - three_way_cohens_d <- data.frame() - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - domain_level <- as.character(comparison$DOMAIN) - interval_level <- as.character(comparison$INTERVAL) - - # Get data for Past and Future within this DOMAIN × INTERVAL combination - past_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$DGEN_SCORE[ - long_data_clean$DOMAIN == domain_level & - long_data_clean$INTERVAL == interval_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - result_row <- data.frame( - DOMAIN = domain_level, - INTERVAL = interval_level, - Comparison = "Past vs Future", - n_Past = length(past_data), - n_Future = length(future_data), - Cohens_d = round(cohens_d_result$estimate, 5), - Effect_size = cohens_d_result$magnitude, - p_value = round(comparison$p.value, 5), - Estimated_difference = round(comparison$estimate, 5) - ) - - three_way_cohens_d <- rbind(three_way_cohens_d, result_row) - } - } - - if(nrow(three_way_cohens_d) > 0) { - 
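# --- Editorial aside (sketch; assumes the effsize defaults of pooled SD,
# unpaired samples, and no Hedges correction, matching how cohen.d() is
# called above) ---
cohens_d_by_hand <- function(x, y) {
  sp <- sqrt(((length(x) - 1) * var(x) + (length(y) - 1) * var(y)) /
               (length(x) + length(y) - 2))
  (mean(x) - mean(y)) / sp
}
# cohens_d_by_hand(past_data, future_data) should closely match
# cohens_d_result$estimate from the final loop iteration above.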
print(three_way_cohens_d) - } -} else { - cat("No significant TIME effects found within any DOMAIN × INTERVAL combination.\n") -} - -# ============================================================================= -# COLOR PALETTE FOR PLOTS -# ============================================================================= - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# ============================================================================= -# INTERACTION PLOT: temporal_DO × TIME × INTERVAL (temporal_DO on x-axis, TIME in legend, INTERVAL as facets) -# ============================================================================= - -# Create emmeans for temporal_DO × TIME × INTERVAL -emm_temporal_time_interval_plot <- emmeans(aov_model, ~ temporal_DO * TIME * INTERVAL) - -# Prepare emmeans data frame for the plot -emmeans_temporal_time_interval_plot <- emm_temporal_time_interval_plot %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - temporal_DO = factor(temporal_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL, levels = c("5", "10", "5_10")), - x_pos = as.numeric(temporal_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create temporal_DO × TIME × INTERVAL interaction plot with facets -interaction_plot_temporal_time_interval <- ggplot(emmeans_temporal_time_interval_plot) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1, labeller = labeller(INTERVAL = c( - "5" = "Present v. 5 Years", - "10" = "Present v. 10 Years", - "5_10" = "5 Years v. 
10 Years" - ))) + - labs( - x = "Order", - y = "Mean absolute deviation", - title = "temporal_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_temporal_time_interval) - -# (Two-way interaction plots removed as requested) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Estimated Marginal Means only) -# ============================================================================= - -# Create emmeans for TIME main effect -emm_time_main <- emmeans(aov_model, ~ TIME) - -# Prepare emmeans data frame -emmeans_time_main <- emm_time_main %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Plot TIME main effect (only emmeans + error bars) -plot_time_main_effect <- ggplot(emmeans_time_main) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.9 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Temporal Direction", - y = "Absolute difference from the present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(plot_time_main_effect) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251003143914.r b/.history/eohi2/mixed anova - domain means_20251003143914.r deleted file mode 100644 index e69de29..0000000 diff --git a/.history/eohi2/mixed anova - domain means_20251003143942.r b/.history/eohi2/mixed anova - domain means_20251003143942.r deleted file mode 100644 index cf12456..0000000 --- a/.history/eohi2/mixed anova - domain means_20251003143942.r +++ /dev/null @@ -1,21 +0,0 @@ -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) 
-library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251003145806.r b/.history/eohi2/mixed anova - domain means_20251003145806.r deleted file mode 100644 index 6096dab..0000000 --- a/.history/eohi2/mixed anova - domain means_20251003145806.r +++ /dev/null @@ -1,88 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. -# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "5.10past_pref_MEAN", "5.10past_pers_MEAN", "5.10past_val_MEAN", - "5.10fut_pref_MEAN", "5.10fut_pers_MEAN", "5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Domain mapping created:") -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - 
cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251003145820.r b/.history/eohi2/mixed anova - domain means_20251003145820.r deleted file mode 100644 index f17da36..0000000 --- a/.history/eohi2/mixed anova - domain means_20251003145820.r +++ /dev/null @@ -1,139 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. -# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "5.10past_pref_MEAN", "5.10past_pers_MEAN", "5.10past_val_MEAN", - "5.10fut_pref_MEAN", "5.10fut_pers_MEAN", "5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = 
c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Domain mapping created:") -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251003145845.r b/.history/eohi2/mixed anova - domain means_20251003145845.r deleted file mode 100644 index 9f268b0..0000000 --- a/.history/eohi2/mixed anova - domain means_20251003145845.r +++ /dev/null @@ -1,280 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. 
-# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. -# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "5.10past_pref_MEAN", "5.10past_pers_MEAN", "5.10past_val_MEAN", - "5.10fut_pref_MEAN", "5.10fut_pers_MEAN", "5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Domain mapping created:") -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# 
============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
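# --- Editorial aside (illustration only; not part of the original script) ---
# car::leveneTest(y ~ g) defaults to center = median, i.e. the
# Brown-Forsythe variant: a one-way ANOVA on absolute deviations from each
# group's median. The same p-value by hand:
levene_p_by_hand <- function(y, g) {
  z <- abs(y - ave(y, g, FUN = median))
  anova(lm(z ~ g))$`Pr(>F)`[1]
}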
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:") -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors -print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===") - -# Check what values the between-subjects factors actually have -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("Unique INTERVAL_DO values:") -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination -print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===") - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251003145918.r b/.history/eohi2/mixed anova - domain means_20251003145918.r deleted file mode 100644 index 159f53f..0000000 --- 
a/.history/eohi2/mixed anova - domain means_20251003145918.r +++ /dev/null @@ -1,434 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. -# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "5.10past_pref_MEAN", "5.10past_pers_MEAN", "5.10past_val_MEAN", - "5.10fut_pref_MEAN", "5.10fut_pers_MEAN", "5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Domain mapping created:") -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data 
dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:") -print(homogeneity_interval) - -# 5. 
Hartley's F-max test for between-subjects factors -print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===") - -# Check what values the between-subjects factors actually have -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("Unique INTERVAL_DO values:") -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination -print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===") - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with 
sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - - # Between-subjects effects (no sphericity corrections needed) - cat("\nBETWEEN-SUBJECTS EFFECTS:\n") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - cat("\nWITHIN-SUBJECTS EFFECTS:\n") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - 
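# Note: the DOMAIN and INTERVAL lines below report the *uncorrected* df and
# p-values even though the comments flag them as needing sphericity correction.
# Their Greenhouse-Geisser/Huynh-Feldt versions are printed later in the
# corrected_df loop, which ezANOVA populates for every within-subjects effect
# with more than two levels, not only the interactions its banner names.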
cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - cat(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - cat("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251003145949.r b/.history/eohi2/mixed anova - domain means_20251003145949.r deleted file mode 100644 index 2155e21..0000000 --- a/.history/eohi2/mixed anova - domain means_20251003145949.r +++ /dev/null @@ -1,611 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. 
-# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "5.10past_pref_MEAN", "5.10past_pers_MEAN", "5.10past_val_MEAN", - "5.10fut_pref_MEAN", "5.10fut_pers_MEAN", "5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Domain mapping created:") -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = 
round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:") -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors -print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===") - -# Check what values the between-subjects factors actually have -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("Unique INTERVAL_DO values:") -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination -print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===") - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data 
dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - - # Between-subjects effects (no sphericity corrections needed) - cat("\nBETWEEN-SUBJECTS EFFECTS:\n") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", 
"TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - cat("\nWITHIN-SUBJECTS EFFECTS:\n") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - cat(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - cat("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * 
INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO -print("\nMain Effect of INTERVAL_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (if significant) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - 
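      # Note: data1/data2 pool every observation at each DOMAIN level, so this
      # Cohen's d treats a within-subjects contrast as if the groups were
      # independent. effsize::cohen.d() also accepts paired = TRUE, but that
      # requires equal-length, participant-aligned vectors (e.g., one mean per
      # participant per level), which these pooled vectors are not.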
- if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# Main Effect of INTERVAL (if significant) -print("\n=== COHEN'S D FOR INTERVAL MAIN EFFECT ===") -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) -print("INTERVAL main effect contrasts:") -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - cat("\nCohen's d for significant INTERVAL contrasts:\n") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", interval_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", interval_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_interval$p.value[i])) - cat("\n") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - -# Note: Detailed interaction analyses would be added here if significant interactions are found -# For now, we'll provide a framework for the most common interactions - -print("\n=== INTERACTION EXPLORATIONS ===") -print("Note: Detailed interaction analyses will be performed for significant interactions") -print("Check the ANOVA results above to identify which interactions are significant") - -# Example framework for TIME × DOMAIN interaction (if significant) -# if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { -# print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") -# time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -# print("Estimated Marginal Means:") -# print(time_domain_emmeans) -# -# print("\nSimple Effects of DOMAIN within each TIME:") -# time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -# print(time_domain_simple) -# -# print("\nSimple Effects of TIME within each DOMAIN:") -# time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -# print(time_domain_simple2) -# } - -print("\n=== ANALYSIS COMPLETE ===") -print("Mixed ANOVA analysis with three within-subjects factors (TIME, DOMAIN, INTERVAL)") -print("and two between-subjects factors (TEMPORAL_DO, INTERVAL_DO) completed.") -print("Check the results above for significant effects and perform additional") -print("interaction analyses as needed based 
on the significance patterns.") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251003145955.r b/.history/eohi2/mixed anova - domain means_20251003145955.r deleted file mode 100644 index 2155e21..0000000 --- a/.history/eohi2/mixed anova - domain means_20251003145955.r +++ /dev/null @@ -1,611 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. -# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "5.10past_pref_MEAN", "5.10past_pers_MEAN", "5.10past_val_MEAN", - "5.10fut_pref_MEAN", "5.10fut_pers_MEAN", "5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Domain mapping created:") -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = 
as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:") -print(homogeneity_interval) - -# 5. 
Hartley's F-max test for between-subjects factors -print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===") - -# Check what values the between-subjects factors actually have -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("Unique INTERVAL_DO values:") -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination -print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===") - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with 
sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - - # Between-subjects effects (no sphericity corrections needed) - cat("\nBETWEEN-SUBJECTS EFFECTS:\n") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - cat("\nWITHIN-SUBJECTS EFFECTS:\n") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - 
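# Note: the GG/HF degrees of freedom assembled in corrected_df above are just
# the original df scaled by the corresponding epsilon (df_corrected = epsilon * df).
# A one-line sketch with hypothetical values DFn = 2, DFd = 196, GGe = 0.85:
c(2, 196) * 0.85   # -> 1.70 166.60, the df pair used for the GG-corrected F-test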
cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - cat(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - cat("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO -print("\nMain Effect of INTERVAL_DO:") -interval_do_emmeans <- 
emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (if significant) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# Main Effect of INTERVAL (if significant) -print("\n=== COHEN'S D FOR INTERVAL MAIN EFFECT ===") -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) -print("INTERVAL main effect contrasts:") -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - cat("\nCohen's d for significant INTERVAL contrasts:\n") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- 
long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", interval_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", interval_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_interval$p.value[i])) - cat("\n") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - -# Note: Detailed interaction analyses would be added here if significant interactions are found -# For now, we'll provide a framework for the most common interactions - -print("\n=== INTERACTION EXPLORATIONS ===") -print("Note: Detailed interaction analyses will be performed for significant interactions") -print("Check the ANOVA results above to identify which interactions are significant") - -# Example framework for TIME × DOMAIN interaction (if significant) -# if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { -# print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") -# time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -# print("Estimated Marginal Means:") -# print(time_domain_emmeans) -# -# print("\nSimple Effects of DOMAIN within each TIME:") -# time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -# print(time_domain_simple) -# -# print("\nSimple Effects of TIME within each DOMAIN:") -# time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -# print(time_domain_simple2) -# } - -print("\n=== ANALYSIS COMPLETE ===") -print("Mixed ANOVA analysis with three within-subjects factors (TIME, DOMAIN, INTERVAL)") -print("and two between-subjects factors (TEMPORAL_DO, INTERVAL_DO) completed.") -print("Check the results above for significant effects and perform additional") -print("interaction analyses as needed based on the significance patterns.") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251003150004.r b/.history/eohi2/mixed anova - domain means_20251003150004.r deleted file mode 100644 index 2155e21..0000000 --- a/.history/eohi2/mixed anova - domain means_20251003150004.r +++ /dev/null @@ -1,611 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. 
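-# (Note: the 5.10* names below begin with a digit, so read.csv() with its
-#  default check.names = TRUE imports them as X5.10...; the missing-variable
-#  check below will then warn, and select(all_of(required_vars)) will error.
-#  Passing check.names = FALSE to read.csv() preserves the names as-is.)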
-# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "5.10past_pref_MEAN", "5.10past_pers_MEAN", "5.10past_val_MEAN", - "5.10fut_pref_MEAN", "5.10fut_pers_MEAN", "5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Domain mapping created:") -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = 
round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:") -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors -print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===") - -# Check what values the between-subjects factors actually have -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("Unique INTERVAL_DO values:") -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination -print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===") - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data 
dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - cat("\n=== CORRECTED DEGREES OF FREEDOM ===\n") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - - # Between-subjects effects (no sphericity corrections needed) - cat("\nBETWEEN-SUBJECTS EFFECTS:\n") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", 
"TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - cat("\nWITHIN-SUBJECTS EFFECTS:\n") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - cat(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - cat("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * 
INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO -print("\nMain Effect of INTERVAL_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (if significant) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - 
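-      # Note: cohen.d() from effsize is applied here to the pooled observations,
-      # treating data1 and data2 as independent samples even though DOMAIN is a
-      # within-subjects factor; with participant-matched vectors of equal length,
-      # cohen.d(data1, data2, paired = TRUE) would give a within-subjects
-      # estimate instead.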
- if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# Main Effect of INTERVAL (if significant) -print("\n=== COHEN'S D FOR INTERVAL MAIN EFFECT ===") -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) -print("INTERVAL main effect contrasts:") -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - cat("\nCohen's d for significant INTERVAL contrasts:\n") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", interval_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", interval_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_interval$p.value[i])) - cat("\n") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - -# Note: Detailed interaction analyses would be added here if significant interactions are found -# For now, we'll provide a framework for the most common interactions - -print("\n=== INTERACTION EXPLORATIONS ===") -print("Note: Detailed interaction analyses will be performed for significant interactions") -print("Check the ANOVA results above to identify which interactions are significant") - -# Example framework for TIME × DOMAIN interaction (if significant) -# if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { -# print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") -# time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -# print("Estimated Marginal Means:") -# print(time_domain_emmeans) -# -# print("\nSimple Effects of DOMAIN within each TIME:") -# time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -# print(time_domain_simple) -# -# print("\nSimple Effects of TIME within each DOMAIN:") -# time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -# print(time_domain_simple2) -# } - -print("\n=== ANALYSIS COMPLETE ===") -print("Mixed ANOVA analysis with three within-subjects factors (TIME, DOMAIN, INTERVAL)") -print("and two between-subjects factors (TEMPORAL_DO, INTERVAL_DO) completed.") -print("Check the results above for significant effects and perform additional") -print("interaction analyses as needed based 
on the significance patterns.") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251003150017.r b/.history/eohi2/mixed anova - domain means_20251003150017.r deleted file mode 100644 index 33a17bb..0000000 --- a/.history/eohi2/mixed anova - domain means_20251003150017.r +++ /dev/null @@ -1,611 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. -# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "5.10past_pref_MEAN", "5.10past_pers_MEAN", "5.10past_val_MEAN", - "5.10fut_pref_MEAN", "5.10fut_pers_MEAN", "5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Domain mapping created:") -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = 
as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:") -print(homogeneity_interval) - -# 5. 
Hartley's F-max test for between-subjects factors -print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===") - -# Check what values the between-subjects factors actually have -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("Unique INTERVAL_DO values:") -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination -print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===") - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with 
sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - cat("\n=== CORRECTED F-TESTS ===\n") - - # Between-subjects effects (no sphericity corrections needed) - cat("\nBETWEEN-SUBJECTS EFFECTS:\n") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - cat("\nWITHIN-SUBJECTS EFFECTS:\n") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - 
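# Note: dfn, dfd, and p here are the uncorrected values. For DOMAIN and
-    # INTERVAL (3 levels each), the GG/HF-corrected p-values appear with the
-    # sphericity-corrected results below; a corrected p can also be recomputed
-    # directly, e.g. pf(f_value, dfn * gg_eps, dfd * gg_eps, lower.tail = FALSE),
-    # where gg_eps is the effect's Greenhouse-Geisser epsilon (GGe).
-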
cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - cat(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - cat("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO -print("\nMain Effect of INTERVAL_DO:") -interval_do_emmeans <- 
emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (if significant) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# Main Effect of INTERVAL (if significant) -print("\n=== COHEN'S D FOR INTERVAL MAIN EFFECT ===") -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) -print("INTERVAL main effect contrasts:") -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - cat("\nCohen's d for significant INTERVAL contrasts:\n") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- 
long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", interval_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", interval_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_interval$p.value[i])) - cat("\n") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - -# Note: Detailed interaction analyses would be added here if significant interactions are found -# For now, we'll provide a framework for the most common interactions - -print("\n=== INTERACTION EXPLORATIONS ===") -print("Note: Detailed interaction analyses will be performed for significant interactions") -print("Check the ANOVA results above to identify which interactions are significant") - -# Example framework for TIME × DOMAIN interaction (if significant) -# if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { -# print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") -# time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -# print("Estimated Marginal Means:") -# print(time_domain_emmeans) -# -# print("\nSimple Effects of DOMAIN within each TIME:") -# time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -# print(time_domain_simple) -# -# print("\nSimple Effects of TIME within each DOMAIN:") -# time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -# print(time_domain_simple2) -# } - -print("\n=== ANALYSIS COMPLETE ===") -print("Mixed ANOVA analysis with three within-subjects factors (TIME, DOMAIN, INTERVAL)") -print("and two between-subjects factors (TEMPORAL_DO, INTERVAL_DO) completed.") -print("Check the results above for significant effects and perform additional") -print("interaction analyses as needed based on the significance patterns.") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251003150021.r b/.history/eohi2/mixed anova - domain means_20251003150021.r deleted file mode 100644 index e9b5cf7..0000000 --- a/.history/eohi2/mixed anova - domain means_20251003150021.r +++ /dev/null @@ -1,611 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. 
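-# (Note: as in the sibling snapshots, the digit-leading 5.10* names below are
-#  imported as X5.10... unless read.csv() is called with check.names = FALSE.)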
-# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "5.10past_pref_MEAN", "5.10past_pers_MEAN", "5.10past_val_MEAN", - "5.10fut_pref_MEAN", "5.10fut_pers_MEAN", "5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Domain mapping created:") -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = 
round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:") -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors -print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===") - -# Check what values the between-subjects factors actually have -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("Unique INTERVAL_DO values:") -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination -print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===") - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data 
dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", 
"TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - cat("\nWITHIN-SUBJECTS EFFECTS:\n") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - cat(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - cat("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * 
INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO -print("\nMain Effect of INTERVAL_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (if significant) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - 
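# --- Sketch (editorial, not part of the original script) --------------------
# cohen.d() as called in this section pools every observation at each factor
# level and treats the two levels as independent samples, even though each
# participant contributes to both levels of a within-subjects factor. If a
# within-subject effect size is wanted instead, Cohen's d_z on per-participant
# difference scores is one option. The helper below is illustrative only:
# d_z_for_contrast is a hypothetical name, and it assumes the long_data_clean
# layout built earlier (one MEAN_DIFFERENCE value per ResponseId cell).
d_z_for_contrast <- function(df, factor_name, level1, level2) {
  wide <- df %>%
    dplyr::group_by(ResponseId, .data[[factor_name]]) %>%
    dplyr::summarise(m = mean(MEAN_DIFFERENCE, na.rm = TRUE), .groups = "drop") %>%
    tidyr::pivot_wider(names_from = dplyr::all_of(factor_name), values_from = m)
  diffs <- wide[[level1]] - wide[[level2]]
  mean(diffs, na.rm = TRUE) / sd(diffs, na.rm = TRUE)  # d_z = mean(diff) / sd(diff)
}
# Hypothetical usage: d_z_for_contrast(long_data_clean, "DOMAIN", "Preferences", "Personality")
# -----------------------------------------------------------------------------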
-      if(length(data1) > 0 && length(data2) > 0) {
-        domain_cohens_d <- cohen.d(data1, data2)
-        cat(sprintf("Comparison: %s\n", contrast_name))
-        cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2)))
-        cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate))
-        cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude))
-        cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i]))
-        cat("\n")
-      }
-    }
-  }
-}
-
-# Main Effect of INTERVAL (if significant)
-print("\n=== COHEN'S D FOR INTERVAL MAIN EFFECT ===")
-interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni")
-interval_main_df <- as.data.frame(interval_main_contrast)
-print("INTERVAL main effect contrasts:")
-print(interval_main_df)
-
-# Calculate Cohen's d for significant INTERVAL contrasts
-significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ]
-if(nrow(significant_interval) > 0) {
-  cat("\nCohen's d for significant INTERVAL contrasts:\n")
-  for(i in seq_len(nrow(significant_interval))) {
-    contrast_name <- as.character(significant_interval$contrast[i])
-    contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-    if(length(contrast_parts) == 2) {
-      level1 <- trimws(contrast_parts[1])
-      level2 <- trimws(contrast_parts[2])
-
-      data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1]
-      data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2]
-
-      if(length(data1) > 0 && length(data2) > 0) {
-        interval_cohens_d <- cohen.d(data1, data2)
-        cat(sprintf("Comparison: %s\n", contrast_name))
-        cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2)))
-        cat(sprintf(" Cohen's d: %.5f\n", interval_cohens_d$estimate))
-        cat(sprintf(" Effect size interpretation: %s\n", interval_cohens_d$magnitude))
-        cat(sprintf(" p-value: %.5f\n", significant_interval$p.value[i]))
-        cat("\n")
-      }
-    }
-  }
-}
-
-# =============================================================================
-# INTERACTION EXPLORATIONS (if significant)
-# =============================================================================
-
-# Note: Detailed interaction analyses would be added here if significant interactions are found
-# For now, we'll provide a framework for the most common interactions
-
-print("\n=== INTERACTION EXPLORATIONS ===")
-print("Note: Detailed interaction analyses will be performed for significant interactions")
-print("Check the ANOVA results above to identify which interactions are significant")
-
-# Example framework for TIME × DOMAIN interaction (if significant)
-# if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) {
-#   print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===")
-#   time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN)
-#   print("Estimated Marginal Means:")
-#   print(time_domain_emmeans)
-#
-#   print("\nSimple Effects of DOMAIN within each TIME:")
-#   time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-#   print(time_domain_simple)
-#
-#   print("\nSimple Effects of TIME within each DOMAIN:")
-#   time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-#   print(time_domain_simple2)
-# }
-
-print("\n=== ANALYSIS COMPLETE ===")
-print("Mixed ANOVA analysis with three within-subjects factors (TIME, DOMAIN, INTERVAL)")
-print("and two between-subjects factors (TEMPORAL_DO, INTERVAL_DO) completed.")
-print("Check the results above for significant effects and perform additional")
-print("interaction analyses as needed based on the significance patterns.")
\ No newline at end of file
diff --git a/.history/eohi2/mixed anova - domain means_20251003150023.r b/.history/eohi2/mixed anova - domain means_20251003150023.r
deleted file mode 100644
index 40596e5..0000000
--- a/.history/eohi2/mixed anova - domain means_20251003150023.r
+++ /dev/null
@@ -1,611 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means - EOHI2
-# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor
-# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc.
-# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc.
-# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc.
-# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc.
-# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN
-# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex) # For aov_ez (cleaner ANOVA output)
-library(nortest) # For normality tests
-library(emmeans) # For post-hoc comparisons
-library(purrr) # For map functions
-library(effsize) # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-# Read the data
-data <- read.csv("eohi2.csv")
-
-# Display basic information about the dataset
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x")))
-print(paste("Number of participants:", length(unique(data$ResponseId))))
-
-# Verify the specific variables we need
-required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN",
-                   "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN",
-                   "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN",
-                   "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN",
-                   "5.10past_pref_MEAN", "5.10past_pers_MEAN", "5.10past_val_MEAN",
-                   "5.10fut_pref_MEAN", "5.10fut_pers_MEAN", "5.10fut_val_MEAN")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required domain mean variables found!")
-}
-
-# Define domain mapping with TIME, DOMAIN, and INTERVAL factors
-domain_mapping <- data.frame(
-  variable = required_vars,
-  time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3),
-           rep("Past", 3), rep("Future", 3)),
-  domain = rep(c("Preferences", "Personality", "Values"), 6),
-  interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3),
-               rep("5_10", 3), rep("5_10", 3)),
-  stringsAsFactors = FALSE
-)
-
-print("Domain mapping created:")
-print(domain_mapping)
-
-# Efficient data pivoting using pivot_longer
-long_data <- data %>%
-  select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>%
-  pivot_longer(
-    cols = all_of(required_vars),
-    names_to = "variable",
-    values_to = "MEAN_DIFFERENCE"
-  ) %>%
-  left_join(domain_mapping, by = "variable") %>%
-  # Convert to factors with proper levels
-  mutate(
-    TIME = factor(time, levels = c("Past", "Future")),
-    DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")),
-    INTERVAL = factor(interval, levels = c("5", "10", "5_10")),
-    ResponseId = as.factor(ResponseId),
-    TEMPORAL_DO = as.factor(TEMPORAL_DO),
-    INTERVAL_DO =
as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:") -print(homogeneity_interval) - -# 5. 
Hartley's F-max test for between-subjects factors -print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===") - -# Check what values the between-subjects factors actually have -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("Unique INTERVAL_DO values:") -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination -print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===") - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with 
sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - 
cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - cat(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - cat("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:\n") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO -print("\nMain Effect of INTERVAL_DO:") -interval_do_emmeans <- 
emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (if significant) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# Main Effect of INTERVAL (if significant) -print("\n=== COHEN'S D FOR INTERVAL MAIN EFFECT ===") -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) -print("INTERVAL main effect contrasts:") -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - cat("\nCohen's d for significant INTERVAL contrasts:\n") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- 
long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1]
-      data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2]
-
-      if(length(data1) > 0 && length(data2) > 0) {
-        interval_cohens_d <- cohen.d(data1, data2)
-        cat(sprintf("Comparison: %s\n", contrast_name))
-        cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2)))
-        cat(sprintf(" Cohen's d: %.5f\n", interval_cohens_d$estimate))
-        cat(sprintf(" Effect size interpretation: %s\n", interval_cohens_d$magnitude))
-        cat(sprintf(" p-value: %.5f\n", significant_interval$p.value[i]))
-        cat("\n")
-      }
-    }
-  }
-}
-
-# =============================================================================
-# INTERACTION EXPLORATIONS (if significant)
-# =============================================================================
-
-# Note: Detailed interaction analyses would be added here if significant interactions are found
-# For now, we'll provide a framework for the most common interactions
-
-print("\n=== INTERACTION EXPLORATIONS ===")
-print("Note: Detailed interaction analyses will be performed for significant interactions")
-print("Check the ANOVA results above to identify which interactions are significant")
-
-# Example framework for TIME × DOMAIN interaction (if significant)
-# if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) {
-#   print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===")
-#   time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN)
-#   print("Estimated Marginal Means:")
-#   print(time_domain_emmeans)
-#
-#   print("\nSimple Effects of DOMAIN within each TIME:")
-#   time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni")
-#   print(time_domain_simple)
-#
-#   print("\nSimple Effects of TIME within each DOMAIN:")
-#   time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni")
-#   print(time_domain_simple2)
-# }
-
-print("\n=== ANALYSIS COMPLETE ===")
-print("Mixed ANOVA analysis with three within-subjects factors (TIME, DOMAIN, INTERVAL)")
-print("and two between-subjects factors (TEMPORAL_DO, INTERVAL_DO) completed.")
-print("Check the results above for significant effects and perform additional")
-print("interaction analyses as needed based on the significance patterns.")
\ No newline at end of file
diff --git a/.history/eohi2/mixed anova - domain means_20251003150026.r b/.history/eohi2/mixed anova - domain means_20251003150026.r
deleted file mode 100644
index 463142f..0000000
--- a/.history/eohi2/mixed anova - domain means_20251003150026.r
+++ /dev/null
@@ -1,611 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means - EOHI2
-# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor
-# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc.
-# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc.
-# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc.
-# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc.
-# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "5.10past_pref_MEAN", "5.10past_pers_MEAN", "5.10past_val_MEAN", - "5.10fut_pref_MEAN", "5.10fut_pers_MEAN", "5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Domain mapping created:") -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = 
round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:") -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors -print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===") - -# Check what values the between-subjects factors actually have -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("Unique INTERVAL_DO values:") -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination -print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===") - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data 
dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", 
"TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - cat(sprintf("%s: F(%d, %d) = %.3f, p = %.6f\n", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - cat(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * 
INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO -print("\nMain Effect of INTERVAL_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (if significant) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - 
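-# (Editor's note) This step assumes the emmeans contrast label splits cleanly
-# on " - ", which holds for the DOMAIN levels used here (Preferences,
-# Personality, Values) but would break if a level name itself contained
-# " - ". A more defensive sketch, should the level names ever change:
-# lv <- levels(long_data_clean$DOMAIN)
-# hits <- lv[vapply(lv, function(l) grepl(l, contrast_name, fixed = TRUE), logical(1))]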
- if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# Main Effect of INTERVAL (if significant) -print("\n=== COHEN'S D FOR INTERVAL MAIN EFFECT ===") -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) -print("INTERVAL main effect contrasts:") -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - cat("\nCohen's d for significant INTERVAL contrasts:\n") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", interval_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", interval_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_interval$p.value[i])) - cat("\n") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - -# Note: Detailed interaction analyses would be added here if significant interactions are found -# For now, we'll provide a framework for the most common interactions - -print("\n=== INTERACTION EXPLORATIONS ===") -print("Note: Detailed interaction analyses will be performed for significant interactions") -print("Check the ANOVA results above to identify which interactions are significant") - -# Example framework for TIME × DOMAIN interaction (if significant) -# if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { -# print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") -# time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -# print("Estimated Marginal Means:") -# print(time_domain_emmeans) -# -# print("\nSimple Effects of DOMAIN within each TIME:") -# time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -# print(time_domain_simple) -# -# print("\nSimple Effects of TIME within each DOMAIN:") -# time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -# print(time_domain_simple2) -# } - -print("\n=== ANALYSIS COMPLETE ===") -print("Mixed ANOVA analysis with three within-subjects factors (TIME, DOMAIN, INTERVAL)") -print("and two between-subjects factors (TEMPORAL_DO, INTERVAL_DO) completed.") -print("Check the results above for significant effects and perform additional") -print("interaction analyses as needed based 
on the significance patterns.") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251003150029.r b/.history/eohi2/mixed anova - domain means_20251003150029.r deleted file mode 100644 index 4990d92..0000000 --- a/.history/eohi2/mixed anova - domain means_20251003150029.r +++ /dev/null @@ -1,611 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. -# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "5.10past_pref_MEAN", "5.10past_pers_MEAN", "5.10past_val_MEAN", - "5.10fut_pref_MEAN", "5.10fut_pers_MEAN", "5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Domain mapping created:") -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = 
as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:") -print(homogeneity_interval) - -# 5. 
Hartley's F-max test for between-subjects factors -print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===") - -# Check what values the between-subjects factors actually have -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("Unique INTERVAL_DO values:") -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination -print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===") - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with 
sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - cat(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)\n", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - 
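-# (Editor's note) The p reported on the next line is the uncorrected,
-# sphericity-assuming value; when Mauchly's test flags DOMAIN, the GG/HF-
-# corrected p printed in the corrections block below is the one to report.
-# Sanity-check sketch, using the names defined above: a corrected p can be
-# recomputed as pf(f_value, dfn * eps, dfd * eps, lower.tail = FALSE), with
-# eps taken from sphericity_corr$GGe or sphericity_corr$HFe for the
-# matching Effect row.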
cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - cat(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO -print("\nMain Effect of INTERVAL_DO:") -interval_do_emmeans <- 
emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (if significant) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# Main Effect of INTERVAL (if significant) -print("\n=== COHEN'S D FOR INTERVAL MAIN EFFECT ===") -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) -print("INTERVAL main effect contrasts:") -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - cat("\nCohen's d for significant INTERVAL contrasts:\n") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- 
long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", interval_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", interval_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_interval$p.value[i])) - cat("\n") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - -# Note: Detailed interaction analyses would be added here if significant interactions are found -# For now, we'll provide a framework for the most common interactions - -print("\n=== INTERACTION EXPLORATIONS ===") -print("Note: Detailed interaction analyses will be performed for significant interactions") -print("Check the ANOVA results above to identify which interactions are significant") - -# Example framework for TIME × DOMAIN interaction (if significant) -# if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { -# print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") -# time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -# print("Estimated Marginal Means:") -# print(time_domain_emmeans) -# -# print("\nSimple Effects of DOMAIN within each TIME:") -# time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -# print(time_domain_simple) -# -# print("\nSimple Effects of TIME within each DOMAIN:") -# time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -# print(time_domain_simple2) -# } - -print("\n=== ANALYSIS COMPLETE ===") -print("Mixed ANOVA analysis with three within-subjects factors (TIME, DOMAIN, INTERVAL)") -print("and two between-subjects factors (TEMPORAL_DO, INTERVAL_DO) completed.") -print("Check the results above for significant effects and perform additional") -print("interaction analyses as needed based on the significance patterns.") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251003150032.r b/.history/eohi2/mixed anova - domain means_20251003150032.r deleted file mode 100644 index 0128155..0000000 --- a/.history/eohi2/mixed anova - domain means_20251003150032.r +++ /dev/null @@ -1,611 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. 
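-# (Editor's note) Column names beginning with a digit, like the 5.10*_MEAN
-# variables listed next, are non-syntactic in R: read.csv() with its default
-# check.names = TRUE renames them to X5.10past_pref_MEAN and so on, in which
-# case the required_vars check below would report them missing. A hedged fix,
-# assuming the CSV headers really are spelled this way:
-# data <- read.csv("eohi2.csv", check.names = FALSE)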
-# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "5.10past_pref_MEAN", "5.10past_pers_MEAN", "5.10past_val_MEAN", - "5.10fut_pref_MEAN", "5.10fut_pers_MEAN", "5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Domain mapping created:") -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = 
round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:") -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors -print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===") - -# Check what values the between-subjects factors actually have -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("Unique INTERVAL_DO values:") -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination -print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===") - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data 
dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", 
"TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - cat(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - cat(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * 
INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO -print("\nMain Effect of INTERVAL_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (if significant) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - 
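-        # Note: effsize::cohen.d() below pools observations across the TIME and
-        # INTERVAL cells and treats the two DOMAIN subsets as independent groups,
-        # so these d values are descriptive rather than design-exact. A paired
-        # sketch (assumes data1/data2 are aligned by ResponseId and equal length):
-        # domain_cohens_d_paired <- cohen.d(data1, data2, paired = TRUE)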
- if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# Main Effect of INTERVAL (if significant) -print("\n=== COHEN'S D FOR INTERVAL MAIN EFFECT ===") -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) -print("INTERVAL main effect contrasts:") -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - cat("\nCohen's d for significant INTERVAL contrasts:\n") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", interval_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", interval_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_interval$p.value[i])) - cat("\n") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - -# Note: Detailed interaction analyses would be added here if significant interactions are found -# For now, we'll provide a framework for the most common interactions - -print("\n=== INTERACTION EXPLORATIONS ===") -print("Note: Detailed interaction analyses will be performed for significant interactions") -print("Check the ANOVA results above to identify which interactions are significant") - -# Example framework for TIME × DOMAIN interaction (if significant) -# if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { -# print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") -# time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -# print("Estimated Marginal Means:") -# print(time_domain_emmeans) -# -# print("\nSimple Effects of DOMAIN within each TIME:") -# time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -# print(time_domain_simple) -# -# print("\nSimple Effects of TIME within each DOMAIN:") -# time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -# print(time_domain_simple2) -# } - -print("\n=== ANALYSIS COMPLETE ===") -print("Mixed ANOVA analysis with three within-subjects factors (TIME, DOMAIN, INTERVAL)") -print("and two between-subjects factors (TEMPORAL_DO, INTERVAL_DO) completed.") -print("Check the results above for significant effects and perform additional") -print("interaction analyses as needed based 
on the significance patterns.") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251003150034.r b/.history/eohi2/mixed anova - domain means_20251003150034.r deleted file mode 100644 index 0cfef52..0000000 --- a/.history/eohi2/mixed anova - domain means_20251003150034.r +++ /dev/null @@ -1,611 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. -# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "5.10past_pref_MEAN", "5.10past_pers_MEAN", "5.10past_val_MEAN", - "5.10fut_pref_MEAN", "5.10fut_pers_MEAN", "5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Domain mapping created:") -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = 
as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:") -print(homogeneity_interval) - -# 5. 
Hartley's F-max test for between-subjects factors -print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===") - -# Check what values the between-subjects factors actually have -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("Unique INTERVAL_DO values:") -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination -print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===") - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with 
sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - 
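-    # The df and p printed here are uncorrected; if Mauchly's test flags
-    # DOMAIN, prefer the GG-/HF-corrected values reported further below.
-    # Illustrative lookup (assumes DOMAIN appears in sphericity_corr):
-    # gg_p_domain <- sphericity_corr$`p[GG]`[sphericity_corr$Effect == "DOMAIN"]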
print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - cat(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f\n", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO -print("\nMain Effect of INTERVAL_DO:") -interval_do_emmeans <- 
emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (if significant) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# Main Effect of INTERVAL (if significant) -print("\n=== COHEN'S D FOR INTERVAL MAIN EFFECT ===") -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) -print("INTERVAL main effect contrasts:") -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - cat("\nCohen's d for significant INTERVAL contrasts:\n") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- 
long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", interval_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", interval_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_interval$p.value[i])) - cat("\n") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - -# Note: Detailed interaction analyses would be added here if significant interactions are found -# For now, we'll provide a framework for the most common interactions - -print("\n=== INTERACTION EXPLORATIONS ===") -print("Note: Detailed interaction analyses will be performed for significant interactions") -print("Check the ANOVA results above to identify which interactions are significant") - -# Example framework for TIME × DOMAIN interaction (if significant) -# if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { -# print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") -# time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -# print("Estimated Marginal Means:") -# print(time_domain_emmeans) -# -# print("\nSimple Effects of DOMAIN within each TIME:") -# time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -# print(time_domain_simple) -# -# print("\nSimple Effects of TIME within each DOMAIN:") -# time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -# print(time_domain_simple2) -# } - -print("\n=== ANALYSIS COMPLETE ===") -print("Mixed ANOVA analysis with three within-subjects factors (TIME, DOMAIN, INTERVAL)") -print("and two between-subjects factors (TEMPORAL_DO, INTERVAL_DO) completed.") -print("Check the results above for significant effects and perform additional") -print("interaction analyses as needed based on the significance patterns.") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251003150037.r b/.history/eohi2/mixed anova - domain means_20251003150037.r deleted file mode 100644 index 7c21993..0000000 --- a/.history/eohi2/mixed anova - domain means_20251003150037.r +++ /dev/null @@ -1,611 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. 
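-# (Naming note: the NPast_*/NFut_* variables feed the INTERVAL levels
-# "5" and "10", while the 5.10* variables feed the "5_10" level; see
-# domain_mapping below.)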
-# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "5.10past_pref_MEAN", "5.10past_pers_MEAN", "5.10past_val_MEAN", - "5.10fut_pref_MEAN", "5.10fut_pers_MEAN", "5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Domain mapping created:") -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = 
round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:") -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors -print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===") - -# Check what values the between-subjects factors actually have -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("Unique INTERVAL_DO values:") -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination -print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===") - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data 
dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", 
"TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - cat(sprintf("\n%s:\n", effect)) - cat(sprintf(" Original: F(%d, %d) = %.3f\n", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - cat(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - cat(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f\n", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * 
INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO -print("\nMain Effect of INTERVAL_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (if significant) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - cat("\nCohen's d for TIME main effect:\n") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data))) - cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate)) - cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude)) - cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - cat("\nCohen's d for significant DOMAIN contrasts:\n") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - 
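-        # For reference: cohen.d()'s $magnitude (used below) applies Cohen's
-        # (1988) thresholds: |d| < 0.2 negligible, < 0.5 small, < 0.8 medium,
-        # otherwise large.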
- if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", domain_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", domain_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_domain$p.value[i])) - cat("\n") - } - } - } -} - -# Main Effect of INTERVAL (if significant) -print("\n=== COHEN'S D FOR INTERVAL MAIN EFFECT ===") -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) -print("INTERVAL main effect contrasts:") -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - cat("\nCohen's d for significant INTERVAL contrasts:\n") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", interval_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", interval_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_interval$p.value[i])) - cat("\n") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - -# Note: Detailed interaction analyses would be added here if significant interactions are found -# For now, we'll provide a framework for the most common interactions - -print("\n=== INTERACTION EXPLORATIONS ===") -print("Note: Detailed interaction analyses will be performed for significant interactions") -print("Check the ANOVA results above to identify which interactions are significant") - -# Example framework for TIME × DOMAIN interaction (if significant) -# if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { -# print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") -# time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -# print("Estimated Marginal Means:") -# print(time_domain_emmeans) -# -# print("\nSimple Effects of DOMAIN within each TIME:") -# time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -# print(time_domain_simple) -# -# print("\nSimple Effects of TIME within each DOMAIN:") -# time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -# print(time_domain_simple2) -# } - -print("\n=== ANALYSIS COMPLETE ===") -print("Mixed ANOVA analysis with three within-subjects factors (TIME, DOMAIN, INTERVAL)") -print("and two between-subjects factors (TEMPORAL_DO, INTERVAL_DO) completed.") -print("Check the results above for significant effects and perform additional") -print("interaction analyses as needed based 
diff --git a/.history/eohi2/mixed anova - domain means_20251003150107.r b/.history/eohi2/mixed anova - domain means_20251003150107.r
deleted file mode 100644
index 241abc1..0000000
--- a/.history/eohi2/mixed anova - domain means_20251003150107.r
+++ /dev/null
@@ -1,611 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means - EOHI2
-# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor
-# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc.
-# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc.
-# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc.
-# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc.
-# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN
-# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex) # For aov_ez (cleaner ANOVA output)
-library(nortest) # For normality tests
-library(emmeans) # For post-hoc comparisons
-library(purrr) # For map functions
-library(effsize) # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-# Read the data
-data <- read.csv("eohi2.csv")
-
-# Display basic information about the dataset
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x")))
-print(paste("Number of participants:", length(unique(data$ResponseId))))
-
-# Verify the specific variables we need
-required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN",
-                   "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN",
-                   "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN",
-                   "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN",
-                   "5.10past_pref_MEAN", "5.10past_pers_MEAN", "5.10past_val_MEAN",
-                   "5.10fut_pref_MEAN", "5.10fut_pers_MEAN", "5.10fut_val_MEAN")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required domain mean variables found!")
-}
-
-# Define domain mapping with TIME, DOMAIN, and INTERVAL factors
-domain_mapping <- data.frame(
-  variable = required_vars,
-  time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3),
-           rep("Past", 3), rep("Future", 3)),
-  domain = rep(c("Preferences", "Personality", "Values"), 6),
-  interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3),
-               rep("5_10", 3), rep("5_10", 3)),
-  stringsAsFactors = FALSE
-)
-
-print("Domain mapping created:")
-print(domain_mapping)
-
-# Efficient data pivoting using pivot_longer
-long_data <- data %>%
-  select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>%
-  pivot_longer(
-    cols = all_of(required_vars),
-    names_to = "variable",
-    values_to = "MEAN_DIFFERENCE"
-  ) %>%
-  left_join(domain_mapping, by = "variable") %>%
-  # Convert to factors with proper levels
-  mutate(
-    TIME = factor(time, levels = c("Past", "Future")),
-    DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")),
-    INTERVAL = factor(interval, levels = c("5", "10", "5_10")),
-    ResponseId = as.factor(ResponseId),
-    TEMPORAL_DO = as.factor(TEMPORAL_DO),
-    INTERVAL_DO = as.factor(INTERVAL_DO)
-  ) %>%
-  # Select final columns and remove any rows with missing values
-  select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>%
-  filter(!is.na(MEAN_DIFFERENCE))
-
-print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x")))
-print(paste("Number of participants:", length(unique(long_data$ResponseId))))
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5),
-    q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5),
-    min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:")
-print(desc_stats)
-
-# Descriptive statistics by between-subjects factors
-desc_stats_by_between <- long_data %>%
-  group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by between-subjects factors:")
-print(desc_stats_by_between)
-
-# Summary by between-subjects factors only
-desc_stats_between_only <- long_data %>%
-  group_by(TEMPORAL_DO, INTERVAL_DO) %>%
-  summarise(
-    n = n(),
-    mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5),
-    .groups = 'drop'
-  )
-
-print("Descriptive statistics by between-subjects factors only:")
-print(desc_stats_between_only)
-
-# =============================================================================
-# ASSUMPTION TESTING
-# =============================================================================
-
-# Remove missing values for assumption testing
-long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ]
-print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x")))
-
-# 1. Missing values check
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(MEAN_DIFFERENCE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME, DOMAIN, and INTERVAL:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Anderson-Darling normality test
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic,
-    ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value,
-    .groups = 'drop'
-  )
-
-print("Anderson-Darling normality test results:")
-# Round only the numeric columns
-normality_results_rounded <- normality_results %>%
-  mutate(across(where(is.numeric), ~ round(.x, 5)))
-print(normality_results_rounded)
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN × INTERVAL combination
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN, INTERVAL) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME × INTERVAL combination
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME, INTERVAL) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:")
-print(homogeneity_domain)
-
-# Test homogeneity across INTERVAL within each TIME × DOMAIN combination
-homogeneity_interval <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:")
-print(homogeneity_interval)
-
-# 5. Hartley's F-max test for between-subjects factors
-print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===")
-
-# Check what values the between-subjects factors actually have
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("Unique INTERVAL_DO values:")
-print(unique(long_data_clean$INTERVAL_DO))
-
-# Function to calculate Hartley's F-max ratio
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination
-print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===")
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    # Calculate variances for each TEMPORAL_DO level within this combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # Calculate F-max ratio
-    f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination
-print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===")
-
-observed_interval_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN, TEMPORAL_DO) %>%
-  summarise(
-    # Calculate variances for each INTERVAL_DO level within this combination
-    int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE),
-    int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE),
-    # Calculate F-max ratio
-    f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio)
-
-print(observed_interval_ratios)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check data dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$ResponseId))))
-print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (",
-            length(levels(long_data_clean$DOMAIN)), "), INTERVAL (",
-            length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (",
-            length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (",
-            length(levels(long_data_clean$INTERVAL_DO)), ")", sep = ""))
-
-# Check for complete cases
-complete_cases <- sum(complete.cases(long_data_clean))
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean)))
-
-# Check if design is balanced
-design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL)
-if(all(design_balance %in% c(0, 1))) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations")
-} else {
-  print("Warning: Design is unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== MIXED ANOVA RESULTS (with
sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - 
print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO -print("\nMain Effect of INTERVAL_DO:") -interval_do_emmeans <- 
emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (if significant) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) -print("\n=== COHEN'S D FOR INTERVAL MAIN EFFECT ===") -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) -print("INTERVAL main effect contrasts:") -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - cat("\nCohen's d for significant INTERVAL contrasts:\n") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- 
long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - cat(sprintf("Comparison: %s\n", contrast_name)) - cat(sprintf(" n1 = %d, n2 = %d\n", length(data1), length(data2))) - cat(sprintf(" Cohen's d: %.5f\n", interval_cohens_d$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", interval_cohens_d$magnitude)) - cat(sprintf(" p-value: %.5f\n", significant_interval$p.value[i])) - cat("\n") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - -# Note: Detailed interaction analyses would be added here if significant interactions are found -# For now, we'll provide a framework for the most common interactions - -print("\n=== INTERACTION EXPLORATIONS ===") -print("Note: Detailed interaction analyses will be performed for significant interactions") -print("Check the ANOVA results above to identify which interactions are significant") - -# Example framework for TIME × DOMAIN interaction (if significant) -# if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { -# print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") -# time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -# print("Estimated Marginal Means:") -# print(time_domain_emmeans) -# -# print("\nSimple Effects of DOMAIN within each TIME:") -# time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -# print(time_domain_simple) -# -# print("\nSimple Effects of TIME within each DOMAIN:") -# time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -# print(time_domain_simple2) -# } - -print("\n=== ANALYSIS COMPLETE ===") -print("Mixed ANOVA analysis with three within-subjects factors (TIME, DOMAIN, INTERVAL)") -print("and two between-subjects factors (TEMPORAL_DO, INTERVAL_DO) completed.") -print("Check the results above for significant effects and perform additional") -print("interaction analyses as needed based on the significance patterns.") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251003150118.r b/.history/eohi2/mixed anova - domain means_20251003150118.r deleted file mode 100644 index 69ab2d6..0000000 --- a/.history/eohi2/mixed anova - domain means_20251003150118.r +++ /dev/null @@ -1,611 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. 
-# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "5.10past_pref_MEAN", "5.10past_pers_MEAN", "5.10past_val_MEAN", - "5.10fut_pref_MEAN", "5.10fut_pers_MEAN", "5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Domain mapping created:") -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = 
round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:") -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors -print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===") - -# Check what values the between-subjects factors actually have -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("Unique INTERVAL_DO values:") -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination -print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===") - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data 
dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", 
"TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * 
INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO -print("\nMain Effect of INTERVAL_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (if significant) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - 
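# ---- Editor's note: illustrative sketch, not part of the original file ----
# The cohen.d() call below pools every long-format row, so each participant
# contributes several observations per DOMAIN level and the repeated-measures
# structure is ignored. A hedged within-subject alternative (variable names
# follow this script; a sketch under those assumptions, not the author's
# method) is to average to one value per participant per level and compute
# a paired d with effsize::cohen.d(paired = TRUE):
if (length(data1) > 0 && length(data2) > 0) {
  # One mean per participant per DOMAIN level, then wide format so the two
  # levels line up row-by-row (ResponseId) for a paired comparison.
  pm <- long_data_clean %>%
    filter(DOMAIN %in% c(level1, level2)) %>%
    group_by(ResponseId, DOMAIN) %>%
    summarise(m = mean(MEAN_DIFFERENCE), .groups = "drop") %>%
    pivot_wider(names_from = DOMAIN, values_from = m) %>%
    drop_na(all_of(c(level1, level2)))
  d_paired <- cohen.d(pm[[level1]], pm[[level2]], paired = TRUE)
  print(sprintf("  Within-subject (paired) Cohen's d: %.5f", d_paired$estimate))
}
# ---------------------------------------------------------------------------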
- if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) -print("\n=== COHEN'S D FOR INTERVAL MAIN EFFECT ===") -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) -print("INTERVAL main effect contrasts:") -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - -# Note: Detailed interaction analyses would be added here if significant interactions are found -# For now, we'll provide a framework for the most common interactions - -print("\n=== INTERACTION EXPLORATIONS ===") -print("Note: Detailed interaction analyses will be performed for significant interactions") -print("Check the ANOVA results above to identify which interactions are significant") - -# Example framework for TIME × DOMAIN interaction (if significant) -# if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { -# print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") -# time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -# print("Estimated Marginal Means:") -# print(time_domain_emmeans) -# -# print("\nSimple Effects of DOMAIN within each TIME:") -# time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -# print(time_domain_simple) -# -# print("\nSimple Effects of TIME within each DOMAIN:") -# time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -# print(time_domain_simple2) -# } - -print("\n=== ANALYSIS COMPLETE ===") -print("Mixed ANOVA analysis with three within-subjects factors (TIME, DOMAIN, INTERVAL)") -print("and two between-subjects factors (TEMPORAL_DO, INTERVAL_DO) completed.") -print("Check the results above for significant effects and perform additional") -print("interaction analyses as needed based 
on the significance patterns.") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251003150155.r b/.history/eohi2/mixed anova - domain means_20251003150155.r deleted file mode 100644 index 69ab2d6..0000000 --- a/.history/eohi2/mixed anova - domain means_20251003150155.r +++ /dev/null @@ -1,611 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. -# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "5.10past_pref_MEAN", "5.10past_pers_MEAN", "5.10past_val_MEAN", - "5.10fut_pref_MEAN", "5.10fut_pers_MEAN", "5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Domain mapping created:") -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = 
as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:") -print(homogeneity_interval) - -# 5. 
Hartley's F-max test for between-subjects factors -print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===") - -# Check what values the between-subjects factors actually have -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("Unique INTERVAL_DO values:") -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination -print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===") - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with 
sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - 
print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO -print("\nMain Effect of INTERVAL_DO:") -interval_do_emmeans <- 
emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (if significant) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) -print("\n=== COHEN'S D FOR INTERVAL MAIN EFFECT ===") -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) -print("INTERVAL main effect contrasts:") -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- 
long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - -# Note: Detailed interaction analyses would be added here if significant interactions are found -# For now, we'll provide a framework for the most common interactions - -print("\n=== INTERACTION EXPLORATIONS ===") -print("Note: Detailed interaction analyses will be performed for significant interactions") -print("Check the ANOVA results above to identify which interactions are significant") - -# Example framework for TIME × DOMAIN interaction (if significant) -# if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { -# print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") -# time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -# print("Estimated Marginal Means:") -# print(time_domain_emmeans) -# -# print("\nSimple Effects of DOMAIN within each TIME:") -# time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -# print(time_domain_simple) -# -# print("\nSimple Effects of TIME within each DOMAIN:") -# time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -# print(time_domain_simple2) -# } - -print("\n=== ANALYSIS COMPLETE ===") -print("Mixed ANOVA analysis with three within-subjects factors (TIME, DOMAIN, INTERVAL)") -print("and two between-subjects factors (TEMPORAL_DO, INTERVAL_DO) completed.") -print("Check the results above for significant effects and perform additional") -print("interaction analyses as needed based on the significance patterns.") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251003150222.r b/.history/eohi2/mixed anova - domain means_20251003150222.r deleted file mode 100644 index 1b34616..0000000 --- a/.history/eohi2/mixed anova - domain means_20251003150222.r +++ /dev/null @@ -1,611 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. 
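# (Editorial note, hedged: each column name encodes one design cell, e.g.
#  NPast_5_pref_MEAN = TIME "Past" x INTERVAL "5" x DOMAIN "Preferences".
#  The "5.10..." columns listed next appear in required_vars as "X5.10..."
#  because read.csv() runs make.names() on headers by default, and
#  make.names() prefixes "X" to any name that starts with a digit.)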
-# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN", - "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Domain mapping created:") -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean 
= round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:") -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors -print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===") - -# Check what values the between-subjects factors actually have -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("Unique INTERVAL_DO values:") -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination -print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===") - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data 
dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", 
"TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * 
INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO -print("\nMain Effect of INTERVAL_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (if significant) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - 
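# (Editorial sketch, hedged: cohen.d() below pools all long-format rows, so
# repeated measures from the same participant are treated as independent
# observations. A participant-level variant, using the objects defined in
# this loop, would average within person before comparing, e.g.:
#   ids1 <- as.character(long_data_clean$ResponseId[long_data_clean$DOMAIN == level1])
#   ids2 <- as.character(long_data_clean$ResponseId[long_data_clean$DOMAIN == level2])
#   effsize::cohen.d(as.numeric(tapply(data1, ids1, mean)),
#                    as.numeric(tapply(data2, ids2, mean)))
# Illustration only; the original snapshot did not do this.)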
- if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) -print("\n=== COHEN'S D FOR INTERVAL MAIN EFFECT ===") -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) -print("INTERVAL main effect contrasts:") -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - -# Note: Detailed interaction analyses would be added here if significant interactions are found -# For now, we'll provide a framework for the most common interactions - -print("\n=== INTERACTION EXPLORATIONS ===") -print("Note: Detailed interaction analyses will be performed for significant interactions") -print("Check the ANOVA results above to identify which interactions are significant") - -# Example framework for TIME × DOMAIN interaction (if significant) -# if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { -# print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") -# time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -# print("Estimated Marginal Means:") -# print(time_domain_emmeans) -# -# print("\nSimple Effects of DOMAIN within each TIME:") -# time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -# print(time_domain_simple) -# -# print("\nSimple Effects of TIME within each DOMAIN:") -# time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -# print(time_domain_simple2) -# } - -print("\n=== ANALYSIS COMPLETE ===") -print("Mixed ANOVA analysis with three within-subjects factors (TIME, DOMAIN, INTERVAL)") -print("and two between-subjects factors (TEMPORAL_DO, INTERVAL_DO) completed.") -print("Check the results above for significant effects and perform additional") -print("interaction analyses as needed based 
on the significance patterns.") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251003150238.r b/.history/eohi2/mixed anova - domain means_20251003150238.r deleted file mode 100644 index 1b34616..0000000 --- a/.history/eohi2/mixed anova - domain means_20251003150238.r +++ /dev/null @@ -1,611 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. -# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN", - "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Domain mapping created:") -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = 
as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:") -print(homogeneity_interval) - -# 5. 
Hartley's F-max test for between-subjects factors -print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===") - -# Check what values the between-subjects factors actually have -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("Unique INTERVAL_DO values:") -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination -print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===") - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with 
sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - 
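# (Editorial note, hedged: sphericity is only at issue for within-subject
# effects with more than one numerator df. DOMAIN has 3 levels (2 df), so
# Mauchly's test applies and the GG/HF adjustments below can matter; for a
# 2-level factor such as TIME there is a single difference score, epsilon is
# 1 by definition, and no correction is computed. As a quick sanity check,
#   pf(f_value, dfn, dfd, lower.tail = FALSE)
# with the uncorrected df defined just above should reproduce p_value.)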
print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO -print("\nMain Effect of INTERVAL_DO:") -interval_do_emmeans <- 
emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (if significant) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) -print("\n=== COHEN'S D FOR INTERVAL MAIN EFFECT ===") -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) -print("INTERVAL main effect contrasts:") -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- 
long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - -# Note: Detailed interaction analyses would be added here if significant interactions are found -# For now, we'll provide a framework for the most common interactions - -print("\n=== INTERACTION EXPLORATIONS ===") -print("Note: Detailed interaction analyses will be performed for significant interactions") -print("Check the ANOVA results above to identify which interactions are significant") - -# Example framework for TIME × DOMAIN interaction (if significant) -# if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { -# print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") -# time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -# print("Estimated Marginal Means:") -# print(time_domain_emmeans) -# -# print("\nSimple Effects of DOMAIN within each TIME:") -# time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -# print(time_domain_simple) -# -# print("\nSimple Effects of TIME within each DOMAIN:") -# time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -# print(time_domain_simple2) -# } - -print("\n=== ANALYSIS COMPLETE ===") -print("Mixed ANOVA analysis with three within-subjects factors (TIME, DOMAIN, INTERVAL)") -print("and two between-subjects factors (TEMPORAL_DO, INTERVAL_DO) completed.") -print("Check the results above for significant effects and perform additional") -print("interaction analyses as needed based on the significance patterns.") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251003150306.r b/.history/eohi2/mixed anova - domain means_20251003150306.r deleted file mode 100644 index 1b34616..0000000 --- a/.history/eohi2/mixed anova - domain means_20251003150306.r +++ /dev/null @@ -1,611 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. 
-# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN", - "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Domain mapping created:") -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean 
= round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:") -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors -print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===") - -# Check what values the between-subjects factors actually have -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("Unique INTERVAL_DO values:") -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination -print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===") - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data 
dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", 
"TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * 
INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO -print("\nMain Effect of INTERVAL_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (if significant) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - 
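# Note: cohen.d() below pools all repeated-measures rows and compares them as two
# independent groups, so n1/n2 count observations rather than participants. A paired
# alternative (sketch only; assumes every participant contributes both DOMAIN levels,
# using the long_data_clean columns defined above):
# wide <- long_data_clean %>%
#   group_by(ResponseId, DOMAIN) %>%
#   summarise(m = mean(MEAN_DIFFERENCE), .groups = "drop") %>%
#   pivot_wider(names_from = DOMAIN, values_from = m)
# cohen.d(wide[[level1]], wide[[level2]], paired = TRUE)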
- if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) -print("\n=== COHEN'S D FOR INTERVAL MAIN EFFECT ===") -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) -print("INTERVAL main effect contrasts:") -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - -# Note: Detailed interaction analyses would be added here if significant interactions are found -# For now, we'll provide a framework for the most common interactions - -print("\n=== INTERACTION EXPLORATIONS ===") -print("Note: Detailed interaction analyses will be performed for significant interactions") -print("Check the ANOVA results above to identify which interactions are significant") - -# Example framework for TIME × DOMAIN interaction (if significant) -# if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { -# print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") -# time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) -# print("Estimated Marginal Means:") -# print(time_domain_emmeans) -# -# print("\nSimple Effects of DOMAIN within each TIME:") -# time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") -# print(time_domain_simple) -# -# print("\nSimple Effects of TIME within each DOMAIN:") -# time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") -# print(time_domain_simple2) -# } - -print("\n=== ANALYSIS COMPLETE ===") -print("Mixed ANOVA analysis with three within-subjects factors (TIME, DOMAIN, INTERVAL)") -print("and two between-subjects factors (TEMPORAL_DO, INTERVAL_DO) completed.") -print("Check the results above for significant effects and perform additional") -print("interaction analyses as needed based 
on the significance patterns.") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251003152345.r b/.history/eohi2/mixed anova - domain means_20251003152345.r deleted file mode 100644 index ad91f38..0000000 --- a/.history/eohi2/mixed anova - domain means_20251003152345.r +++ /dev/null @@ -1,748 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. -# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN", - "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Domain mapping created:") -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = 
as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:") -print(homogeneity_interval) - -# 5. 
Hartley's F-max test for between-subjects factors -print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===") - -# Check what values the between-subjects factors actually have -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("Unique INTERVAL_DO values:") -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination -print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===") - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with 
sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - 
print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO -print("\nMain Effect of INTERVAL_DO:") -interval_do_emmeans <- 
emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (if significant) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) -print("\n=== COHEN'S D FOR INTERVAL MAIN EFFECT ===") -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) -print("INTERVAL main effect contrasts:") -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- 
long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - -print("\n=== INTERACTION EXPLORATIONS ===") - -# First, identify which interactions are significant -significant_interactions <- anova_output[anova_output$p < 0.05 & grepl(":", anova_output$Effect), ] - -if(nrow(significant_interactions) > 0) { - print("Significant interactions found:") - print(significant_interactions[, c("Effect", "p")]) - - # TIME × DOMAIN interaction (if significant) - if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { - print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") - time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) - print("Estimated Marginal Means:") - print(time_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TIME:") - time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") - print(time_domain_simple) - - print("\nSimple Effects of TIME within each DOMAIN:") - time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(time_domain_simple2) - } - - # TIME × INTERVAL interaction (if significant) - if("TIME:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL"] < 0.05) { - print("\n=== TIME × INTERVAL INTERACTION (SIGNIFICANT) ===") - time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) - print("Estimated Marginal Means:") - print(time_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TIME:") - time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") - print(time_interval_simple) - - print("\nSimple Effects of TIME within each INTERVAL:") - time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(time_interval_simple2) - } - - # DOMAIN × INTERVAL interaction (if significant) - if("DOMAIN:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "DOMAIN:INTERVAL"] < 0.05) { - print("\n=== DOMAIN × INTERVAL INTERACTION (SIGNIFICANT) ===") - domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) - print("Estimated Marginal Means:") - print(domain_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each DOMAIN:") - domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(domain_interval_simple) - - print("\nSimple Effects of DOMAIN within each INTERVAL:") - domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(domain_interval_simple2) - } - - # TEMPORAL_DO × TIME interaction (if significant) - if("TEMPORAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:TIME"] < 0.05) { - print("\n=== TEMPORAL_DO 
× TIME INTERACTION (SIGNIFICANT) ===") - temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - print("Estimated Marginal Means:") - print(temporal_time_emmeans) - - print("\nSimple Effects of TIME within each TEMPORAL_DO:") - temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_time_simple) - - print("\nSimple Effects of TEMPORAL_DO within each TIME:") - temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") - print(temporal_time_simple2) - } - - # TEMPORAL_DO × DOMAIN interaction (if significant) - if("TEMPORAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:DOMAIN"] < 0.05) { - print("\n=== TEMPORAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(temporal_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") - temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_domain_simple) - - print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") - temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(temporal_domain_simple2) - } - - # TEMPORAL_DO × INTERVAL interaction (if significant) - if("TEMPORAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:INTERVAL"] < 0.05) { - print("\n=== TEMPORAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - temporal_interval_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(temporal_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TEMPORAL_DO:") - temporal_interval_simple <- pairs(temporal_interval_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_interval_simple) - - print("\nSimple Effects of TEMPORAL_DO within each INTERVAL:") - temporal_interval_simple2 <- pairs(temporal_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(temporal_interval_simple2) - } - - # INTERVAL_DO × TIME interaction (if significant) - if("INTERVAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:TIME"] < 0.05) { - print("\n=== INTERVAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - interval_do_time_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * TIME) - print("Estimated Marginal Means:") - print(interval_do_time_emmeans) - - print("\nSimple Effects of TIME within each INTERVAL_DO:") - interval_do_time_simple <- pairs(interval_do_time_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_time_simple) - - print("\nSimple Effects of INTERVAL_DO within each TIME:") - interval_do_time_simple2 <- pairs(interval_do_time_emmeans, by = "TIME", adjust = "bonferroni") - print(interval_do_time_simple2) - } - - # INTERVAL_DO × DOMAIN interaction (if significant) - if("INTERVAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:DOMAIN"] < 0.05) { - print("\n=== INTERVAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - interval_do_domain_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(interval_do_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each INTERVAL_DO:") - interval_do_domain_simple <- pairs(interval_do_domain_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_domain_simple) - - 
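# Note: each pairs() call adjusts p-values only within its own family (one Bonferroni
# family per INTERVAL_DO level above, one per DOMAIN level below). Standardized effect
# sizes for these simple effects could be added with emmeans::eff_size(); sketch only,
# since sigma and edf must be taken from the appropriate error stratum of the aovlist:
# eff_size(interval_do_domain_emmeans, sigma = ..., edf = ...)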
print("\nSimple Effects of INTERVAL_DO within each DOMAIN:") - interval_do_domain_simple2 <- pairs(interval_do_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(interval_do_domain_simple2) - } - - # INTERVAL_DO × INTERVAL interaction (if significant) - if("INTERVAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:INTERVAL"] < 0.05) { - print("\n=== INTERVAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - interval_do_interval_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(interval_do_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each INTERVAL_DO:") - interval_do_interval_simple <- pairs(interval_do_interval_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_interval_simple) - - print("\nSimple Effects of INTERVAL_DO within each INTERVAL:") - interval_do_interval_simple2 <- pairs(interval_do_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(interval_do_interval_simple2) - } - -} else { - print("No significant interactions found.") - print("All interaction p-values:") - interaction_effects <- anova_output[grepl(":", anova_output$Effect), ] - print(interaction_effects[, c("Effect", "p")]) -} - -print("\n=== ANALYSIS COMPLETE ===") -print("Mixed ANOVA analysis with three within-subjects factors (TIME, DOMAIN, INTERVAL)") -print("and two between-subjects factors (TEMPORAL_DO, INTERVAL_DO) completed.") -print("Check the results above for significant effects and perform additional") -print("interaction analyses as needed based on the significance patterns.") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251003152354.r b/.history/eohi2/mixed anova - domain means_20251003152354.r deleted file mode 100644 index ad91f38..0000000 --- a/.history/eohi2/mixed anova - domain means_20251003152354.r +++ /dev/null @@ -1,748 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. 
-# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN", - "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Domain mapping created:") -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean 
= round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:") -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors -print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===") - -# Check what values the between-subjects factors actually have -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("Unique INTERVAL_DO values:") -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination -print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===") - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data 
dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", 
"TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * 
INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO -print("\nMain Effect of INTERVAL_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (if significant) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - 
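# Note (added caution, not from the original script): cohen.d() below pools every
# repeated measurement across participants, so this d treats DOMAIN as if it were
# a between-subjects factor. For a within-subject effect size, effsize::cohen.d()
# also accepts paired = TRUE when the two vectors are matched by participant.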
- if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) -print("\n=== COHEN'S D FOR INTERVAL MAIN EFFECT ===") -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) -print("INTERVAL main effect contrasts:") -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - -print("\n=== INTERACTION EXPLORATIONS ===") - -# First, identify which interactions are significant -significant_interactions <- anova_output[anova_output$p < 0.05 & grepl(":", anova_output$Effect), ] - -if(nrow(significant_interactions) > 0) { - print("Significant interactions found:") - print(significant_interactions[, c("Effect", "p")]) - - # TIME × DOMAIN interaction (if significant) - if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { - print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") - time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) - print("Estimated Marginal Means:") - print(time_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TIME:") - time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") - print(time_domain_simple) - - print("\nSimple Effects of TIME within each DOMAIN:") - time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(time_domain_simple2) - } - - # TIME × INTERVAL interaction (if significant) - if("TIME:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL"] < 0.05) { - print("\n=== TIME × INTERVAL INTERACTION (SIGNIFICANT) ===") - time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) - print("Estimated Marginal Means:") - print(time_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TIME:") - 
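# (Added clarification: `by = "TIME"` holds TIME fixed, so the contrasts below
# compare INTERVAL levels separately at Past and at Future, and the Bonferroni
# adjustment is applied within each TIME family rather than across both.)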
time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") - print(time_interval_simple) - - print("\nSimple Effects of TIME within each INTERVAL:") - time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(time_interval_simple2) - } - - # DOMAIN × INTERVAL interaction (if significant) - if("DOMAIN:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "DOMAIN:INTERVAL"] < 0.05) { - print("\n=== DOMAIN × INTERVAL INTERACTION (SIGNIFICANT) ===") - domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) - print("Estimated Marginal Means:") - print(domain_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each DOMAIN:") - domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(domain_interval_simple) - - print("\nSimple Effects of DOMAIN within each INTERVAL:") - domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(domain_interval_simple2) - } - - # TEMPORAL_DO × TIME interaction (if significant) - if("TEMPORAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:TIME"] < 0.05) { - print("\n=== TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - print("Estimated Marginal Means:") - print(temporal_time_emmeans) - - print("\nSimple Effects of TIME within each TEMPORAL_DO:") - temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_time_simple) - - print("\nSimple Effects of TEMPORAL_DO within each TIME:") - temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") - print(temporal_time_simple2) - } - - # TEMPORAL_DO × DOMAIN interaction (if significant) - if("TEMPORAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:DOMAIN"] < 0.05) { - print("\n=== TEMPORAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(temporal_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") - temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_domain_simple) - - print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") - temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(temporal_domain_simple2) - } - - # TEMPORAL_DO × INTERVAL interaction (if significant) - if("TEMPORAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:INTERVAL"] < 0.05) { - print("\n=== TEMPORAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - temporal_interval_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(temporal_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TEMPORAL_DO:") - temporal_interval_simple <- pairs(temporal_interval_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_interval_simple) - - print("\nSimple Effects of TEMPORAL_DO within each INTERVAL:") - temporal_interval_simple2 <- pairs(temporal_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(temporal_interval_simple2) - } - - # INTERVAL_DO × TIME interaction (if significant) - if("INTERVAL_DO:TIME" %in% anova_output$Effect && 
anova_output$p[anova_output$Effect == "INTERVAL_DO:TIME"] < 0.05) { - print("\n=== INTERVAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - interval_do_time_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * TIME) - print("Estimated Marginal Means:") - print(interval_do_time_emmeans) - - print("\nSimple Effects of TIME within each INTERVAL_DO:") - interval_do_time_simple <- pairs(interval_do_time_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_time_simple) - - print("\nSimple Effects of INTERVAL_DO within each TIME:") - interval_do_time_simple2 <- pairs(interval_do_time_emmeans, by = "TIME", adjust = "bonferroni") - print(interval_do_time_simple2) - } - - # INTERVAL_DO × DOMAIN interaction (if significant) - if("INTERVAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:DOMAIN"] < 0.05) { - print("\n=== INTERVAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - interval_do_domain_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(interval_do_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each INTERVAL_DO:") - interval_do_domain_simple <- pairs(interval_do_domain_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_domain_simple) - - print("\nSimple Effects of INTERVAL_DO within each DOMAIN:") - interval_do_domain_simple2 <- pairs(interval_do_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(interval_do_domain_simple2) - } - - # INTERVAL_DO × INTERVAL interaction (if significant) - if("INTERVAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:INTERVAL"] < 0.05) { - print("\n=== INTERVAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - interval_do_interval_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(interval_do_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each INTERVAL_DO:") - interval_do_interval_simple <- pairs(interval_do_interval_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_interval_simple) - - print("\nSimple Effects of INTERVAL_DO within each INTERVAL:") - interval_do_interval_simple2 <- pairs(interval_do_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(interval_do_interval_simple2) - } - -} else { - print("No significant interactions found.") - print("All interaction p-values:") - interaction_effects <- anova_output[grepl(":", anova_output$Effect), ] - print(interaction_effects[, c("Effect", "p")]) -} - -print("\n=== ANALYSIS COMPLETE ===") -print("Mixed ANOVA analysis with three within-subjects factors (TIME, DOMAIN, INTERVAL)") -print("and two between-subjects factors (TEMPORAL_DO, INTERVAL_DO) completed.") -print("Check the results above for significant effects and perform additional") -print("interaction analyses as needed based on the significance patterns.") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251003152532.r b/.history/eohi2/mixed anova - domain means_20251003152532.r deleted file mode 100644 index ad91f38..0000000 --- a/.history/eohi2/mixed anova - domain means_20251003152532.r +++ /dev/null @@ -1,748 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. 
-# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. -# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN", - "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Domain mapping created:") -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive 
statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:") -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors -print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===") - -# Check what values the between-subjects factors actually have -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("Unique INTERVAL_DO values:") -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination -print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===") - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data 
dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", 
"TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * 
INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO -print("\nMain Effect of INTERVAL_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (if significant) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - 
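- # Minimal cross-check sketch, assuming effsize::cohen.d()'s default pooled-SD
- # definition (pooled = TRUE); the names below are illustrative only:
- #   s_pooled  <- sqrt(((length(data1) - 1) * var(data1) +
- #                      (length(data2) - 1) * var(data2)) /
- #                     (length(data1) + length(data2) - 2))
- #   d_by_hand <- (mean(data1) - mean(data2)) / s_pooled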
- if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) -print("\n=== COHEN'S D FOR INTERVAL MAIN EFFECT ===") -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) -print("INTERVAL main effect contrasts:") -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - -print("\n=== INTERACTION EXPLORATIONS ===") - -# First, identify which interactions are significant -significant_interactions <- anova_output[anova_output$p < 0.05 & grepl(":", anova_output$Effect), ] - -if(nrow(significant_interactions) > 0) { - print("Significant interactions found:") - print(significant_interactions[, c("Effect", "p")]) - - # TIME × DOMAIN interaction (if significant) - if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { - print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") - time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) - print("Estimated Marginal Means:") - print(time_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TIME:") - time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") - print(time_domain_simple) - - print("\nSimple Effects of TIME within each DOMAIN:") - time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(time_domain_simple2) - } - - # TIME × INTERVAL interaction (if significant) - if("TIME:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL"] < 0.05) { - print("\n=== TIME × INTERVAL INTERACTION (SIGNIFICANT) ===") - time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) - print("Estimated Marginal Means:") - print(time_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TIME:") - 
time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") - print(time_interval_simple) - - print("\nSimple Effects of TIME within each INTERVAL:") - time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(time_interval_simple2) - } - - # DOMAIN × INTERVAL interaction (if significant) - if("DOMAIN:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "DOMAIN:INTERVAL"] < 0.05) { - print("\n=== DOMAIN × INTERVAL INTERACTION (SIGNIFICANT) ===") - domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) - print("Estimated Marginal Means:") - print(domain_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each DOMAIN:") - domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(domain_interval_simple) - - print("\nSimple Effects of DOMAIN within each INTERVAL:") - domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(domain_interval_simple2) - } - - # TEMPORAL_DO × TIME interaction (if significant) - if("TEMPORAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:TIME"] < 0.05) { - print("\n=== TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - print("Estimated Marginal Means:") - print(temporal_time_emmeans) - - print("\nSimple Effects of TIME within each TEMPORAL_DO:") - temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_time_simple) - - print("\nSimple Effects of TEMPORAL_DO within each TIME:") - temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") - print(temporal_time_simple2) - } - - # TEMPORAL_DO × DOMAIN interaction (if significant) - if("TEMPORAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:DOMAIN"] < 0.05) { - print("\n=== TEMPORAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(temporal_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") - temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_domain_simple) - - print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") - temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(temporal_domain_simple2) - } - - # TEMPORAL_DO × INTERVAL interaction (if significant) - if("TEMPORAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:INTERVAL"] < 0.05) { - print("\n=== TEMPORAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - temporal_interval_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(temporal_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TEMPORAL_DO:") - temporal_interval_simple <- pairs(temporal_interval_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_interval_simple) - - print("\nSimple Effects of TEMPORAL_DO within each INTERVAL:") - temporal_interval_simple2 <- pairs(temporal_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(temporal_interval_simple2) - } - - # INTERVAL_DO × TIME interaction (if significant) - if("INTERVAL_DO:TIME" %in% anova_output$Effect && 
anova_output$p[anova_output$Effect == "INTERVAL_DO:TIME"] < 0.05) { - print("\n=== INTERVAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - interval_do_time_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * TIME) - print("Estimated Marginal Means:") - print(interval_do_time_emmeans) - - print("\nSimple Effects of TIME within each INTERVAL_DO:") - interval_do_time_simple <- pairs(interval_do_time_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_time_simple) - - print("\nSimple Effects of INTERVAL_DO within each TIME:") - interval_do_time_simple2 <- pairs(interval_do_time_emmeans, by = "TIME", adjust = "bonferroni") - print(interval_do_time_simple2) - } - - # INTERVAL_DO × DOMAIN interaction (if significant) - if("INTERVAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:DOMAIN"] < 0.05) { - print("\n=== INTERVAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - interval_do_domain_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(interval_do_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each INTERVAL_DO:") - interval_do_domain_simple <- pairs(interval_do_domain_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_domain_simple) - - print("\nSimple Effects of INTERVAL_DO within each DOMAIN:") - interval_do_domain_simple2 <- pairs(interval_do_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(interval_do_domain_simple2) - } - - # INTERVAL_DO × INTERVAL interaction (if significant) - if("INTERVAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:INTERVAL"] < 0.05) { - print("\n=== INTERVAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - interval_do_interval_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(interval_do_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each INTERVAL_DO:") - interval_do_interval_simple <- pairs(interval_do_interval_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_interval_simple) - - print("\nSimple Effects of INTERVAL_DO within each INTERVAL:") - interval_do_interval_simple2 <- pairs(interval_do_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(interval_do_interval_simple2) - } - -} else { - print("No significant interactions found.") - print("All interaction p-values:") - interaction_effects <- anova_output[grepl(":", anova_output$Effect), ] - print(interaction_effects[, c("Effect", "p")]) -} - -print("\n=== ANALYSIS COMPLETE ===") -print("Mixed ANOVA analysis with three within-subjects factors (TIME, DOMAIN, INTERVAL)") -print("and two between-subjects factors (TEMPORAL_DO, INTERVAL_DO) completed.") -print("Check the results above for significant effects and perform additional") -print("interaction analyses as needed based on the significance patterns.") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251006191145.r b/.history/eohi2/mixed anova - domain means_20251006191145.r deleted file mode 100644 index ad91f38..0000000 --- a/.history/eohi2/mixed anova - domain means_20251006191145.r +++ /dev/null @@ -1,748 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. 
-# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. -# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN", - "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Domain mapping created:") -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive 
statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:") -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors -print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===") - -# Check what values the between-subjects factors actually have -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("Unique INTERVAL_DO values:") -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination -print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===") - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data 
dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", 
"TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * 
INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO -print("\nMain Effect of INTERVAL_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (if significant) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - 
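- # The length() checks below guard against label mismatches: if a parsed
- # contrast level does not exactly match a DOMAIN level, the subset is empty
- # and the effect size is skipped rather than raising an error.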
- if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) -print("\n=== COHEN'S D FOR INTERVAL MAIN EFFECT ===") -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) -print("INTERVAL main effect contrasts:") -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - -print("\n=== INTERACTION EXPLORATIONS ===") - -# First, identify which interactions are significant -significant_interactions <- anova_output[anova_output$p < 0.05 & grepl(":", anova_output$Effect), ] - -if(nrow(significant_interactions) > 0) { - print("Significant interactions found:") - print(significant_interactions[, c("Effect", "p")]) - - # TIME × DOMAIN interaction (if significant) - if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { - print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") - time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) - print("Estimated Marginal Means:") - print(time_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TIME:") - time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") - print(time_domain_simple) - - print("\nSimple Effects of TIME within each DOMAIN:") - time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(time_domain_simple2) - } - - # TIME × INTERVAL interaction (if significant) - if("TIME:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL"] < 0.05) { - print("\n=== TIME × INTERVAL INTERACTION (SIGNIFICANT) ===") - time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) - print("Estimated Marginal Means:") - print(time_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TIME:") - 
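-    # Note: pairs(..., by = "TIME") applies the Bonferroni adjustment within
-    # each TIME family separately, not across the two families. The gate
-    # above also tests the uncorrected ezANOVA p-value; for the 3-level
-    # within-subjects factors, the GG-corrected p[GG] from the sphericity
-    # table is arguably the safer criterion.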
time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") - print(time_interval_simple) - - print("\nSimple Effects of TIME within each INTERVAL:") - time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(time_interval_simple2) - } - - # DOMAIN × INTERVAL interaction (if significant) - if("DOMAIN:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "DOMAIN:INTERVAL"] < 0.05) { - print("\n=== DOMAIN × INTERVAL INTERACTION (SIGNIFICANT) ===") - domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) - print("Estimated Marginal Means:") - print(domain_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each DOMAIN:") - domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(domain_interval_simple) - - print("\nSimple Effects of DOMAIN within each INTERVAL:") - domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(domain_interval_simple2) - } - - # TEMPORAL_DO × TIME interaction (if significant) - if("TEMPORAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:TIME"] < 0.05) { - print("\n=== TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - print("Estimated Marginal Means:") - print(temporal_time_emmeans) - - print("\nSimple Effects of TIME within each TEMPORAL_DO:") - temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_time_simple) - - print("\nSimple Effects of TEMPORAL_DO within each TIME:") - temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") - print(temporal_time_simple2) - } - - # TEMPORAL_DO × DOMAIN interaction (if significant) - if("TEMPORAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:DOMAIN"] < 0.05) { - print("\n=== TEMPORAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(temporal_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") - temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_domain_simple) - - print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") - temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(temporal_domain_simple2) - } - - # TEMPORAL_DO × INTERVAL interaction (if significant) - if("TEMPORAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:INTERVAL"] < 0.05) { - print("\n=== TEMPORAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - temporal_interval_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(temporal_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TEMPORAL_DO:") - temporal_interval_simple <- pairs(temporal_interval_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_interval_simple) - - print("\nSimple Effects of TEMPORAL_DO within each INTERVAL:") - temporal_interval_simple2 <- pairs(temporal_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(temporal_interval_simple2) - } - - # INTERVAL_DO × TIME interaction (if significant) - if("INTERVAL_DO:TIME" %in% anova_output$Effect && 
anova_output$p[anova_output$Effect == "INTERVAL_DO:TIME"] < 0.05) { - print("\n=== INTERVAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - interval_do_time_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * TIME) - print("Estimated Marginal Means:") - print(interval_do_time_emmeans) - - print("\nSimple Effects of TIME within each INTERVAL_DO:") - interval_do_time_simple <- pairs(interval_do_time_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_time_simple) - - print("\nSimple Effects of INTERVAL_DO within each TIME:") - interval_do_time_simple2 <- pairs(interval_do_time_emmeans, by = "TIME", adjust = "bonferroni") - print(interval_do_time_simple2) - } - - # INTERVAL_DO × DOMAIN interaction (if significant) - if("INTERVAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:DOMAIN"] < 0.05) { - print("\n=== INTERVAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - interval_do_domain_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(interval_do_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each INTERVAL_DO:") - interval_do_domain_simple <- pairs(interval_do_domain_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_domain_simple) - - print("\nSimple Effects of INTERVAL_DO within each DOMAIN:") - interval_do_domain_simple2 <- pairs(interval_do_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(interval_do_domain_simple2) - } - - # INTERVAL_DO × INTERVAL interaction (if significant) - if("INTERVAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:INTERVAL"] < 0.05) { - print("\n=== INTERVAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - interval_do_interval_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(interval_do_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each INTERVAL_DO:") - interval_do_interval_simple <- pairs(interval_do_interval_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_interval_simple) - - print("\nSimple Effects of INTERVAL_DO within each INTERVAL:") - interval_do_interval_simple2 <- pairs(interval_do_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(interval_do_interval_simple2) - } - -} else { - print("No significant interactions found.") - print("All interaction p-values:") - interaction_effects <- anova_output[grepl(":", anova_output$Effect), ] - print(interaction_effects[, c("Effect", "p")]) -} - -print("\n=== ANALYSIS COMPLETE ===") -print("Mixed ANOVA analysis with three within-subjects factors (TIME, DOMAIN, INTERVAL)") -print("and two between-subjects factors (TEMPORAL_DO, INTERVAL_DO) completed.") -print("Check the results above for significant effects and perform additional") -print("interaction analyses as needed based on the significance patterns.") \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251006191927.r b/.history/eohi2/mixed anova - domain means_20251006191927.r deleted file mode 100644 index 64ce2dc..0000000 --- a/.history/eohi2/mixed anova - domain means_20251006191927.r +++ /dev/null @@ -1,822 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. 
-# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. -# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN", - "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Domain mapping created:") -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive 
statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:") -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors -print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===") - -# Check what values the between-subjects factors actually have -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("Unique INTERVAL_DO values:") -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination -print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===") - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data 
dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$ResponseId))))
-print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (",
-            length(levels(long_data_clean$DOMAIN)), "), INTERVAL (",
-            length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (",
-            length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (",
-            length(levels(long_data_clean$INTERVAL_DO)), ")", sep = ""))
-
-# Check for complete cases
-complete_cases <- sum(complete.cases(long_data_clean))
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean)))
-
-# Check if design is balanced: every participant must have exactly one
-# observation in every TIME × DOMAIN × INTERVAL cell. Counts of 0/1 mean
-# no duplicates but some cells are missing; counts above 1 mean duplicates.
-design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL)
-if(all(design_balance == 1)) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations")
-} else if(all(design_balance %in% c(0, 1))) {
-  print("Warning: No duplicate cells, but some participants are missing TIME × DOMAIN × INTERVAL combinations")
-  print(summary(as.vector(design_balance)))
-} else {
-  print("Warning: Design is unbalanced (duplicate observations in some cells)")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10)
-
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = MEAN_DIFFERENCE,
-                             wid = ResponseId,
-                             between = .(TEMPORAL_DO, INTERVAL_DO),
-                             within = .(TIME, DOMAIN, INTERVAL),
-                             type = 3,
-                             detailed = TRUE)
-
-print("ANOVA Results:")
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL  # Reset row numbers to be sequential
-print(anova_output)
-
-# Show Mauchly's test for sphericity
-print("\nMauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  print("\n=== CORRECTED DEGREES OF FREEDOM ===")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  print("\n=== CORRECTED F-TESTS ===")
-
-  # Between-subjects effects (no sphericity corrections needed)
-  print("\nBETWEEN-SUBJECTS EFFECTS:")
-  between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO")
-  for(effect in between_effects) {
-    if(effect %in% anova_table$Effect) {
-      f_value <- anova_table$F[anova_table$Effect == effect]
-      dfn <- anova_table$DFn[anova_table$Effect == effect]
-      dfd <- anova_table$DFd[anova_table$Effect == effect]
-      p_value <- anova_table$p[anova_table$Effect == effect]
-
-      print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value))
-    }
-  }
-
-  # Within-subjects effects (sphericity corrections where applicable)
-  print("\nWITHIN-SUBJECTS EFFECTS:")
-
-  # TIME main effect (2 levels, sphericity automatically satisfied)
-  if("TIME" %in% anova_table$Effect) {
-    f_value <- anova_table$F[anova_table$Effect == "TIME"]
-    dfn <- anova_table$DFn[anova_table$Effect == "TIME"]
-    dfd <- anova_table$DFd[anova_table$Effect == "TIME"]
-    p_value <- anova_table$p[anova_table$Effect == "TIME"]
-    print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value))
-  }
-
-  # DOMAIN main effect (3 levels; uncorrected test here, corrected version printed below)
-  if("DOMAIN" %in% anova_table$Effect) {
-    f_value <- anova_table$F[anova_table$Effect == "DOMAIN"]
-    dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"]
-    dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"]
-    p_value <- anova_table$p[anova_table$Effect == "DOMAIN"]
-    print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value))
-  }
-
-  # INTERVAL main effect (3 levels; uncorrected test here, corrected version printed below)
-  if("INTERVAL" %in% anova_table$Effect) {
-    f_value <- anova_table$F[anova_table$Effect == "INTERVAL"]
-    dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"]
-    dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"]
-    p_value <- anova_table$p[anova_table$Effect == "INTERVAL"]
-    print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value))
-  }
-
-  # Sphericity-corrected tests: corrected_df covers every within-subjects
-  # effect with more than 1 numerator df, i.e. the DOMAIN and INTERVAL main
-  # effects as well as the interactions
-  print("\nSPHERICITY-CORRECTED TESTS (within-subjects effects):")
-  for(i in seq_len(nrow(corrected_df))) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    print(sprintf("\n%s:", effect))
-    print(sprintf("  Original: F(%d, %d) = %.3f",
-                  corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    print(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f",
-                  corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    print(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f",
-                  corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  print("\nNote: ezANOVA returned no sphericity corrections (they only apply when a within-subjects factor has more than two levels)")
-}
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Extract generalized eta squared from ezANOVA (already calculated)
-effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")]
-effect_sizes$ges <- round(effect_sizes$ges, 5)
-print("Generalized Eta Squared:")
-print(effect_sizes)
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Create aov model for emmeans (emmeans requires a fitted model object, not the ezANOVA output list)
-aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)),
-                 data = long_data_clean)
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print("Estimated Marginal Means:")
-print(time_emmeans)
-print("\nPairwise Contrasts:")
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-print("\nMain Effect of DOMAIN:")
-domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
-print("Estimated Marginal Means:")
-print(domain_emmeans)
-print("\nPairwise Contrasts:")
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of INTERVAL
-print("\nMain Effect of INTERVAL:")
-interval_emmeans <- emmeans(aov_model, ~ INTERVAL)
-print("Estimated Marginal Means:")
-print(interval_emmeans)
-print("\nPairwise Contrasts:")
-interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni")
-print(interval_contrasts)
-
-# Main effect of TEMPORAL_DO
-print("\nMain Effect of TEMPORAL_DO:")
-temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# Main effect of INTERVAL_DO
-print("\nMain Effect of INTERVAL_DO:")
-interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO)
-interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni")
-print(interval_do_contrasts)
-
-# =============================================================================
-# COHEN'S D FOR MAIN EFFECTS
-# =============================================================================
-
-print("\n=== COHEN'S D FOR MAIN EFFECTS ===")
-
-# Main Effect of TIME (if significant)
-print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===")
-time_main_contrast <- pairs(time_emmeans, adjust = "none")
-time_main_df <- as.data.frame(time_main_contrast)
-print("TIME main effect contrast:")
-print(time_main_df)
-
-# Calculate Cohen's d for TIME main effect
-if(nrow(time_main_df) > 0) {
-  print("\nCohen's d for TIME main effect:")
-  time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"]
-  time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"]
-
-  time_cohens_d <- cohen.d(time_past_data, time_future_data)
-  print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data)))
-  print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate))
-  print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude))
-  print(sprintf("p-value: %.5f", time_main_df$p.value[1]))
-}
-
-# Main Effect of DOMAIN (if significant)
-print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===")
-domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni")
-domain_main_df <- as.data.frame(domain_main_contrast)
-print("DOMAIN main effect contrasts:")
-print(domain_main_df)
-
-# Calculate Cohen's d for significant DOMAIN contrasts
-significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ]
-if(nrow(significant_domain) > 0) {
-  print("\nCohen's d for significant DOMAIN contrasts:")
-  for(i in seq_len(nrow(significant_domain))) {
-    contrast_name <- as.character(significant_domain$contrast[i])
-    contrast_parts <- strsplit(contrast_name, " - ")[[1]]
-    if(length(contrast_parts) == 2) {
-      level1 <- trimws(contrast_parts[1])
-      level2 <- trimws(contrast_parts[2])
-
-      data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1]
-      data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2]
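-      # Caveat: data1 and data2 pool every TIME and INTERVAL observation for
-      # each domain level, so the cohen.d() call below treats repeated
-      # measures from the same participants as independent samples and
-      # likely overstates n. Averaging to one value per participant per
-      # domain and using effsize::cohen.d(..., paired = TRUE) would respect
-      # the pairing.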
- if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) -print("\n=== COHEN'S D FOR INTERVAL MAIN EFFECT ===") -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) -print("INTERVAL main effect contrasts:") -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - -print("\n=== INTERACTION EXPLORATIONS ===") - -# First, identify which interactions are significant -significant_interactions <- anova_output[anova_output$p < 0.05 & grepl(":", anova_output$Effect), ] - -if(nrow(significant_interactions) > 0) { - print("Significant interactions found:") - print(significant_interactions[, c("Effect", "p")]) - - # TIME × DOMAIN interaction (if significant) - if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { - print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") - time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) - print("Estimated Marginal Means:") - print(time_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TIME:") - time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") - print(time_domain_simple) - - print("\nSimple Effects of TIME within each DOMAIN:") - time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(time_domain_simple2) - } - - # TIME × INTERVAL interaction (if significant) - if("TIME:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL"] < 0.05) { - print("\n=== TIME × INTERVAL INTERACTION (SIGNIFICANT) ===") - time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) - print("Estimated Marginal Means:") - print(time_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TIME:") - 
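-    # Note: pairs(..., by = "TIME") applies the Bonferroni adjustment within
-    # each TIME family separately, not across the two families. The gate
-    # above also tests the uncorrected ezANOVA p-value; for the 3-level
-    # within-subjects factors, the GG-corrected p[GG] from the sphericity
-    # table is arguably the safer criterion.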
time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") - print(time_interval_simple) - - print("\nSimple Effects of TIME within each INTERVAL:") - time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(time_interval_simple2) - } - - # DOMAIN × INTERVAL interaction (if significant) - if("DOMAIN:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "DOMAIN:INTERVAL"] < 0.05) { - print("\n=== DOMAIN × INTERVAL INTERACTION (SIGNIFICANT) ===") - domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) - print("Estimated Marginal Means:") - print(domain_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each DOMAIN:") - domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(domain_interval_simple) - - print("\nSimple Effects of DOMAIN within each INTERVAL:") - domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(domain_interval_simple2) - } - - # TEMPORAL_DO × TIME interaction (if significant) - if("TEMPORAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:TIME"] < 0.05) { - print("\n=== TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - print("Estimated Marginal Means:") - print(temporal_time_emmeans) - - print("\nSimple Effects of TIME within each TEMPORAL_DO:") - temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_time_simple) - - print("\nSimple Effects of TEMPORAL_DO within each TIME:") - temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") - print(temporal_time_simple2) - } - - # TEMPORAL_DO × DOMAIN interaction (if significant) - if("TEMPORAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:DOMAIN"] < 0.05) { - print("\n=== TEMPORAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(temporal_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") - temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_domain_simple) - - print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") - temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(temporal_domain_simple2) - } - - # TEMPORAL_DO × INTERVAL interaction (if significant) - if("TEMPORAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:INTERVAL"] < 0.05) { - print("\n=== TEMPORAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - temporal_interval_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(temporal_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TEMPORAL_DO:") - temporal_interval_simple <- pairs(temporal_interval_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_interval_simple) - - print("\nSimple Effects of TEMPORAL_DO within each INTERVAL:") - temporal_interval_simple2 <- pairs(temporal_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(temporal_interval_simple2) - } - - # INTERVAL_DO × TIME interaction (if significant) - if("INTERVAL_DO:TIME" %in% anova_output$Effect && 
anova_output$p[anova_output$Effect == "INTERVAL_DO:TIME"] < 0.05) { - print("\n=== INTERVAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - interval_do_time_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * TIME) - print("Estimated Marginal Means:") - print(interval_do_time_emmeans) - - print("\nSimple Effects of TIME within each INTERVAL_DO:") - interval_do_time_simple <- pairs(interval_do_time_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_time_simple) - - print("\nSimple Effects of INTERVAL_DO within each TIME:") - interval_do_time_simple2 <- pairs(interval_do_time_emmeans, by = "TIME", adjust = "bonferroni") - print(interval_do_time_simple2) - } - - # INTERVAL_DO × DOMAIN interaction (if significant) - if("INTERVAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:DOMAIN"] < 0.05) { - print("\n=== INTERVAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - interval_do_domain_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(interval_do_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each INTERVAL_DO:") - interval_do_domain_simple <- pairs(interval_do_domain_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_domain_simple) - - print("\nSimple Effects of INTERVAL_DO within each DOMAIN:") - interval_do_domain_simple2 <- pairs(interval_do_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(interval_do_domain_simple2) - } - - # INTERVAL_DO × INTERVAL interaction (if significant) - if("INTERVAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:INTERVAL"] < 0.05) { - print("\n=== INTERVAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - interval_do_interval_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(interval_do_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each INTERVAL_DO:") - interval_do_interval_simple <- pairs(interval_do_interval_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_interval_simple) - - print("\nSimple Effects of INTERVAL_DO within each INTERVAL:") - interval_do_interval_simple2 <- pairs(interval_do_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(interval_do_interval_simple2) - } - -} else { - print("No significant interactions found.") - print("All interaction p-values:") - interaction_effects <- anova_output[grepl(":", anova_output$Effect), ] - print(interaction_effects[, c("Effect", "p")]) -} - -print("\n=== ANALYSIS COMPLETE ===") -print("Mixed ANOVA analysis with three within-subjects factors (TIME, DOMAIN, INTERVAL)") -print("and two between-subjects factors (TEMPORAL_DO, INTERVAL_DO) completed.") -print("Check the results above for significant effects and perform additional") -print("interaction analyses as needed based on the significance patterns.") - -# ============================================================================= -# INTERACTION PLOT: TEMPORAL_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - -print("\n=== CREATING 3-WAY INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for TEMPORAL_DO × TIME × INTERVAL -emm_3way <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way <- emm_3way %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & 
!is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.3, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251006191941.r b/.history/eohi2/mixed anova - domain means_20251006191941.r deleted file mode 100644 index 64ce2dc..0000000 --- a/.history/eohi2/mixed anova - domain means_20251006191941.r +++ /dev/null @@ -1,822 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. 
-# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN", - "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Domain mapping created:") -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean 
= round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:") -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors -print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===") - -# Check what values the between-subjects factors actually have -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("Unique INTERVAL_DO values:") -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination -print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===") - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data 
dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$ResponseId))))
-print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (",
-            length(levels(long_data_clean$DOMAIN)), "), INTERVAL (",
-            length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (",
-            length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (",
-            length(levels(long_data_clean$INTERVAL_DO)), ")", sep = ""))
-
-# Check for complete cases
-complete_cases <- sum(complete.cases(long_data_clean))
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean)))
-
-# Check if design is balanced: every participant must have exactly one
-# observation in every TIME × DOMAIN × INTERVAL cell. Counts of 0/1 mean
-# no duplicates but some cells are missing; counts above 1 mean duplicates.
-design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL)
-if(all(design_balance == 1)) {
-  print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations")
-} else if(all(design_balance %in% c(0, 1))) {
-  print("Warning: No duplicate cells, but some participants are missing TIME × DOMAIN × INTERVAL combinations")
-  print(summary(as.vector(design_balance)))
-} else {
-  print("Warning: Design is unbalanced (duplicate observations in some cells)")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10)
-
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = MEAN_DIFFERENCE,
-                             wid = ResponseId,
-                             between = .(TEMPORAL_DO, INTERVAL_DO),
-                             within = .(TIME, DOMAIN, INTERVAL),
-                             type = 3,
-                             detailed = TRUE)
-
-print("ANOVA Results:")
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL  # Reset row numbers to be sequential
-print(anova_output)
-
-# Show Mauchly's test for sphericity
-print("\nMauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  print("\n=== CORRECTED DEGREES OF FREEDOM ===")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  print("\n=== CORRECTED F-TESTS ===")
-
-  # Between-subjects effects (no sphericity corrections needed)
-  print("\nBETWEEN-SUBJECTS EFFECTS:")
-  between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO")
-  for(effect in between_effects) {
-    if(effect %in% anova_table$Effect) {
-      f_value <- anova_table$F[anova_table$Effect == effect]
-      dfn <- anova_table$DFn[anova_table$Effect == effect]
-      dfd <- anova_table$DFd[anova_table$Effect == effect]
-      p_value <- anova_table$p[anova_table$Effect == effect]
-
-      print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value))
-    }
-  }
-
-  # Within-subjects effects (sphericity corrections where applicable)
-  print("\nWITHIN-SUBJECTS EFFECTS:")
-
-  # TIME main effect (2 levels, sphericity automatically satisfied)
-  if("TIME" %in% anova_table$Effect) {
-    f_value <- anova_table$F[anova_table$Effect == "TIME"]
-    dfn <- anova_table$DFn[anova_table$Effect == "TIME"]
-    dfd <- anova_table$DFd[anova_table$Effect == "TIME"]
-    p_value <- anova_table$p[anova_table$Effect == "TIME"]
-    print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value))
-  }
-
-  # DOMAIN main effect (3 levels; uncorrected test here, corrected version printed below)
-  if("DOMAIN" %in% anova_table$Effect) {
-    f_value <- anova_table$F[anova_table$Effect == "DOMAIN"]
-    dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"]
-    dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"]
-    p_value <- anova_table$p[anova_table$Effect == "DOMAIN"]
-    print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value))
-  }
-
-  # INTERVAL main effect (3 levels; uncorrected test here, corrected version printed below)
-  if("INTERVAL" %in% anova_table$Effect) {
-    f_value <- anova_table$F[anova_table$Effect == "INTERVAL"]
-    dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"]
-    dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"]
-    p_value <- anova_table$p[anova_table$Effect == "INTERVAL"]
-    print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value))
-  }
-
-  # Sphericity-corrected tests: corrected_df covers every within-subjects
-  # effect with more than 1 numerator df, i.e. the DOMAIN and INTERVAL main
-  # effects as well as the interactions
-  print("\nSPHERICITY-CORRECTED TESTS (within-subjects effects):")
-  for(i in seq_len(nrow(corrected_df))) {
-    effect <- corrected_df$Effect[i]
-    f_value <- anova_table$F[match(effect, anova_table$Effect)]
-
-    print(sprintf("\n%s:", effect))
-    print(sprintf("  Original: F(%d, %d) = %.3f",
-                  corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value))
-    print(sprintf("  GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f",
-                  corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i]))
-    print(sprintf("  HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f",
-                  corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i]))
-  }
-} else {
-  print("\nNote: ezANOVA returned no sphericity corrections (they only apply when a within-subjects factor has more than two levels)")
-}
-
-# =============================================================================
-# EFFECT SIZES (GENERALIZED ETA SQUARED)
-# =============================================================================
-
-print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===")
-
-# Extract generalized eta squared from ezANOVA (already calculated)
-effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")]
-effect_sizes$ges <- round(effect_sizes$ges, 5)
-print("Generalized Eta Squared:")
-print(effect_sizes)
-
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-print("\n=== POST-HOC COMPARISONS ===")
-
-# Create aov model for emmeans (emmeans requires a fitted model object, not the ezANOVA output list)
-aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * 
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans
-cat("\n=== POST-HOC COMPARISONS ===\n")
-
-# Create an aov model for emmeans (emmeans needs a fitted model object such as
-# aov, not the list that ezANOVA returns)
-aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)),
-                 data = long_data_clean)
-
-# Main effect of TIME
-print("Main Effect of TIME:")
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print("Estimated Marginal Means:")
-print(time_emmeans)
-cat("\nPairwise Contrasts:\n")
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-cat("\nMain Effect of DOMAIN:\n")
-domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
-print("Estimated Marginal Means:")
-print(domain_emmeans)
-cat("\nPairwise Contrasts:\n")
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of INTERVAL
-cat("\nMain Effect of INTERVAL:\n")
-interval_emmeans <- emmeans(aov_model, ~ INTERVAL)
-print("Estimated Marginal Means:")
-print(interval_emmeans)
-cat("\nPairwise Contrasts:\n")
-interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni")
-print(interval_contrasts)
-
-# Main effect of TEMPORAL_DO
-cat("\nMain Effect of TEMPORAL_DO:\n")
-temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# Main effect of INTERVAL_DO
-cat("\nMain Effect of INTERVAL_DO:\n")
-interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO)
-interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni")
-print(interval_do_contrasts)
-
-# =============================================================================
-# COHEN'S D FOR MAIN EFFECTS
-# =============================================================================
-
-cat("\n=== COHEN'S D FOR MAIN EFFECTS ===\n")
-
-# Main Effect of TIME
-cat("\n=== COHEN'S D FOR TIME MAIN EFFECT ===\n")
-time_main_contrast <- pairs(time_emmeans, adjust = "none")
-time_main_df <- as.data.frame(time_main_contrast)
-print("TIME main effect contrast:")
-print(time_main_df)
-
-# Calculate Cohen's d for TIME main effect
-# (note: cohen.d() below treats the Past and Future scores as two independent
-# samples, even though TIME is a within-subjects factor)
-if(nrow(time_main_df) > 0) {
-  cat("\nCohen's d for TIME main effect:\n")
-  time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"]
-  time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"]
-
-  time_cohens_d <- cohen.d(time_past_data, time_future_data)
-  print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data)))
-  print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate))
-  print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude))
-  print(sprintf("p-value: %.5f", time_main_df$p.value[1]))
-}
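# --- Editor's sketch (not part of the original snapshot) ---------------------
# Because TIME is within-subject, a paired alternative to the independent-
# samples d above is d_z: average each participant's scores per TIME level,
# then standardize the mean Past - Future difference by the SD of those
# participant-level differences.
paired_means <- long_data_clean %>%
  group_by(ResponseId, TIME) %>%
  summarise(m = mean(MEAN_DIFFERENCE), .groups = "drop") %>%
  pivot_wider(names_from = TIME, values_from = m)
time_diffs <- paired_means$Past - paired_means$Future
print(sprintf("Within-subject d_z (Past - Future): %.5f",
              mean(time_diffs, na.rm = TRUE) / sd(time_diffs, na.rm = TRUE)))
# ------------------------------------------------------------------------------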
-# The DOMAIN and INTERVAL main effects use identical Cohen's d logic, so it is
-# factored into one helper: split each significant contrast label into its two
-# levels, pull the matching observations, and report d.
-report_cohens_d <- function(contrast_df, factor_name) {
-  significant <- contrast_df[contrast_df$p.value < 0.05, ]
-  if(nrow(significant) == 0) return(invisible(NULL))
-  cat(sprintf("\nCohen's d for significant %s contrasts:\n", factor_name))
-  for(i in seq_len(nrow(significant))) {
-    contrast_name <- as.character(significant$contrast[i])
-    contrast_parts <- trimws(strsplit(contrast_name, " - ")[[1]])
-    if(length(contrast_parts) == 2) {
-      group <- long_data_clean[[factor_name]]
-      data1 <- long_data_clean$MEAN_DIFFERENCE[group == contrast_parts[1]]
-      data2 <- long_data_clean$MEAN_DIFFERENCE[group == contrast_parts[2]]
-      if(length(data1) > 0 && length(data2) > 0) {
-        d <- cohen.d(data1, data2)
-        print(sprintf("Comparison: %s", contrast_name))
-        print(sprintf("  n1 = %d, n2 = %d", length(data1), length(data2)))
-        print(sprintf("  Cohen's d: %.5f", d$estimate))
-        print(sprintf("  Effect size interpretation: %s", d$magnitude))
-        print(sprintf("  p-value: %.5f", significant$p.value[i]))
-        print("")
-      }
-    }
-  }
-}
-
-# Main Effect of DOMAIN
-cat("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===\n")
-domain_main_df <- as.data.frame(pairs(domain_emmeans, adjust = "bonferroni"))
-print("DOMAIN main effect contrasts:")
-print(domain_main_df)
-report_cohens_d(domain_main_df, "DOMAIN")
-
-# Main Effect of INTERVAL
-cat("\n=== COHEN'S D FOR INTERVAL MAIN EFFECT ===\n")
-interval_main_df <- as.data.frame(pairs(interval_emmeans, adjust = "bonferroni"))
-print("INTERVAL main effect contrasts:")
-print(interval_main_df)
-report_cohens_d(interval_main_df, "INTERVAL")
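# --- Editor's sketch (not part of the original snapshot) ---------------------
# effectsize is loaded above but never used; it offers a Cohen's d with a
# confidence interval. Shown here for the two-level TIME factor (the formula
# interface needs exactly two groups, so it does not apply to DOMAIN or
# INTERVAL directly).
print(effectsize::cohens_d(MEAN_DIFFERENCE ~ TIME, data = long_data_clean))
# ------------------------------------------------------------------------------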
-# =============================================================================
-# INTERACTION EXPLORATIONS (if significant)
-# =============================================================================
-
-cat("\n=== INTERACTION EXPLORATIONS ===\n")
-
-# First, identify which interactions are significant
-significant_interactions <- anova_output[anova_output$p < 0.05 & grepl(":", anova_output$Effect), ]
-
-if(nrow(significant_interactions) > 0) {
-  print("Significant interactions found:")
-  print(significant_interactions[, c("Effect", "p")])
-
-  # The nine two-way explorations share one pattern (estimated marginal means
-  # for the pair, then simple effects of each factor within the other), so a
-  # single loop replaces nine near-identical if-blocks.
-  two_way_pairs <- list(
-    c("TIME", "DOMAIN"), c("TIME", "INTERVAL"), c("DOMAIN", "INTERVAL"),
-    c("TEMPORAL_DO", "TIME"), c("TEMPORAL_DO", "DOMAIN"), c("TEMPORAL_DO", "INTERVAL"),
-    c("INTERVAL_DO", "TIME"), c("INTERVAL_DO", "DOMAIN"), c("INTERVAL_DO", "INTERVAL")
-  )
-
-  for(pair in two_way_pairs) {
-    effect_name <- paste(pair, collapse = ":")
-    p_val <- anova_output$p[anova_output$Effect == effect_name]
-    if(length(p_val) == 1 && !is.na(p_val) && p_val < 0.05) {
-      cat(sprintf("\n=== %s × %s INTERACTION (SIGNIFICANT) ===\n", pair[1], pair[2]))
-      pair_emmeans <- emmeans(aov_model, specs = pair)
-      print("Estimated Marginal Means:")
-      print(pair_emmeans)
-
-      cat(sprintf("\nSimple Effects of %s within each %s:\n", pair[2], pair[1]))
-      print(pairs(pair_emmeans, by = pair[1], adjust = "bonferroni"))
-
-      cat(sprintf("\nSimple Effects of %s within each %s:\n", pair[1], pair[2]))
-      print(pairs(pair_emmeans, by = pair[2], adjust = "bonferroni"))
-    }
-  }
-
-} else {
-  print("No significant interactions found.")
-  print("All interaction p-values:")
-  interaction_effects <- anova_output[grepl(":", anova_output$Effect), ]
-  print(interaction_effects[, c("Effect", "p")])
-}
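# --- Editor's sketch (not part of the original snapshot) ---------------------
# For a quick visual read on the same 3-way pattern before the hand-built
# ggplot below, emmeans' emmip() draws the estimated marginal means directly.
print(emmip(aov_model, TIME ~ TEMPORAL_DO | INTERVAL, CIs = TRUE))
# ------------------------------------------------------------------------------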
-cat("\n=== ANALYSIS COMPLETE ===\n")
-print("Mixed ANOVA analysis with three within-subjects factors (TIME, DOMAIN, INTERVAL)")
-print("and two between-subjects factors (TEMPORAL_DO, INTERVAL_DO) completed.")
-print("Check the results above for significant effects and perform additional")
-print("interaction analyses as needed based on the significance patterns.")
-
-# =============================================================================
-# INTERACTION PLOT: TEMPORAL_DO × TIME × INTERVAL (Emmeans only)
-# =============================================================================
-
-cat("\n=== CREATING 3-WAY INTERACTION PLOT ===\n")
-
-# Define color palette for TIME
-time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F")
-
-# Create emmeans for TEMPORAL_DO × TIME × INTERVAL
-emm_3way <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * INTERVAL)
-# Prepare emmeans data frame
-emmeans_3way <- emm_3way %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    TIME = factor(TIME, levels = c("Past", "Future")),
-    INTERVAL = factor(INTERVAL),
-    x_pos = as.numeric(TEMPORAL_DO),
-    time_offset = (as.numeric(TIME) - 1.5) * 0.3,
-    x_dodged = x_pos + time_offset
-  )
-
-# Create 3-way interaction plot with facets
-interaction_plot_3way <- ggplot(emmeans_3way) +
-  geom_errorbar(
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME),
-    width = 0.1,
-    linewidth = 1,
-    alpha = 0.8
-  ) +
-  geom_point(
-    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
-    size = 5,
-    stroke = 1.2,
-    color = "black"
-  ) +
-  facet_wrap(~INTERVAL, nrow = 1) +
-  labs(
-    x = "Order",
-    y = "Mean absolute difference from present",
-    title = "TEMPORAL_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2),
-    labels = c("Past First", "Future First"),
-    limits = c(0.5, 2.5)
-  ) +
-  scale_color_manual(name = "Temporal Direction", values = time_colors) +
-  scale_fill_manual(name = "Temporal Direction", values = time_colors) +
-  scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5),
-    strip.text = element_text(size = 11, face = "bold")
-  )
-
-print(interaction_plot_3way)
\ No newline at end of file
diff --git a/.history/eohi2/mixed anova - domain means_20251006191952.r b/.history/eohi2/mixed anova - domain means_20251006191952.r
deleted file mode 100644
index 64ce2dc..0000000
--- a/.history/eohi2/mixed anova - domain means_20251006191952.r
+++ /dev/null
@@ -1,822 +0,0 @@
[822-line snapshot body omitted: a near-identical auto-save of the same script shown above]
diff --git a/.history/eohi2/mixed anova - domain means_20251006192942.r b/.history/eohi2/mixed anova - domain means_20251006192942.r
deleted file mode 100644
index 376274d..0000000
--- a/.history/eohi2/mixed anova - domain means_20251006192942.r
+++ /dev/null
@@ -1,822 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means - EOHI2
-# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor
-# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc.
-# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc.
-# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc.
-# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc.
-# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN
-# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex)        # For aov_ez (cleaner ANOVA output)
-library(nortest)     # For normality tests
-library(emmeans)     # For post-hoc comparisons
-library(purrr)       # For map functions
-library(effsize)     # For Cohen's d calculations
-library(effectsize)  # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-# Read the data
-data <- read.csv("eohi2.csv")
-
-# Display basic information about the dataset
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x ")))
-print(paste("Number of participants:", length(unique(data$ResponseId))))
-
-# Verify the specific variables we need
-required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN",
-                   "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN",
-                   "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN",
-                   "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN",
-                   "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN",
-                   "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required domain mean variables found!")
-}
-
-# Define domain mapping with TIME, DOMAIN, and INTERVAL factors
-domain_mapping <- data.frame(
-  variable = required_vars,
-  time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3),
-           rep("Past", 3), rep("Future", 3)),
-  domain = rep(c("Preferences", "Personality", "Values"), 6),
-  interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3),
-               rep("5_10", 3), rep("5_10", 3)),
-  stringsAsFactors = FALSE
-)
-
-print("Domain mapping created:")
-print(domain_mapping)
-
-# Efficient data pivoting using pivot_longer
-long_data <- data %>%
-  select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>%
-  pivot_longer(
-    cols = all_of(required_vars),
-    names_to = "variable",
-    values_to = "MEAN_DIFFERENCE"
-  ) %>%
-  left_join(domain_mapping, by = "variable") %>%
-  # Convert to factors with proper levels
-  mutate(
-    TIME = factor(time, levels = c("Past", "Future")),
-    DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")),
-    INTERVAL = factor(interval, levels = c("5", "10", "5_10")),
-    ResponseId = as.factor(ResponseId),
-    TEMPORAL_DO = as.factor(TEMPORAL_DO),
-    INTERVAL_DO = as.factor(INTERVAL_DO)
-  ) %>%
-  # Select final columns and remove any rows with missing values
-  select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>%
-  filter(!is.na(MEAN_DIFFERENCE))
-
-print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x ")))
-print(paste("Number of participants:", length(unique(long_data$ResponseId))))
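# --- Editor's sketch (not part of the original snapshot) ---------------------
# Structural check on the reshape: after the NA filter, each participant can
# contribute at most one row per TIME × DOMAIN × INTERVAL cell, i.e. 2 × 3 × 3
# = 18 rows. The tabulation makes any short-counted participants visible.
rows_per_id <- long_data %>% count(ResponseId)
print(table(rows_per_id$n))
# ------------------------------------------------------------------------------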
= round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:") -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors -print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===") - -# Check what values the between-subjects factors actually have -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("Unique INTERVAL_DO values:") -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination -print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===") - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data 
dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", 
"TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * 
INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO -print("\nMain Effect of INTERVAL_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (if significant) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - 
- if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) -print("\n=== COHEN'S D FOR INTERVAL MAIN EFFECT ===") -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) -print("INTERVAL main effect contrasts:") -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - -print("\n=== INTERACTION EXPLORATIONS ===") - -# First, identify which interactions are significant -significant_interactions <- anova_output[anova_output$p < 0.05 & grepl(":", anova_output$Effect), ] - -if(nrow(significant_interactions) > 0) { - print("Significant interactions found:") - print(significant_interactions[, c("Effect", "p")]) - - # TIME × DOMAIN interaction (if significant) - if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { - print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") - time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) - print("Estimated Marginal Means:") - print(time_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TIME:") - time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") - print(time_domain_simple) - - print("\nSimple Effects of TIME within each DOMAIN:") - time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(time_domain_simple2) - } - - # TIME × INTERVAL interaction (if significant) - if("TIME:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL"] < 0.05) { - print("\n=== TIME × INTERVAL INTERACTION (SIGNIFICANT) ===") - time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) - print("Estimated Marginal Means:") - print(time_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TIME:") - 
time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") - print(time_interval_simple) - - print("\nSimple Effects of TIME within each INTERVAL:") - time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(time_interval_simple2) - } - - # DOMAIN × INTERVAL interaction (if significant) - if("DOMAIN:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "DOMAIN:INTERVAL"] < 0.05) { - print("\n=== DOMAIN × INTERVAL INTERACTION (SIGNIFICANT) ===") - domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) - print("Estimated Marginal Means:") - print(domain_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each DOMAIN:") - domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(domain_interval_simple) - - print("\nSimple Effects of DOMAIN within each INTERVAL:") - domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(domain_interval_simple2) - } - - # TEMPORAL_DO × TIME interaction (if significant) - if("TEMPORAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:TIME"] < 0.05) { - print("\n=== TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - print("Estimated Marginal Means:") - print(temporal_time_emmeans) - - print("\nSimple Effects of TIME within each TEMPORAL_DO:") - temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_time_simple) - - print("\nSimple Effects of TEMPORAL_DO within each TIME:") - temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") - print(temporal_time_simple2) - } - - # TEMPORAL_DO × DOMAIN interaction (if significant) - if("TEMPORAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:DOMAIN"] < 0.05) { - print("\n=== TEMPORAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(temporal_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") - temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_domain_simple) - - print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") - temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(temporal_domain_simple2) - } - - # TEMPORAL_DO × INTERVAL interaction (if significant) - if("TEMPORAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:INTERVAL"] < 0.05) { - print("\n=== TEMPORAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - temporal_interval_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(temporal_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TEMPORAL_DO:") - temporal_interval_simple <- pairs(temporal_interval_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_interval_simple) - - print("\nSimple Effects of TEMPORAL_DO within each INTERVAL:") - temporal_interval_simple2 <- pairs(temporal_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(temporal_interval_simple2) - } - - # INTERVAL_DO × TIME interaction (if significant) - if("INTERVAL_DO:TIME" %in% anova_output$Effect && 
anova_output$p[anova_output$Effect == "INTERVAL_DO:TIME"] < 0.05) { - print("\n=== INTERVAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - interval_do_time_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * TIME) - print("Estimated Marginal Means:") - print(interval_do_time_emmeans) - - print("\nSimple Effects of TIME within each INTERVAL_DO:") - interval_do_time_simple <- pairs(interval_do_time_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_time_simple) - - print("\nSimple Effects of INTERVAL_DO within each TIME:") - interval_do_time_simple2 <- pairs(interval_do_time_emmeans, by = "TIME", adjust = "bonferroni") - print(interval_do_time_simple2) - } - - # INTERVAL_DO × DOMAIN interaction (if significant) - if("INTERVAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:DOMAIN"] < 0.05) { - print("\n=== INTERVAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - interval_do_domain_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(interval_do_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each INTERVAL_DO:") - interval_do_domain_simple <- pairs(interval_do_domain_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_domain_simple) - - print("\nSimple Effects of INTERVAL_DO within each DOMAIN:") - interval_do_domain_simple2 <- pairs(interval_do_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(interval_do_domain_simple2) - } - - # INTERVAL_DO × INTERVAL interaction (if significant) - if("INTERVAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:INTERVAL"] < 0.05) { - print("\n=== INTERVAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - interval_do_interval_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(interval_do_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each INTERVAL_DO:") - interval_do_interval_simple <- pairs(interval_do_interval_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_interval_simple) - - print("\nSimple Effects of INTERVAL_DO within each INTERVAL:") - interval_do_interval_simple2 <- pairs(interval_do_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(interval_do_interval_simple2) - } - -} else { - print("No significant interactions found.") - print("All interaction p-values:") - interaction_effects <- anova_output[grepl(":", anova_output$Effect), ] - print(interaction_effects[, c("Effect", "p")]) -} - -print("\n=== ANALYSIS COMPLETE ===") -print("Mixed ANOVA analysis with three within-subjects factors (TIME, DOMAIN, INTERVAL)") -print("and two between-subjects factors (TEMPORAL_DO, INTERVAL_DO) completed.") -print("Check the results above for significant effects and perform additional") -print("interaction analyses as needed based on the significance patterns.") - -# ============================================================================= -# INTERACTION PLOT: TEMPORAL_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - -print("\n=== CREATING 3-WAY INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for TEMPORAL_DO × TIME × INTERVAL -emm_3way <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way <- emm_3way %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & 
!is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251006225442.r b/.history/eohi2/mixed anova - domain means_20251006225442.r deleted file mode 100644 index 376274d..0000000 --- a/.history/eohi2/mixed anova - domain means_20251006225442.r +++ /dev/null @@ -1,822 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. 
-# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN", - "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Domain mapping created:") -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean 
= round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:") -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors -print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===") - -# Check what values the between-subjects factors actually have -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("Unique INTERVAL_DO values:") -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination -print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===") - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data 
dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", 
"TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * 
INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO -print("\nMain Effect of INTERVAL_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (if significant) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - 
- if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) -print("\n=== COHEN'S D FOR INTERVAL MAIN EFFECT ===") -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) -print("INTERVAL main effect contrasts:") -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - -print("\n=== INTERACTION EXPLORATIONS ===") - -# First, identify which interactions are significant -significant_interactions <- anova_output[anova_output$p < 0.05 & grepl(":", anova_output$Effect), ] - -if(nrow(significant_interactions) > 0) { - print("Significant interactions found:") - print(significant_interactions[, c("Effect", "p")]) - - # TIME × DOMAIN interaction (if significant) - if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { - print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") - time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) - print("Estimated Marginal Means:") - print(time_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TIME:") - time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") - print(time_domain_simple) - - print("\nSimple Effects of TIME within each DOMAIN:") - time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(time_domain_simple2) - } - - # TIME × INTERVAL interaction (if significant) - if("TIME:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL"] < 0.05) { - print("\n=== TIME × INTERVAL INTERACTION (SIGNIFICANT) ===") - time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) - print("Estimated Marginal Means:") - print(time_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TIME:") - 
time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") - print(time_interval_simple) - - print("\nSimple Effects of TIME within each INTERVAL:") - time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(time_interval_simple2) - } - - # DOMAIN × INTERVAL interaction (if significant) - if("DOMAIN:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "DOMAIN:INTERVAL"] < 0.05) { - print("\n=== DOMAIN × INTERVAL INTERACTION (SIGNIFICANT) ===") - domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) - print("Estimated Marginal Means:") - print(domain_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each DOMAIN:") - domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(domain_interval_simple) - - print("\nSimple Effects of DOMAIN within each INTERVAL:") - domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(domain_interval_simple2) - } - - # TEMPORAL_DO × TIME interaction (if significant) - if("TEMPORAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:TIME"] < 0.05) { - print("\n=== TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - print("Estimated Marginal Means:") - print(temporal_time_emmeans) - - print("\nSimple Effects of TIME within each TEMPORAL_DO:") - temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_time_simple) - - print("\nSimple Effects of TEMPORAL_DO within each TIME:") - temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") - print(temporal_time_simple2) - } - - # TEMPORAL_DO × DOMAIN interaction (if significant) - if("TEMPORAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:DOMAIN"] < 0.05) { - print("\n=== TEMPORAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(temporal_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") - temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_domain_simple) - - print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") - temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(temporal_domain_simple2) - } - - # TEMPORAL_DO × INTERVAL interaction (if significant) - if("TEMPORAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:INTERVAL"] < 0.05) { - print("\n=== TEMPORAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - temporal_interval_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(temporal_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TEMPORAL_DO:") - temporal_interval_simple <- pairs(temporal_interval_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_interval_simple) - - print("\nSimple Effects of TEMPORAL_DO within each INTERVAL:") - temporal_interval_simple2 <- pairs(temporal_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(temporal_interval_simple2) - } - - # INTERVAL_DO × TIME interaction (if significant) - if("INTERVAL_DO:TIME" %in% anova_output$Effect && 
anova_output$p[anova_output$Effect == "INTERVAL_DO:TIME"] < 0.05) { - print("\n=== INTERVAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - interval_do_time_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * TIME) - print("Estimated Marginal Means:") - print(interval_do_time_emmeans) - - print("\nSimple Effects of TIME within each INTERVAL_DO:") - interval_do_time_simple <- pairs(interval_do_time_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_time_simple) - - print("\nSimple Effects of INTERVAL_DO within each TIME:") - interval_do_time_simple2 <- pairs(interval_do_time_emmeans, by = "TIME", adjust = "bonferroni") - print(interval_do_time_simple2) - } - - # INTERVAL_DO × DOMAIN interaction (if significant) - if("INTERVAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:DOMAIN"] < 0.05) { - print("\n=== INTERVAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - interval_do_domain_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(interval_do_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each INTERVAL_DO:") - interval_do_domain_simple <- pairs(interval_do_domain_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_domain_simple) - - print("\nSimple Effects of INTERVAL_DO within each DOMAIN:") - interval_do_domain_simple2 <- pairs(interval_do_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(interval_do_domain_simple2) - } - - # INTERVAL_DO × INTERVAL interaction (if significant) - if("INTERVAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:INTERVAL"] < 0.05) { - print("\n=== INTERVAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - interval_do_interval_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(interval_do_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each INTERVAL_DO:") - interval_do_interval_simple <- pairs(interval_do_interval_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_interval_simple) - - print("\nSimple Effects of INTERVAL_DO within each INTERVAL:") - interval_do_interval_simple2 <- pairs(interval_do_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(interval_do_interval_simple2) - } - -} else { - print("No significant interactions found.") - print("All interaction p-values:") - interaction_effects <- anova_output[grepl(":", anova_output$Effect), ] - print(interaction_effects[, c("Effect", "p")]) -} - -print("\n=== ANALYSIS COMPLETE ===") -print("Mixed ANOVA analysis with three within-subjects factors (TIME, DOMAIN, INTERVAL)") -print("and two between-subjects factors (TEMPORAL_DO, INTERVAL_DO) completed.") -print("Check the results above for significant effects and perform additional") -print("interaction analyses as needed based on the significance patterns.") - -# ============================================================================= -# INTERACTION PLOT: TEMPORAL_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - -print("\n=== CREATING 3-WAY INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for TEMPORAL_DO × TIME × INTERVAL -emm_3way <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way <- emm_3way %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & 
!is.na(emmean)) %>%
-  rename(
-    ci_lower = lower.CL,
-    ci_upper = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    TIME = factor(TIME, levels = c("Past", "Future")),
-    INTERVAL = factor(INTERVAL),
-    x_pos = as.numeric(TEMPORAL_DO),
-    time_offset = (as.numeric(TIME) - 1.5) * 0.2,
-    x_dodged = x_pos + time_offset
-  )
-
-# Create 3-way interaction plot with facets
-interaction_plot_3way <- ggplot(emmeans_3way) +
-  geom_errorbar(
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME),
-    width = 0.1,
-    linewidth = 1,
-    alpha = 0.8
-  ) +
-  geom_point(
-    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
-    size = 5,
-    stroke = 1.2,
-    color = "black"
-  ) +
-  facet_wrap(~INTERVAL, nrow = 1) +
-  labs(
-    x = "Order",
-    y = "Mean absolute difference from present",
-    title = "TEMPORAL_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2),
-    labels = c("Past First", "Future First"),
-    limits = c(0.5, 2.5)
-  ) +
-  scale_color_manual(name = "Temporal Direction", values = time_colors) +
-  scale_fill_manual(name = "Temporal Direction", values = time_colors) +
-  scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5),
-    strip.text = element_text(size = 11, face = "bold")
-  )
-
-print(interaction_plot_3way)
\ No newline at end of file
diff --git a/.history/eohi2/mixed anova - domain means_20251007105958.r b/.history/eohi2/mixed anova - domain means_20251007105958.r
deleted file mode 100644
index 3521624..0000000
--- a/.history/eohi2/mixed anova - domain means_20251007105958.r
+++ /dev/null
@@ -1,878 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means - EOHI2
-# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor
-# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc.
-# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc.
-# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc.
-# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc.
-# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN
-# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex) # For aov_ez (cleaner ANOVA output)
-library(nortest) # For normality tests
-library(emmeans) # For post-hoc comparisons
-library(purrr) # For map functions
-library(effsize) # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-# Read the data
-data <- read.csv("eohi2.csv")
-
-# Display basic information about the dataset
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x")))
-print(paste("Number of participants:", length(unique(data$ResponseId))))
-
-# Verify the specific variables we need
-required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN",
-                   "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN",
-                   "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN",
-                   "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN",
-                   "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN",
-                   "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required domain mean variables found!")
-}
-
-# Define domain mapping with TIME, DOMAIN, and INTERVAL factors
-domain_mapping <- data.frame(
-  variable = required_vars,
-  time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3),
-           rep("Past", 3), rep("Future", 3)),
-  domain = rep(c("Preferences", "Personality", "Values"), 6),
-  interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3),
-               rep("5_10", 3), rep("5_10", 3)),
-  stringsAsFactors = FALSE
-)
-
-print("Domain mapping created:")
-print(domain_mapping)
-
-# Efficient data pivoting using pivot_longer
-long_data <- data %>%
-  select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>%
-  pivot_longer(
-    cols = all_of(required_vars),
-    names_to = "variable",
-    values_to = "MEAN_DIFFERENCE"
-  ) %>%
-  left_join(domain_mapping, by = "variable") %>%
-  # Convert to factors with proper levels
-  mutate(
-    TIME = factor(time, levels = c("Past", "Future")),
-    DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")),
-    INTERVAL = factor(interval, levels = c("5", "10", "5_10")),
-    ResponseId = as.factor(ResponseId),
-    TEMPORAL_DO = as.factor(TEMPORAL_DO),
-    INTERVAL_DO = as.factor(INTERVAL_DO)
-  ) %>%
-  # Select final columns and remove any rows with missing values
-  select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>%
-  filter(!is.na(MEAN_DIFFERENCE))
-
-print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x")))
-print(paste("Number of participants:", length(unique(long_data$ResponseId))))
-
-# =============================================================================
-# DESCRIPTIVE STATISTICS
-# =============================================================================
-
-# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL
-desc_stats <- long_data %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    n = n(),
-    mean
= round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:") -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors -print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===") - -# Check what values the between-subjects factors actually have -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("Unique INTERVAL_DO values:") -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination -print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===") - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data 
dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", 
"TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * 
INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO -print("\nMain Effect of INTERVAL_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (if significant) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - 
- if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) -print("\n=== COHEN'S D FOR INTERVAL MAIN EFFECT ===") -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) -print("INTERVAL main effect contrasts:") -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - -print("\n=== INTERACTION EXPLORATIONS ===") - -# First, identify which interactions are significant -significant_interactions <- anova_output[anova_output$p < 0.05 & grepl(":", anova_output$Effect), ] - -if(nrow(significant_interactions) > 0) { - print("Significant interactions found:") - print(significant_interactions[, c("Effect", "p")]) - - # TIME × DOMAIN interaction (if significant) - if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { - print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") - time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) - print("Estimated Marginal Means:") - print(time_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TIME:") - time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") - print(time_domain_simple) - - print("\nSimple Effects of TIME within each DOMAIN:") - time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(time_domain_simple2) - } - - # TIME × INTERVAL interaction (if significant) - if("TIME:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL"] < 0.05) { - print("\n=== TIME × INTERVAL INTERACTION (SIGNIFICANT) ===") - time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) - print("Estimated Marginal Means:") - print(time_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TIME:") - 
time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") - print(time_interval_simple) - - print("\nSimple Effects of TIME within each INTERVAL:") - time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(time_interval_simple2) - } - - # DOMAIN × INTERVAL interaction (if significant) - if("DOMAIN:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "DOMAIN:INTERVAL"] < 0.05) { - print("\n=== DOMAIN × INTERVAL INTERACTION (SIGNIFICANT) ===") - domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) - print("Estimated Marginal Means:") - print(domain_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each DOMAIN:") - domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(domain_interval_simple) - - print("\nSimple Effects of DOMAIN within each INTERVAL:") - domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(domain_interval_simple2) - } - - # TEMPORAL_DO × TIME interaction (if significant) - if("TEMPORAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:TIME"] < 0.05) { - print("\n=== TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - print("Estimated Marginal Means:") - print(temporal_time_emmeans) - - print("\nSimple Effects of TIME within each TEMPORAL_DO:") - temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_time_simple) - - print("\nSimple Effects of TEMPORAL_DO within each TIME:") - temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") - print(temporal_time_simple2) - } - - # TEMPORAL_DO × DOMAIN interaction (if significant) - if("TEMPORAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:DOMAIN"] < 0.05) { - print("\n=== TEMPORAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(temporal_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") - temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_domain_simple) - - print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") - temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(temporal_domain_simple2) - } - - # TEMPORAL_DO × INTERVAL interaction (if significant) - if("TEMPORAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:INTERVAL"] < 0.05) { - print("\n=== TEMPORAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - temporal_interval_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(temporal_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TEMPORAL_DO:") - temporal_interval_simple <- pairs(temporal_interval_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_interval_simple) - - print("\nSimple Effects of TEMPORAL_DO within each INTERVAL:") - temporal_interval_simple2 <- pairs(temporal_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(temporal_interval_simple2) - } - - # INTERVAL_DO × TIME interaction (if significant) - if("INTERVAL_DO:TIME" %in% anova_output$Effect && 
anova_output$p[anova_output$Effect == "INTERVAL_DO:TIME"] < 0.05) { - print("\n=== INTERVAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - interval_do_time_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * TIME) - print("Estimated Marginal Means:") - print(interval_do_time_emmeans) - - print("\nSimple Effects of TIME within each INTERVAL_DO:") - interval_do_time_simple <- pairs(interval_do_time_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_time_simple) - - print("\nSimple Effects of INTERVAL_DO within each TIME:") - interval_do_time_simple2 <- pairs(interval_do_time_emmeans, by = "TIME", adjust = "bonferroni") - print(interval_do_time_simple2) - } - - # INTERVAL_DO × DOMAIN interaction (if significant) - if("INTERVAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:DOMAIN"] < 0.05) { - print("\n=== INTERVAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - interval_do_domain_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(interval_do_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each INTERVAL_DO:") - interval_do_domain_simple <- pairs(interval_do_domain_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_domain_simple) - - print("\nSimple Effects of INTERVAL_DO within each DOMAIN:") - interval_do_domain_simple2 <- pairs(interval_do_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(interval_do_domain_simple2) - } - - # INTERVAL_DO × INTERVAL interaction (if significant) - if("INTERVAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:INTERVAL"] < 0.05) { - print("\n=== INTERVAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - interval_do_interval_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(interval_do_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each INTERVAL_DO:") - interval_do_interval_simple <- pairs(interval_do_interval_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_interval_simple) - - print("\nSimple Effects of INTERVAL_DO within each INTERVAL:") - interval_do_interval_simple2 <- pairs(interval_do_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(interval_do_interval_simple2) - } - -} else { - print("No significant interactions found.") - print("All interaction p-values:") - interaction_effects <- anova_output[grepl(":", anova_output$Effect), ] - print(interaction_effects[, c("Effect", "p")]) -} - -print("\n=== ANALYSIS COMPLETE ===") -print("Mixed ANOVA analysis with three within-subjects factors (TIME, DOMAIN, INTERVAL)") -print("and two between-subjects factors (TEMPORAL_DO, INTERVAL_DO) completed.") -print("Check the results above for significant effects and perform additional") -print("interaction analyses as needed based on the significance patterns.") - -# ============================================================================= -# INTERACTION PLOT: TEMPORAL_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - -print("\n=== CREATING 3-WAY INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for TEMPORAL_DO × TIME × INTERVAL -emm_3way <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way <- emm_3way %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & 
!is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Emmeans + Error Bars) -# ============================================================================= - -print("\n=== CREATING MAIN-EFFECT PLOT FOR TIME ===") - -# Prepare emmeans data frame for TIME main effect -time_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with the 3-way interaction plot) -time_main_plot <- ggplot(time_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", 
fill = NA, linewidth = 0.5)
-  )
-
-print(time_main_plot)
\ No newline at end of file
diff --git a/.history/eohi2/mixed anova - domain means_20251007110011.r b/.history/eohi2/mixed anova - domain means_20251007110011.r
deleted file mode 100644
index 3521624..0000000
--- a/.history/eohi2/mixed anova - domain means_20251007110011.r
+++ /dev/null
@@ -1,878 +0,0 @@
-# Mixed ANOVA Analysis for Domain Means - EOHI2
-# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor
-# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc.
-# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc.
-# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc.
-# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc.
-# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN
-# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(afex) # For aov_ez (cleaner ANOVA output)
-library(nortest) # For normality tests
-library(emmeans) # For post-hoc comparisons
-library(purrr) # For map functions
-library(effsize) # For Cohen's d calculations
-library(effectsize) # For effect size calculations
-
-# Global options to remove scientific notation
-options(scipen = 999)
-
-# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation)
-options(contrasts = c("contr.sum", "contr.poly"))
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-# Read the data
-data <- read.csv("eohi2.csv")
-
-# Display basic information about the dataset
-print(paste("Dataset dimensions:", paste(dim(data), collapse = " x")))
-print(paste("Number of participants:", length(unique(data$ResponseId))))
-
-# Verify the specific variables we need
-required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN",
-                   "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN",
-                   "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN",
-                   "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN",
-                   "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN",
-                   "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN")
-
-missing_vars <- required_vars[!required_vars %in% colnames(data)]
-if (length(missing_vars) > 0) {
-  print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", ")))
-} else {
-  print("All required domain mean variables found!")
-}
-
-# Define domain mapping with TIME, DOMAIN, and INTERVAL factors
-domain_mapping <- data.frame(
-  variable = required_vars,
-  time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3),
-           rep("Past", 3), rep("Future", 3)),
-  domain = rep(c("Preferences", "Personality", "Values"), 6),
-  interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3),
-               rep("5_10", 3), rep("5_10", 3)),
-  stringsAsFactors = FALSE
-)
-
-print("Domain mapping created:")
-print(domain_mapping)
-
-# Efficient data pivoting using pivot_longer
-long_data <- data %>%
-  select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>%
-  pivot_longer(
-    cols = all_of(required_vars),
-    names_to = "variable",
-    values_to = "MEAN_DIFFERENCE"
-  ) %>%
-  left_join(domain_mapping, by = "variable") %>%
-  # Convert to factors with proper levels
-  mutate(
-    TIME = factor(time, levels = c("Past", "Future")),
-    DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")),
-    INTERVAL = factor(interval, levels = c("5", "10", "5_10")),
-    ResponseId = as.factor(ResponseId),
-    TEMPORAL_DO =
as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:") -print(homogeneity_interval) - -# 5. 
Hartley's F-max test for between-subjects factors -print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===") - -# Check what values the between-subjects factors actually have -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("Unique INTERVAL_DO values:") -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination -print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===") - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with 
sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - 
print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO -print("\nMain Effect of INTERVAL_DO:") -interval_do_emmeans <- 
emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (if significant) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) -print("\n=== COHEN'S D FOR INTERVAL MAIN EFFECT ===") -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) -print("INTERVAL main effect contrasts:") -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- 
long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - -print("\n=== INTERACTION EXPLORATIONS ===") - -# First, identify which interactions are significant -significant_interactions <- anova_output[anova_output$p < 0.05 & grepl(":", anova_output$Effect), ] - -if(nrow(significant_interactions) > 0) { - print("Significant interactions found:") - print(significant_interactions[, c("Effect", "p")]) - - # TIME × DOMAIN interaction (if significant) - if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { - print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") - time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) - print("Estimated Marginal Means:") - print(time_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TIME:") - time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") - print(time_domain_simple) - - print("\nSimple Effects of TIME within each DOMAIN:") - time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(time_domain_simple2) - } - - # TIME × INTERVAL interaction (if significant) - if("TIME:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL"] < 0.05) { - print("\n=== TIME × INTERVAL INTERACTION (SIGNIFICANT) ===") - time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) - print("Estimated Marginal Means:") - print(time_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TIME:") - time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") - print(time_interval_simple) - - print("\nSimple Effects of TIME within each INTERVAL:") - time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(time_interval_simple2) - } - - # DOMAIN × INTERVAL interaction (if significant) - if("DOMAIN:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "DOMAIN:INTERVAL"] < 0.05) { - print("\n=== DOMAIN × INTERVAL INTERACTION (SIGNIFICANT) ===") - domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) - print("Estimated Marginal Means:") - print(domain_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each DOMAIN:") - domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(domain_interval_simple) - - print("\nSimple Effects of DOMAIN within each INTERVAL:") - domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(domain_interval_simple2) - } - - # TEMPORAL_DO × TIME interaction (if significant) - if("TEMPORAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:TIME"] < 0.05) { - print("\n=== TEMPORAL_DO 
× TIME INTERACTION (SIGNIFICANT) ===") - temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - print("Estimated Marginal Means:") - print(temporal_time_emmeans) - - print("\nSimple Effects of TIME within each TEMPORAL_DO:") - temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_time_simple) - - print("\nSimple Effects of TEMPORAL_DO within each TIME:") - temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") - print(temporal_time_simple2) - } - - # TEMPORAL_DO × DOMAIN interaction (if significant) - if("TEMPORAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:DOMAIN"] < 0.05) { - print("\n=== TEMPORAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(temporal_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") - temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_domain_simple) - - print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") - temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(temporal_domain_simple2) - } - - # TEMPORAL_DO × INTERVAL interaction (if significant) - if("TEMPORAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:INTERVAL"] < 0.05) { - print("\n=== TEMPORAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - temporal_interval_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(temporal_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TEMPORAL_DO:") - temporal_interval_simple <- pairs(temporal_interval_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_interval_simple) - - print("\nSimple Effects of TEMPORAL_DO within each INTERVAL:") - temporal_interval_simple2 <- pairs(temporal_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(temporal_interval_simple2) - } - - # INTERVAL_DO × TIME interaction (if significant) - if("INTERVAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:TIME"] < 0.05) { - print("\n=== INTERVAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - interval_do_time_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * TIME) - print("Estimated Marginal Means:") - print(interval_do_time_emmeans) - - print("\nSimple Effects of TIME within each INTERVAL_DO:") - interval_do_time_simple <- pairs(interval_do_time_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_time_simple) - - print("\nSimple Effects of INTERVAL_DO within each TIME:") - interval_do_time_simple2 <- pairs(interval_do_time_emmeans, by = "TIME", adjust = "bonferroni") - print(interval_do_time_simple2) - } - - # INTERVAL_DO × DOMAIN interaction (if significant) - if("INTERVAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:DOMAIN"] < 0.05) { - print("\n=== INTERVAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - interval_do_domain_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(interval_do_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each INTERVAL_DO:") - interval_do_domain_simple <- pairs(interval_do_domain_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_domain_simple) - - 
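-    # Note: INTERVAL_DO is a between-subjects factor, so the contrasts below
-    # compare different participant groups within each DOMAIN level, whereas
-    # the DOMAIN contrasts above are within-subjects comparisons.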
print("\nSimple Effects of INTERVAL_DO within each DOMAIN:") - interval_do_domain_simple2 <- pairs(interval_do_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(interval_do_domain_simple2) - } - - # INTERVAL_DO × INTERVAL interaction (if significant) - if("INTERVAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:INTERVAL"] < 0.05) { - print("\n=== INTERVAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - interval_do_interval_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(interval_do_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each INTERVAL_DO:") - interval_do_interval_simple <- pairs(interval_do_interval_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_interval_simple) - - print("\nSimple Effects of INTERVAL_DO within each INTERVAL:") - interval_do_interval_simple2 <- pairs(interval_do_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(interval_do_interval_simple2) - } - -} else { - print("No significant interactions found.") - print("All interaction p-values:") - interaction_effects <- anova_output[grepl(":", anova_output$Effect), ] - print(interaction_effects[, c("Effect", "p")]) -} - -print("\n=== ANALYSIS COMPLETE ===") -print("Mixed ANOVA analysis with three within-subjects factors (TIME, DOMAIN, INTERVAL)") -print("and two between-subjects factors (TEMPORAL_DO, INTERVAL_DO) completed.") -print("Check the results above for significant effects and perform additional") -print("interaction analyses as needed based on the significance patterns.") - -# ============================================================================= -# INTERACTION PLOT: TEMPORAL_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - -print("\n=== CREATING 3-WAY INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for TEMPORAL_DO × TIME × INTERVAL -emm_3way <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way <- emm_3way %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - 
theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Emmeans + Error Bars) -# ============================================================================= - -print("\n=== CREATING MAIN-EFFECT PLOT FOR TIME ===") - -# Prepare emmeans data frame for TIME main effect -time_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with the 3-way interaction plot) -time_main_plot <- ggplot(time_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251007110320.r b/.history/eohi2/mixed anova - domain means_20251007110320.r deleted file mode 100644 index 3521624..0000000 --- a/.history/eohi2/mixed anova - domain means_20251007110320.r +++ /dev/null @@ -1,878 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. 
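-# Note: the 5.10* columns listed below begin with a digit, so read.csv() (with
-# its default check.names = TRUE) exposes them as X5.10past_pref_MEAN,
-# X5.10fut_pref_MEAN, etc.; required_vars below therefore uses the X-prefixed names.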
-# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN", - "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Domain mapping created:") -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean 
= round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN × INTERVAL combination
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN, INTERVAL) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME × INTERVAL combination
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME, INTERVAL) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:")
-print(homogeneity_domain)
-
-# Test homogeneity across INTERVAL within each TIME × DOMAIN combination
-homogeneity_interval <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1],
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1],
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:")
-print(homogeneity_interval)
-
-# 5. Hartley's F-max test for between-subjects factors
-print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===")
-
-# Check what values the between-subjects factors actually have
-print("Unique TEMPORAL_DO values:")
-print(unique(long_data_clean$TEMPORAL_DO))
-print("Unique INTERVAL_DO values:")
-print(unique(long_data_clean$INTERVAL_DO))
-
-# Helper to calculate Hartley's F-max ratio (largest cell variance divided by
-# the smallest; ratios near 1 indicate homogeneity, and values well above ~3
-# are commonly taken as a warning sign)
-calculate_hartley_ratio <- function(variances) {
-  max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE)
-}
-
-# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination
-print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===")
-
-observed_temporal_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN, INTERVAL) %>%
-  summarise(
-    # Variances for each TEMPORAL_DO level within this combination
-    past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE),
-    fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE),
-    # F-max ratio, via the helper defined above
-    f_max_ratio = calculate_hartley_ratio(c(past_var, fut_var)),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio)
-
-print(observed_temporal_ratios)
-
-# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination
-print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===")
-
-observed_interval_ratios <- long_data_clean %>%
-  group_by(TIME, DOMAIN, TEMPORAL_DO) %>%
-  summarise(
-    # Variances for each INTERVAL_DO level within this combination
-    int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE),
-    int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE),
-    # F-max ratio, via the helper defined above
-    f_max_ratio = calculate_hartley_ratio(c(int5_var, int10_var)),
-    .groups = 'drop'
-  ) %>%
-  select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio)
-
-print(observed_interval_ratios)
-
-# =============================================================================
-# MIXED ANOVA ANALYSIS
-# =============================================================================
-
-# Check data 
dimensions and structure
-print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows"))
-print(paste("Number of participants:", length(unique(long_data_clean$ResponseId))))
-print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (",
-            length(levels(long_data_clean$DOMAIN)), "), INTERVAL (",
-            length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (",
-            length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (",
-            length(levels(long_data_clean$INTERVAL_DO)), ")", sep = ""))
-
-# Check for complete cases
-complete_cases <- sum(complete.cases(long_data_clean))
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean)))
-
-# Check that the repeated-measures design is complete (exactly one observation
-# per participant per within-subjects cell)
-design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL)
-if(all(design_balance == 1)) {
-  print("Design is complete: each participant has exactly one observation for every TIME × DOMAIN × INTERVAL combination")
-} else {
-  print("Warning: Design is incomplete or unbalanced")
-  print(summary(as.vector(design_balance)))
-}
-
-# =============================================================================
-# MIXED ANOVA WITH SPHERICITY CORRECTIONS
-# =============================================================================
-
-print("\n=== MIXED ANOVA RESULTS (with sphericity corrections) ===")
-
-# Mixed ANOVA using ezANOVA with automatic sphericity corrections
-# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10)
-# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10)
-
-mixed_anova_model <- ezANOVA(data = long_data_clean,
-                             dv = MEAN_DIFFERENCE,
-                             wid = ResponseId,
-                             between = .(TEMPORAL_DO, INTERVAL_DO),
-                             within = .(TIME, DOMAIN, INTERVAL),
-                             type = 3,
-                             detailed = TRUE)
-
-print("ANOVA Results:")
-anova_output <- mixed_anova_model$ANOVA
-rownames(anova_output) <- NULL  # Reset row numbers to be sequential
-print(anova_output)
-
-# Show Mauchly's test for sphericity
-print("\nMauchly's Test of Sphericity:")
-print(mixed_anova_model$Mauchly)
-
-# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt)
-if(!is.null(mixed_anova_model$`Sphericity Corrections`)) {
-  print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:")
-  print(mixed_anova_model$`Sphericity Corrections`)
-
-  # Extract and display corrected degrees of freedom
-  print("\n=== CORRECTED DEGREES OF FREEDOM ===")
-
-  sphericity_corr <- mixed_anova_model$`Sphericity Corrections`
-  anova_table <- mixed_anova_model$ANOVA
-
-  corrected_df <- data.frame(
-    Effect = sphericity_corr$Effect,
-    Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)],
-    Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)],
-    GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe,
-    HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe,
-    GG_epsilon = sphericity_corr$GGe,
-    HF_epsilon = sphericity_corr$HFe
-  )
-
-  print(corrected_df)
-
-  print("\n=== CORRECTED F-TESTS ===")
-
-  # Between-subjects effects (no sphericity corrections needed)
-  print("\nBETWEEN-SUBJECTS EFFECTS:")
-  between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", 
"TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * 
INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO -print("\nMain Effect of INTERVAL_DO:") -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (if significant) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - 
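-      # Note: cohen.d() from effsize treats the two vectors as independent
-      # groups and pools their variances, d = (m1 - m2) / s_pooled. DOMAIN is a
-      # within-subjects factor, so the same participants contribute to both
-      # vectors and this d should be read as a descriptive effect size.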
- if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) -print("\n=== COHEN'S D FOR INTERVAL MAIN EFFECT ===") -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) -print("INTERVAL main effect contrasts:") -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - -print("\n=== INTERACTION EXPLORATIONS ===") - -# First, identify which interactions are significant -significant_interactions <- anova_output[anova_output$p < 0.05 & grepl(":", anova_output$Effect), ] - -if(nrow(significant_interactions) > 0) { - print("Significant interactions found:") - print(significant_interactions[, c("Effect", "p")]) - - # TIME × DOMAIN interaction (if significant) - if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { - print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") - time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) - print("Estimated Marginal Means:") - print(time_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TIME:") - time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") - print(time_domain_simple) - - print("\nSimple Effects of TIME within each DOMAIN:") - time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(time_domain_simple2) - } - - # TIME × INTERVAL interaction (if significant) - if("TIME:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL"] < 0.05) { - print("\n=== TIME × INTERVAL INTERACTION (SIGNIFICANT) ===") - time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) - print("Estimated Marginal Means:") - print(time_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TIME:") - 
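-    # With 3 INTERVAL levels there are 3 pairwise contrasts per TIME level, so
-    # Bonferroni multiplies each p-value by 3 within that family; in the
-    # reverse direction (TIME within INTERVAL) each family holds a single
-    # contrast, so the adjustment changes nothing there.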
time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") - print(time_interval_simple) - - print("\nSimple Effects of TIME within each INTERVAL:") - time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(time_interval_simple2) - } - - # DOMAIN × INTERVAL interaction (if significant) - if("DOMAIN:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "DOMAIN:INTERVAL"] < 0.05) { - print("\n=== DOMAIN × INTERVAL INTERACTION (SIGNIFICANT) ===") - domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) - print("Estimated Marginal Means:") - print(domain_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each DOMAIN:") - domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(domain_interval_simple) - - print("\nSimple Effects of DOMAIN within each INTERVAL:") - domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(domain_interval_simple2) - } - - # TEMPORAL_DO × TIME interaction (if significant) - if("TEMPORAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:TIME"] < 0.05) { - print("\n=== TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - print("Estimated Marginal Means:") - print(temporal_time_emmeans) - - print("\nSimple Effects of TIME within each TEMPORAL_DO:") - temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_time_simple) - - print("\nSimple Effects of TEMPORAL_DO within each TIME:") - temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") - print(temporal_time_simple2) - } - - # TEMPORAL_DO × DOMAIN interaction (if significant) - if("TEMPORAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:DOMAIN"] < 0.05) { - print("\n=== TEMPORAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(temporal_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") - temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_domain_simple) - - print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") - temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(temporal_domain_simple2) - } - - # TEMPORAL_DO × INTERVAL interaction (if significant) - if("TEMPORAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:INTERVAL"] < 0.05) { - print("\n=== TEMPORAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - temporal_interval_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(temporal_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TEMPORAL_DO:") - temporal_interval_simple <- pairs(temporal_interval_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_interval_simple) - - print("\nSimple Effects of TEMPORAL_DO within each INTERVAL:") - temporal_interval_simple2 <- pairs(temporal_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(temporal_interval_simple2) - } - - # INTERVAL_DO × TIME interaction (if significant) - if("INTERVAL_DO:TIME" %in% anova_output$Effect && 
anova_output$p[anova_output$Effect == "INTERVAL_DO:TIME"] < 0.05) { - print("\n=== INTERVAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - interval_do_time_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * TIME) - print("Estimated Marginal Means:") - print(interval_do_time_emmeans) - - print("\nSimple Effects of TIME within each INTERVAL_DO:") - interval_do_time_simple <- pairs(interval_do_time_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_time_simple) - - print("\nSimple Effects of INTERVAL_DO within each TIME:") - interval_do_time_simple2 <- pairs(interval_do_time_emmeans, by = "TIME", adjust = "bonferroni") - print(interval_do_time_simple2) - } - - # INTERVAL_DO × DOMAIN interaction (if significant) - if("INTERVAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:DOMAIN"] < 0.05) { - print("\n=== INTERVAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - interval_do_domain_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(interval_do_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each INTERVAL_DO:") - interval_do_domain_simple <- pairs(interval_do_domain_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_domain_simple) - - print("\nSimple Effects of INTERVAL_DO within each DOMAIN:") - interval_do_domain_simple2 <- pairs(interval_do_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(interval_do_domain_simple2) - } - - # INTERVAL_DO × INTERVAL interaction (if significant) - if("INTERVAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:INTERVAL"] < 0.05) { - print("\n=== INTERVAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - interval_do_interval_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(interval_do_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each INTERVAL_DO:") - interval_do_interval_simple <- pairs(interval_do_interval_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_interval_simple) - - print("\nSimple Effects of INTERVAL_DO within each INTERVAL:") - interval_do_interval_simple2 <- pairs(interval_do_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(interval_do_interval_simple2) - } - -} else { - print("No significant interactions found.") - print("All interaction p-values:") - interaction_effects <- anova_output[grepl(":", anova_output$Effect), ] - print(interaction_effects[, c("Effect", "p")]) -} - -print("\n=== ANALYSIS COMPLETE ===") -print("Mixed ANOVA analysis with three within-subjects factors (TIME, DOMAIN, INTERVAL)") -print("and two between-subjects factors (TEMPORAL_DO, INTERVAL_DO) completed.") -print("Check the results above for significant effects and perform additional") -print("interaction analyses as needed based on the significance patterns.") - -# ============================================================================= -# INTERACTION PLOT: TEMPORAL_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - -print("\n=== CREATING 3-WAY INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for TEMPORAL_DO × TIME × INTERVAL -emm_3way <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way <- emm_3way %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & 
!is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Emmeans + Error Bars) -# ============================================================================= - -print("\n=== CREATING MAIN-EFFECT PLOT FOR TIME ===") - -# Prepare emmeans data frame for TIME main effect -time_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with the 3-way interaction plot) -time_main_plot <- ggplot(time_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", 
fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251007110333.r b/.history/eohi2/mixed anova - domain means_20251007110333.r deleted file mode 100644 index 049cfb4..0000000 --- a/.history/eohi2/mixed anova - domain means_20251007110333.r +++ /dev/null @@ -1,876 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. -# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN", - "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - -print("Domain mapping created:") -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = 
as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME, DOMAIN, and INTERVAL:") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors:") -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by between-subjects factors only:") -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -print("Missing values by TIME, DOMAIN, and INTERVAL:") -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -print("Outlier summary (IQR method):") -print(outlier_summary) - -# 3. 
Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - -print("Anderson-Darling normality test results:") -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across TIME within each DOMAIN × INTERVAL combination:") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across DOMAIN within each TIME × INTERVAL combination:") -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - -print("Homogeneity of variance across INTERVAL within each TIME × DOMAIN combination:") -print(homogeneity_interval) - -# 5. 
Hartley's F-max test for between-subjects factors -print("\n=== HARTLEY'S F-MAX TEST FOR BETWEEN-SUBJECTS FACTORS ===") - -# Check what values the between-subjects factors actually have -print("Unique TEMPORAL_DO values:") -print(unique(long_data_clean$TEMPORAL_DO)) -print("Unique INTERVAL_DO values:") -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination -print("\n=== HARTLEY'S F-MAX TEST: TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination ===") - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination -print("\n=== HARTLEY'S F-MAX TEST: INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination ===") - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - -print("\n=== MIXED ANOVA RESULTS (with 
sphericity corrections) ===") - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - -print("ANOVA Results:") -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity -print("\nMauchly's Test of Sphericity:") -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - 
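# --- Editor's sketch (not in the original script) ---------------------------
# The p-value extracted just above is the uncorrected one. Assuming DOMAIN
# appears in the ezANOVA sphericity table, a Greenhouse-Geisser-corrected
# p-value can be recomputed directly from epsilon with pf(); `gg_eps_domain`
# and `p_gg_domain` are hypothetical names introduced here for illustration.
gg_eps_domain <- sphericity_corr$GGe[sphericity_corr$Effect == "DOMAIN"]
if (length(gg_eps_domain) == 1) {
  p_gg_domain <- pf(f_value, dfn * gg_eps_domain, dfd * gg_eps_domain,
                    lower.tail = FALSE)
  print(sprintf("DOMAIN (GG-corrected): p = %.6f", p_gg_domain))
}
# -----------------------------------------------------------------------------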
print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - -print("\n=== EFFECT SIZES (GENERALIZED ETA SQUARED) ===") - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) -print("Generalized Eta Squared:") -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans -print("\n=== POST-HOC COMPARISONS ===") - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME -print("Main Effect of TIME:") -time_emmeans <- emmeans(aov_model, ~ TIME) -print("Estimated Marginal Means:") -print(time_emmeans) -print("\nPairwise Contrasts:") -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN -print("\nMain Effect of DOMAIN:") -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) -print("Estimated Marginal Means:") -print(domain_emmeans) -print("\nPairwise Contrasts:") -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL -print("\nMain Effect of INTERVAL:") -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) -print("Estimated Marginal Means:") -print(interval_emmeans) -print("\nPairwise Contrasts:") -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO -print("\nMain Effect of TEMPORAL_DO:") -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO -print("\nMain Effect of INTERVAL_DO:") -interval_do_emmeans <- 
emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - -print("\n=== COHEN'S D FOR MAIN EFFECTS ===") - -# Main Effect of TIME (if significant) -print("\n=== COHEN'S D FOR TIME MAIN EFFECT ===") -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) -print("TIME main effect contrast:") -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) -print("\n=== COHEN'S D FOR DOMAIN MAIN EFFECT ===") -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) -print("DOMAIN main effect contrasts:") -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) -print("\n=== COHEN'S D FOR INTERVAL MAIN EFFECT ===") -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) -print("INTERVAL main effect contrasts:") -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- 
long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - -print("\n=== INTERACTION EXPLORATIONS ===") - -# First, identify which interactions are significant -significant_interactions <- anova_output[anova_output$p < 0.05 & grepl(":", anova_output$Effect), ] - -if(nrow(significant_interactions) > 0) { - print("Significant interactions found:") - print(significant_interactions[, c("Effect", "p")]) - - # TIME × DOMAIN interaction (if significant) - if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { - print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") - time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) - print("Estimated Marginal Means:") - print(time_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TIME:") - time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") - print(time_domain_simple) - - print("\nSimple Effects of TIME within each DOMAIN:") - time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(time_domain_simple2) - } - - # TIME × INTERVAL interaction (if significant) - if("TIME:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL"] < 0.05) { - print("\n=== TIME × INTERVAL INTERACTION (SIGNIFICANT) ===") - time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) - print("Estimated Marginal Means:") - print(time_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TIME:") - time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") - print(time_interval_simple) - - print("\nSimple Effects of TIME within each INTERVAL:") - time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(time_interval_simple2) - } - - # DOMAIN × INTERVAL interaction (if significant) - if("DOMAIN:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "DOMAIN:INTERVAL"] < 0.05) { - print("\n=== DOMAIN × INTERVAL INTERACTION (SIGNIFICANT) ===") - domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) - print("Estimated Marginal Means:") - print(domain_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each DOMAIN:") - domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(domain_interval_simple) - - print("\nSimple Effects of DOMAIN within each INTERVAL:") - domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(domain_interval_simple2) - } - - # TEMPORAL_DO × TIME interaction (if significant) - if("TEMPORAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:TIME"] < 0.05) { - print("\n=== TEMPORAL_DO 
× TIME INTERACTION (SIGNIFICANT) ===") - temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - print("Estimated Marginal Means:") - print(temporal_time_emmeans) - - print("\nSimple Effects of TIME within each TEMPORAL_DO:") - temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_time_simple) - - print("\nSimple Effects of TEMPORAL_DO within each TIME:") - temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") - print(temporal_time_simple2) - } - - # TEMPORAL_DO × DOMAIN interaction (if significant) - if("TEMPORAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:DOMAIN"] < 0.05) { - print("\n=== TEMPORAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(temporal_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") - temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_domain_simple) - - print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") - temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(temporal_domain_simple2) - } - - # TEMPORAL_DO × INTERVAL interaction (if significant) - if("TEMPORAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:INTERVAL"] < 0.05) { - print("\n=== TEMPORAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - temporal_interval_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(temporal_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TEMPORAL_DO:") - temporal_interval_simple <- pairs(temporal_interval_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_interval_simple) - - print("\nSimple Effects of TEMPORAL_DO within each INTERVAL:") - temporal_interval_simple2 <- pairs(temporal_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(temporal_interval_simple2) - } - - # INTERVAL_DO × TIME interaction (if significant) - if("INTERVAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:TIME"] < 0.05) { - print("\n=== INTERVAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - interval_do_time_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * TIME) - print("Estimated Marginal Means:") - print(interval_do_time_emmeans) - - print("\nSimple Effects of TIME within each INTERVAL_DO:") - interval_do_time_simple <- pairs(interval_do_time_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_time_simple) - - print("\nSimple Effects of INTERVAL_DO within each TIME:") - interval_do_time_simple2 <- pairs(interval_do_time_emmeans, by = "TIME", adjust = "bonferroni") - print(interval_do_time_simple2) - } - - # INTERVAL_DO × DOMAIN interaction (if significant) - if("INTERVAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:DOMAIN"] < 0.05) { - print("\n=== INTERVAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - interval_do_domain_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(interval_do_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each INTERVAL_DO:") - interval_do_domain_simple <- pairs(interval_do_domain_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_domain_simple) - - 
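# --- Editor's sketch (not in the original script) ---------------------------
# A complementary view to the simple-effects breakdown printed above:
# interaction contrasts test whether the pairwise DOMAIN differences
# themselves change across INTERVAL_DO levels. `interval_do_domain_ixn` is a
# hypothetical name; contrast() with the `interaction` argument is standard
# emmeans usage.
interval_do_domain_ixn <- contrast(interval_do_domain_emmeans,
                                   interaction = c("pairwise", "pairwise"))
print(interval_do_domain_ixn)
# -----------------------------------------------------------------------------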
print("\nSimple Effects of INTERVAL_DO within each DOMAIN:") - interval_do_domain_simple2 <- pairs(interval_do_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(interval_do_domain_simple2) - } - - # INTERVAL_DO × INTERVAL interaction (if significant) - if("INTERVAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:INTERVAL"] < 0.05) { - print("\n=== INTERVAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - interval_do_interval_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(interval_do_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each INTERVAL_DO:") - interval_do_interval_simple <- pairs(interval_do_interval_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_interval_simple) - - print("\nSimple Effects of INTERVAL_DO within each INTERVAL:") - interval_do_interval_simple2 <- pairs(interval_do_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(interval_do_interval_simple2) - } - -} else { - print("No significant interactions found.") - print("All interaction p-values:") - interaction_effects <- anova_output[grepl(":", anova_output$Effect), ] - print(interaction_effects[, c("Effect", "p")]) -} - -print("\n=== ANALYSIS COMPLETE ===") -print("Mixed ANOVA analysis with three within-subjects factors (TIME, DOMAIN, INTERVAL)") -print("and two between-subjects factors (TEMPORAL_DO, INTERVAL_DO) completed.") -print("Check the results above for significant effects and perform additional") -print("interaction analyses as needed based on the significance patterns.") - -# ============================================================================= -# INTERACTION PLOT: TEMPORAL_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - -print("\n=== CREATING 3-WAY INTERACTION PLOT ===") - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for TEMPORAL_DO × TIME × INTERVAL -emm_3way <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way <- emm_3way %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - 
theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Emmeans + Error Bars) -# ============================================================================= - -# Prepare emmeans data frame for TIME main effect -time_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with the 3-way interaction plot) -time_main_plot <- ggplot(time_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251007110641.r b/.history/eohi2/mixed anova - domain means_20251007110641.r deleted file mode 100644 index 059ac51..0000000 --- a/.history/eohi2/mixed anova - domain means_20251007110641.r +++ /dev/null @@ -1,872 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. 
-# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN", - "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - - -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, 
na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - - -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - - -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - - -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors - - -# Check what values the between-subjects factors actually have - -print(unique(long_data_clean$TEMPORAL_DO)) - -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination - - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination - - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) 
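# --- Editor's sketch (not in the original script) ---------------------------
# An equivalent per-participant completeness check: in the fully crossed
# 2 (TIME) x 3 (DOMAIN) x 3 (INTERVAL) within-subjects design, every
# participant should contribute 18 rows. `obs_per_id` is a hypothetical name.
obs_per_id <- long_data_clean %>%
  dplyr::count(ResponseId, name = "n_obs")
print(summary(obs_per_id$n_obs))  # min and max should both equal 18
# -----------------------------------------------------------------------------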
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - - - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity - -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect 
== "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - - - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) - -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans - - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME - -time_emmeans <- emmeans(aov_model, ~ TIME) - -print(time_emmeans) - -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN - -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) - -print(domain_emmeans) - -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL - -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) - -print(interval_emmeans) - -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO - -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") 
-print(temporal_contrasts) - -# Main effect of INTERVAL_DO - -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - - - -# Main Effect of TIME (if significant) - -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) - -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) - -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) - -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) - -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) - -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - 
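# --- Editor's sketch (not in the original script) ---------------------------
# effsize::cohen.d can also return the small-sample-corrected Hedges' g,
# which is sometimes preferred when group sizes differ; computed here per
# contrast alongside d. `interval_hedges_g` is a hypothetical name.
interval_hedges_g <- cohen.d(data1, data2, hedges.correction = TRUE)
# -----------------------------------------------------------------------------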
print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - - - -# First, identify which interactions are significant -significant_interactions <- anova_output[anova_output$p < 0.05 & grepl(":", anova_output$Effect), ] - -if(nrow(significant_interactions) > 0) { - print("Significant interactions found:") - print(significant_interactions[, c("Effect", "p")]) - - # TIME × DOMAIN interaction (if significant) - if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { - print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") - time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) - print("Estimated Marginal Means:") - print(time_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TIME:") - time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") - print(time_domain_simple) - - print("\nSimple Effects of TIME within each DOMAIN:") - time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(time_domain_simple2) - } - - # TIME × INTERVAL interaction (if significant) - if("TIME:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL"] < 0.05) { - print("\n=== TIME × INTERVAL INTERACTION (SIGNIFICANT) ===") - time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) - print("Estimated Marginal Means:") - print(time_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TIME:") - time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") - print(time_interval_simple) - - print("\nSimple Effects of TIME within each INTERVAL:") - time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(time_interval_simple2) - } - - # DOMAIN × INTERVAL interaction (if significant) - if("DOMAIN:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "DOMAIN:INTERVAL"] < 0.05) { - print("\n=== DOMAIN × INTERVAL INTERACTION (SIGNIFICANT) ===") - domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) - print("Estimated Marginal Means:") - print(domain_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each DOMAIN:") - domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(domain_interval_simple) - - print("\nSimple Effects of DOMAIN within each INTERVAL:") - domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(domain_interval_simple2) - } - - # TEMPORAL_DO × TIME interaction (if significant) - if("TEMPORAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:TIME"] < 0.05) { - print("\n=== TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - print("Estimated Marginal Means:") - print(temporal_time_emmeans) - - print("\nSimple Effects of TIME within each TEMPORAL_DO:") - temporal_time_simple <- 
pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_time_simple) - - print("\nSimple Effects of TEMPORAL_DO within each TIME:") - temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") - print(temporal_time_simple2) - } - - # TEMPORAL_DO × DOMAIN interaction (if significant) - if("TEMPORAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:DOMAIN"] < 0.05) { - print("\n=== TEMPORAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(temporal_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") - temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_domain_simple) - - print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") - temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(temporal_domain_simple2) - } - - # TEMPORAL_DO × INTERVAL interaction (if significant) - if("TEMPORAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:INTERVAL"] < 0.05) { - print("\n=== TEMPORAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - temporal_interval_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(temporal_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TEMPORAL_DO:") - temporal_interval_simple <- pairs(temporal_interval_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_interval_simple) - - print("\nSimple Effects of TEMPORAL_DO within each INTERVAL:") - temporal_interval_simple2 <- pairs(temporal_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(temporal_interval_simple2) - } - - # INTERVAL_DO × TIME interaction (if significant) - if("INTERVAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:TIME"] < 0.05) { - print("\n=== INTERVAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - interval_do_time_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * TIME) - print("Estimated Marginal Means:") - print(interval_do_time_emmeans) - - print("\nSimple Effects of TIME within each INTERVAL_DO:") - interval_do_time_simple <- pairs(interval_do_time_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_time_simple) - - print("\nSimple Effects of INTERVAL_DO within each TIME:") - interval_do_time_simple2 <- pairs(interval_do_time_emmeans, by = "TIME", adjust = "bonferroni") - print(interval_do_time_simple2) - } - - # INTERVAL_DO × DOMAIN interaction (if significant) - if("INTERVAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:DOMAIN"] < 0.05) { - print("\n=== INTERVAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - interval_do_domain_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(interval_do_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each INTERVAL_DO:") - interval_do_domain_simple <- pairs(interval_do_domain_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_domain_simple) - - print("\nSimple Effects of INTERVAL_DO within each DOMAIN:") - interval_do_domain_simple2 <- pairs(interval_do_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(interval_do_domain_simple2) - } - - # INTERVAL_DO × INTERVAL interaction (if significant) 
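# -----------------------------------------------------------------------------
# Editor's note (illustrative sketch, not part of the original script): the
# per-interaction if-blocks above and below are near-identical copy-paste. The
# same follow-ups can be driven directly from the significant rows of
# anova_output. Wrapped in if (FALSE) so it does not duplicate the output of
# the original blocks; parsing two-way "A:B" effect names is our assumption.
if (FALSE) {
  for (eff in significant_interactions$Effect) {
    fs <- strsplit(as.character(eff), ":", fixed = TRUE)[[1]]
    if (length(fs) != 2) next  # this sketch handles two-way interactions only
    emm <- emmeans(aov_model, specs = as.formula(paste("~", fs[1], "*", fs[2])))
    print(emm)
    print(pairs(emm, by = fs[1], adjust = "bonferroni"))  # simple effects of fs[2]
    print(pairs(emm, by = fs[2], adjust = "bonferroni"))  # simple effects of fs[1]
  }
}
# (the original per-interaction blocks continue below)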
- if("INTERVAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:INTERVAL"] < 0.05) { - print("\n=== INTERVAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - interval_do_interval_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(interval_do_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each INTERVAL_DO:") - interval_do_interval_simple <- pairs(interval_do_interval_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_interval_simple) - - print("\nSimple Effects of INTERVAL_DO within each INTERVAL:") - interval_do_interval_simple2 <- pairs(interval_do_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(interval_do_interval_simple2) - } - -} else { - print("No significant interactions found.") - print("All interaction p-values:") - interaction_effects <- anova_output[grepl(":", anova_output$Effect), ] - print(interaction_effects[, c("Effect", "p")]) -} - - - -# ============================================================================= -# INTERACTION PLOT: TEMPORAL_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - - - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for TEMPORAL_DO × TIME × INTERVAL -emm_3way <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way <- emm_3way %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Emmeans + Error Bars) -# 
============================================================================= - -# Prepare emmeans data frame for TIME main effect -time_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with the 3-way interaction plot) -time_main_plot <- ggplot(time_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251007110654.r b/.history/eohi2/mixed anova - domain means_20251007110654.r deleted file mode 100644 index 059ac51..0000000 --- a/.history/eohi2/mixed anova - domain means_20251007110654.r +++ /dev/null @@ -1,872 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. 
-# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN", - "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - - -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, 
na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - - -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - - -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - - -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors - - -# Check what values the between-subjects factors actually have - -print(unique(long_data_clean$TEMPORAL_DO)) - -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination - - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination - - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) 
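# -----------------------------------------------------------------------------
# Editor's notes on the assumption checks above (sketch, not original code):
# (1) long_data was already filtered with !is.na(MEAN_DIFFERENCE) when it was
#     built, so the missing_summary table earlier can only ever report 0
#     missing; to audit real missingness, run that summary on the pivoted data
#     before the filter step.
# (2) The Hartley F-max ratios are printed but never screened. A minimal flag,
#     assuming the common rule of thumb that ratios below about 10 are
#     acceptable with roughly equal group sizes (the cutoff is our assumption,
#     not from the original analysis):
fmax_cutoff <- 10
flagged_cells <- bind_rows(
  observed_temporal_ratios %>% mutate(screen = "TEMPORAL_DO"),
  observed_interval_ratios %>% mutate(screen = "INTERVAL_DO")
) %>%
  filter(f_max_ratio > fmax_cutoff)
if (nrow(flagged_cells) > 0) {
  print("Cells exceeding the assumed F-max cutoff (possible heterogeneity):")
  print(flagged_cells)
} else {
  print(sprintf("All F-max ratios <= %d; group variances look acceptable.", fmax_cutoff))
}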
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - - - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity - -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect 
== "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - - - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) - -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans - - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME - -time_emmeans <- emmeans(aov_model, ~ TIME) - -print(time_emmeans) - -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN - -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) - -print(domain_emmeans) - -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL - -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) - -print(interval_emmeans) - -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO - -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") 
-print(temporal_contrasts) - -# Main effect of INTERVAL_DO - -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - - - -# Main Effect of TIME (if significant) - -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) - -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) - -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) - -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) - -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) - -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - 
print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - - - -# First, identify which interactions are significant -significant_interactions <- anova_output[anova_output$p < 0.05 & grepl(":", anova_output$Effect), ] - -if(nrow(significant_interactions) > 0) { - print("Significant interactions found:") - print(significant_interactions[, c("Effect", "p")]) - - # TIME × DOMAIN interaction (if significant) - if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { - print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") - time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) - print("Estimated Marginal Means:") - print(time_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TIME:") - time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") - print(time_domain_simple) - - print("\nSimple Effects of TIME within each DOMAIN:") - time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(time_domain_simple2) - } - - # TIME × INTERVAL interaction (if significant) - if("TIME:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL"] < 0.05) { - print("\n=== TIME × INTERVAL INTERACTION (SIGNIFICANT) ===") - time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) - print("Estimated Marginal Means:") - print(time_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TIME:") - time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") - print(time_interval_simple) - - print("\nSimple Effects of TIME within each INTERVAL:") - time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(time_interval_simple2) - } - - # DOMAIN × INTERVAL interaction (if significant) - if("DOMAIN:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "DOMAIN:INTERVAL"] < 0.05) { - print("\n=== DOMAIN × INTERVAL INTERACTION (SIGNIFICANT) ===") - domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) - print("Estimated Marginal Means:") - print(domain_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each DOMAIN:") - domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(domain_interval_simple) - - print("\nSimple Effects of DOMAIN within each INTERVAL:") - domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(domain_interval_simple2) - } - - # TEMPORAL_DO × TIME interaction (if significant) - if("TEMPORAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:TIME"] < 0.05) { - print("\n=== TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - print("Estimated Marginal Means:") - print(temporal_time_emmeans) - - print("\nSimple Effects of TIME within each TEMPORAL_DO:") - temporal_time_simple <- 
pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_time_simple) - - print("\nSimple Effects of TEMPORAL_DO within each TIME:") - temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") - print(temporal_time_simple2) - } - - # TEMPORAL_DO × DOMAIN interaction (if significant) - if("TEMPORAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:DOMAIN"] < 0.05) { - print("\n=== TEMPORAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(temporal_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") - temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_domain_simple) - - print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") - temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(temporal_domain_simple2) - } - - # TEMPORAL_DO × INTERVAL interaction (if significant) - if("TEMPORAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:INTERVAL"] < 0.05) { - print("\n=== TEMPORAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - temporal_interval_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(temporal_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TEMPORAL_DO:") - temporal_interval_simple <- pairs(temporal_interval_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_interval_simple) - - print("\nSimple Effects of TEMPORAL_DO within each INTERVAL:") - temporal_interval_simple2 <- pairs(temporal_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(temporal_interval_simple2) - } - - # INTERVAL_DO × TIME interaction (if significant) - if("INTERVAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:TIME"] < 0.05) { - print("\n=== INTERVAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - interval_do_time_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * TIME) - print("Estimated Marginal Means:") - print(interval_do_time_emmeans) - - print("\nSimple Effects of TIME within each INTERVAL_DO:") - interval_do_time_simple <- pairs(interval_do_time_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_time_simple) - - print("\nSimple Effects of INTERVAL_DO within each TIME:") - interval_do_time_simple2 <- pairs(interval_do_time_emmeans, by = "TIME", adjust = "bonferroni") - print(interval_do_time_simple2) - } - - # INTERVAL_DO × DOMAIN interaction (if significant) - if("INTERVAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:DOMAIN"] < 0.05) { - print("\n=== INTERVAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - interval_do_domain_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(interval_do_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each INTERVAL_DO:") - interval_do_domain_simple <- pairs(interval_do_domain_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_domain_simple) - - print("\nSimple Effects of INTERVAL_DO within each DOMAIN:") - interval_do_domain_simple2 <- pairs(interval_do_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(interval_do_domain_simple2) - } - - # INTERVAL_DO × INTERVAL interaction (if significant) 
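# -----------------------------------------------------------------------------
# Editor's note (sketch, not part of the original script): the effectsize
# package is loaded at the top of this script but never used. It can supply
# partial eta squared with confidence intervals for all model effects in one
# call, complementing the ges column reported by ezANOVA:
print(effectsize::eta_squared(aov_model, partial = TRUE))
# (the original INTERVAL_DO × INTERVAL block continues below)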
- if("INTERVAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:INTERVAL"] < 0.05) { - print("\n=== INTERVAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - interval_do_interval_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(interval_do_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each INTERVAL_DO:") - interval_do_interval_simple <- pairs(interval_do_interval_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_interval_simple) - - print("\nSimple Effects of INTERVAL_DO within each INTERVAL:") - interval_do_interval_simple2 <- pairs(interval_do_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(interval_do_interval_simple2) - } - -} else { - print("No significant interactions found.") - print("All interaction p-values:") - interaction_effects <- anova_output[grepl(":", anova_output$Effect), ] - print(interaction_effects[, c("Effect", "p")]) -} - - - -# ============================================================================= -# INTERACTION PLOT: TEMPORAL_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - - - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for TEMPORAL_DO × TIME × INTERVAL -emm_3way <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way <- emm_3way %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Emmeans + Error Bars) -# 
============================================================================= - -# Prepare emmeans data frame for TIME main effect -time_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with the 3-way interaction plot) -time_main_plot <- ggplot(time_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251007111158.r b/.history/eohi2/mixed anova - domain means_20251007111158.r deleted file mode 100644 index 059ac51..0000000 --- a/.history/eohi2/mixed anova - domain means_20251007111158.r +++ /dev/null @@ -1,872 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. 
-# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN", - "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - - -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, 
na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - - -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - - -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - - -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors - - -# Check what values the between-subjects factors actually have - -print(unique(long_data_clean$TEMPORAL_DO)) - -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination - - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination - - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) 
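# -----------------------------------------------------------------------------
# Editor's note (sketch, not part of the original script): afex is loaded
# above "for aov_ez" but never called. The ezANOVA fit in the next section can
# be cross-checked with a single aov_ez() call, which reports
# Greenhouse-Geisser-corrected tests by default; a minimal sketch on the same
# long-format data:
afex_check <- afex::aov_ez(
  id = "ResponseId",
  dv = "MEAN_DIFFERENCE",
  data = long_data_clean,
  between = c("TEMPORAL_DO", "INTERVAL_DO"),
  within = c("TIME", "DOMAIN", "INTERVAL"),
  type = 3
)
print(afex_check)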
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - - - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity - -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect 
== "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - - - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) - -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans - - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME - -time_emmeans <- emmeans(aov_model, ~ TIME) - -print(time_emmeans) - -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN - -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) - -print(domain_emmeans) - -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL - -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) - -print(interval_emmeans) - -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO - -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") 
-print(temporal_contrasts) - -# Main effect of INTERVAL_DO - -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - - - -# Main Effect of TIME (if significant) - -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) - -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) - -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) - -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) - -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) - -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - 
print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - - - -# First, identify which interactions are significant -significant_interactions <- anova_output[anova_output$p < 0.05 & grepl(":", anova_output$Effect), ] - -if(nrow(significant_interactions) > 0) { - print("Significant interactions found:") - print(significant_interactions[, c("Effect", "p")]) - - # TIME × DOMAIN interaction (if significant) - if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { - print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") - time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) - print("Estimated Marginal Means:") - print(time_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TIME:") - time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") - print(time_domain_simple) - - print("\nSimple Effects of TIME within each DOMAIN:") - time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(time_domain_simple2) - } - - # TIME × INTERVAL interaction (if significant) - if("TIME:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL"] < 0.05) { - print("\n=== TIME × INTERVAL INTERACTION (SIGNIFICANT) ===") - time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) - print("Estimated Marginal Means:") - print(time_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TIME:") - time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") - print(time_interval_simple) - - print("\nSimple Effects of TIME within each INTERVAL:") - time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(time_interval_simple2) - } - - # DOMAIN × INTERVAL interaction (if significant) - if("DOMAIN:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "DOMAIN:INTERVAL"] < 0.05) { - print("\n=== DOMAIN × INTERVAL INTERACTION (SIGNIFICANT) ===") - domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) - print("Estimated Marginal Means:") - print(domain_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each DOMAIN:") - domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(domain_interval_simple) - - print("\nSimple Effects of DOMAIN within each INTERVAL:") - domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(domain_interval_simple2) - } - - # TEMPORAL_DO × TIME interaction (if significant) - if("TEMPORAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:TIME"] < 0.05) { - print("\n=== TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - print("Estimated Marginal Means:") - print(temporal_time_emmeans) - - print("\nSimple Effects of TIME within each TEMPORAL_DO:") - temporal_time_simple <- 
pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_time_simple) - - print("\nSimple Effects of TEMPORAL_DO within each TIME:") - temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") - print(temporal_time_simple2) - } - - # TEMPORAL_DO × DOMAIN interaction (if significant) - if("TEMPORAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:DOMAIN"] < 0.05) { - print("\n=== TEMPORAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(temporal_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") - temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_domain_simple) - - print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") - temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(temporal_domain_simple2) - } - - # TEMPORAL_DO × INTERVAL interaction (if significant) - if("TEMPORAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:INTERVAL"] < 0.05) { - print("\n=== TEMPORAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - temporal_interval_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(temporal_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TEMPORAL_DO:") - temporal_interval_simple <- pairs(temporal_interval_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_interval_simple) - - print("\nSimple Effects of TEMPORAL_DO within each INTERVAL:") - temporal_interval_simple2 <- pairs(temporal_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(temporal_interval_simple2) - } - - # INTERVAL_DO × TIME interaction (if significant) - if("INTERVAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:TIME"] < 0.05) { - print("\n=== INTERVAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - interval_do_time_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * TIME) - print("Estimated Marginal Means:") - print(interval_do_time_emmeans) - - print("\nSimple Effects of TIME within each INTERVAL_DO:") - interval_do_time_simple <- pairs(interval_do_time_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_time_simple) - - print("\nSimple Effects of INTERVAL_DO within each TIME:") - interval_do_time_simple2 <- pairs(interval_do_time_emmeans, by = "TIME", adjust = "bonferroni") - print(interval_do_time_simple2) - } - - # INTERVAL_DO × DOMAIN interaction (if significant) - if("INTERVAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:DOMAIN"] < 0.05) { - print("\n=== INTERVAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - interval_do_domain_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(interval_do_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each INTERVAL_DO:") - interval_do_domain_simple <- pairs(interval_do_domain_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_domain_simple) - - print("\nSimple Effects of INTERVAL_DO within each DOMAIN:") - interval_do_domain_simple2 <- pairs(interval_do_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(interval_do_domain_simple2) - } - - # INTERVAL_DO × INTERVAL interaction (if significant) 
- if("INTERVAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:INTERVAL"] < 0.05) { - print("\n=== INTERVAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - interval_do_interval_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(interval_do_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each INTERVAL_DO:") - interval_do_interval_simple <- pairs(interval_do_interval_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_interval_simple) - - print("\nSimple Effects of INTERVAL_DO within each INTERVAL:") - interval_do_interval_simple2 <- pairs(interval_do_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(interval_do_interval_simple2) - } - -} else { - print("No significant interactions found.") - print("All interaction p-values:") - interaction_effects <- anova_output[grepl(":", anova_output$Effect), ] - print(interaction_effects[, c("Effect", "p")]) -} - - - -# ============================================================================= -# INTERACTION PLOT: TEMPORAL_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - - - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for TEMPORAL_DO × TIME × INTERVAL -emm_3way <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way <- emm_3way %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Emmeans + Error Bars) -# 
============================================================================= - -# Prepare emmeans data frame for TIME main effect -time_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with the 3-way interaction plot) -time_main_plot <- ggplot(time_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251007155225.r b/.history/eohi2/mixed anova - domain means_20251007155225.r deleted file mode 100644 index 059ac51..0000000 --- a/.history/eohi2/mixed anova - domain means_20251007155225.r +++ /dev/null @@ -1,872 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. 
-# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN", - "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - - -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, 
na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - - -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - - -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - - -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors - - -# Check what values the between-subjects factors actually have - -print(unique(long_data_clean$TEMPORAL_DO)) - -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination - - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination - - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) 
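# ezANOVA requires a complete within-subject design: any participant missing a
# TIME x DOMAIN x INTERVAL cell is flagged (warning or error) by the call
# below, which is why the complete-case count above matters. A minimal
# alternative sketch, assuming the current afex::aov_ez API (the header loads
# afex "for aov_ez" but the script never calls it):
#
# aov_ez(id = "ResponseId", dv = "MEAN_DIFFERENCE", data = long_data_clean,
#        between = c("TEMPORAL_DO", "INTERVAL_DO"),
#        within = c("TIME", "DOMAIN", "INTERVAL"), type = 3)
#
# afex corrects within-subject effects with Greenhouse-Geisser by default, so
# its output would correspond to the GG-corrected ezANOVA results reported below.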
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - - - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity - -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect 
== "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - - - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) - -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans - - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME - -time_emmeans <- emmeans(aov_model, ~ TIME) - -print(time_emmeans) - -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN - -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) - -print(domain_emmeans) - -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL - -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) - -print(interval_emmeans) - -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO - -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") 
-print(temporal_contrasts) - -# Main effect of INTERVAL_DO - -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - - - -# Main Effect of TIME (if significant) - -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) - -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) - -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) - -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) - -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) - -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - 
print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - - - -# First, identify which interactions are significant -significant_interactions <- anova_output[anova_output$p < 0.05 & grepl(":", anova_output$Effect), ] - -if(nrow(significant_interactions) > 0) { - print("Significant interactions found:") - print(significant_interactions[, c("Effect", "p")]) - - # TIME × DOMAIN interaction (if significant) - if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { - print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") - time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) - print("Estimated Marginal Means:") - print(time_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TIME:") - time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") - print(time_domain_simple) - - print("\nSimple Effects of TIME within each DOMAIN:") - time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(time_domain_simple2) - } - - # TIME × INTERVAL interaction (if significant) - if("TIME:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL"] < 0.05) { - print("\n=== TIME × INTERVAL INTERACTION (SIGNIFICANT) ===") - time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) - print("Estimated Marginal Means:") - print(time_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TIME:") - time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") - print(time_interval_simple) - - print("\nSimple Effects of TIME within each INTERVAL:") - time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(time_interval_simple2) - } - - # DOMAIN × INTERVAL interaction (if significant) - if("DOMAIN:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "DOMAIN:INTERVAL"] < 0.05) { - print("\n=== DOMAIN × INTERVAL INTERACTION (SIGNIFICANT) ===") - domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) - print("Estimated Marginal Means:") - print(domain_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each DOMAIN:") - domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(domain_interval_simple) - - print("\nSimple Effects of DOMAIN within each INTERVAL:") - domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(domain_interval_simple2) - } - - # TEMPORAL_DO × TIME interaction (if significant) - if("TEMPORAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:TIME"] < 0.05) { - print("\n=== TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - print("Estimated Marginal Means:") - print(temporal_time_emmeans) - - print("\nSimple Effects of TIME within each TEMPORAL_DO:") - temporal_time_simple <- 
pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_time_simple) - - print("\nSimple Effects of TEMPORAL_DO within each TIME:") - temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") - print(temporal_time_simple2) - } - - # TEMPORAL_DO × DOMAIN interaction (if significant) - if("TEMPORAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:DOMAIN"] < 0.05) { - print("\n=== TEMPORAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(temporal_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") - temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_domain_simple) - - print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") - temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(temporal_domain_simple2) - } - - # TEMPORAL_DO × INTERVAL interaction (if significant) - if("TEMPORAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:INTERVAL"] < 0.05) { - print("\n=== TEMPORAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - temporal_interval_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(temporal_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TEMPORAL_DO:") - temporal_interval_simple <- pairs(temporal_interval_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_interval_simple) - - print("\nSimple Effects of TEMPORAL_DO within each INTERVAL:") - temporal_interval_simple2 <- pairs(temporal_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(temporal_interval_simple2) - } - - # INTERVAL_DO × TIME interaction (if significant) - if("INTERVAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:TIME"] < 0.05) { - print("\n=== INTERVAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - interval_do_time_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * TIME) - print("Estimated Marginal Means:") - print(interval_do_time_emmeans) - - print("\nSimple Effects of TIME within each INTERVAL_DO:") - interval_do_time_simple <- pairs(interval_do_time_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_time_simple) - - print("\nSimple Effects of INTERVAL_DO within each TIME:") - interval_do_time_simple2 <- pairs(interval_do_time_emmeans, by = "TIME", adjust = "bonferroni") - print(interval_do_time_simple2) - } - - # INTERVAL_DO × DOMAIN interaction (if significant) - if("INTERVAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:DOMAIN"] < 0.05) { - print("\n=== INTERVAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - interval_do_domain_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(interval_do_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each INTERVAL_DO:") - interval_do_domain_simple <- pairs(interval_do_domain_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_domain_simple) - - print("\nSimple Effects of INTERVAL_DO within each DOMAIN:") - interval_do_domain_simple2 <- pairs(interval_do_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(interval_do_domain_simple2) - } - - # INTERVAL_DO × INTERVAL interaction (if significant) 
- if("INTERVAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:INTERVAL"] < 0.05) { - print("\n=== INTERVAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - interval_do_interval_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(interval_do_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each INTERVAL_DO:") - interval_do_interval_simple <- pairs(interval_do_interval_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_interval_simple) - - print("\nSimple Effects of INTERVAL_DO within each INTERVAL:") - interval_do_interval_simple2 <- pairs(interval_do_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(interval_do_interval_simple2) - } - -} else { - print("No significant interactions found.") - print("All interaction p-values:") - interaction_effects <- anova_output[grepl(":", anova_output$Effect), ] - print(interaction_effects[, c("Effect", "p")]) -} - - - -# ============================================================================= -# INTERACTION PLOT: TEMPORAL_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - - - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for TEMPORAL_DO × TIME × INTERVAL -emm_3way <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way <- emm_3way %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Emmeans + Error Bars) -# 
============================================================================= - -# Prepare emmeans data frame for TIME main effect -time_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with the 3-way interaction plot) -time_main_plot <- ggplot(time_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251007184341.r b/.history/eohi2/mixed anova - domain means_20251007184341.r deleted file mode 100644 index 059ac51..0000000 --- a/.history/eohi2/mixed anova - domain means_20251007184341.r +++ /dev/null @@ -1,872 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. 
-# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN", - "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - - -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, 
na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - - -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - - -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - - -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors - - -# Check what values the between-subjects factors actually have - -print(unique(long_data_clean$TEMPORAL_DO)) - -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination - - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination - - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) 
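# ezANOVA requires a complete within-subject design: any participant missing a
# TIME x DOMAIN x INTERVAL cell is flagged (warning or error) by the call
# below, which is why the complete-case count above matters. A minimal
# alternative sketch, assuming the current afex::aov_ez API (the header loads
# afex "for aov_ez" but the script never calls it):
#
# aov_ez(id = "ResponseId", dv = "MEAN_DIFFERENCE", data = long_data_clean,
#        between = c("TEMPORAL_DO", "INTERVAL_DO"),
#        within = c("TIME", "DOMAIN", "INTERVAL"), type = 3)
#
# afex corrects within-subject effects with Greenhouse-Geisser by default, so
# its output would correspond to the GG-corrected ezANOVA results reported below.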
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - - - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity - -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect 
== "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - - - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) - -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans - - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME - -time_emmeans <- emmeans(aov_model, ~ TIME) - -print(time_emmeans) - -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN - -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) - -print(domain_emmeans) - -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL - -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) - -print(interval_emmeans) - -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO - -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") 
-print(temporal_contrasts) - -# Main effect of INTERVAL_DO - -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - - - -# Main Effect of TIME (if significant) - -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) - -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) - -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) - -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) - -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) - -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - 
print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - - - -# First, identify which interactions are significant -significant_interactions <- anova_output[anova_output$p < 0.05 & grepl(":", anova_output$Effect), ] - -if(nrow(significant_interactions) > 0) { - print("Significant interactions found:") - print(significant_interactions[, c("Effect", "p")]) - - # TIME × DOMAIN interaction (if significant) - if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { - print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") - time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) - print("Estimated Marginal Means:") - print(time_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TIME:") - time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") - print(time_domain_simple) - - print("\nSimple Effects of TIME within each DOMAIN:") - time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(time_domain_simple2) - } - - # TIME × INTERVAL interaction (if significant) - if("TIME:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL"] < 0.05) { - print("\n=== TIME × INTERVAL INTERACTION (SIGNIFICANT) ===") - time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) - print("Estimated Marginal Means:") - print(time_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TIME:") - time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") - print(time_interval_simple) - - print("\nSimple Effects of TIME within each INTERVAL:") - time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(time_interval_simple2) - } - - # DOMAIN × INTERVAL interaction (if significant) - if("DOMAIN:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "DOMAIN:INTERVAL"] < 0.05) { - print("\n=== DOMAIN × INTERVAL INTERACTION (SIGNIFICANT) ===") - domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) - print("Estimated Marginal Means:") - print(domain_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each DOMAIN:") - domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(domain_interval_simple) - - print("\nSimple Effects of DOMAIN within each INTERVAL:") - domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(domain_interval_simple2) - } - - # TEMPORAL_DO × TIME interaction (if significant) - if("TEMPORAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:TIME"] < 0.05) { - print("\n=== TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - print("Estimated Marginal Means:") - print(temporal_time_emmeans) - - print("\nSimple Effects of TIME within each TEMPORAL_DO:") - temporal_time_simple <- 
pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_time_simple) - - print("\nSimple Effects of TEMPORAL_DO within each TIME:") - temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") - print(temporal_time_simple2) - } - - # TEMPORAL_DO × DOMAIN interaction (if significant) - if("TEMPORAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:DOMAIN"] < 0.05) { - print("\n=== TEMPORAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(temporal_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") - temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_domain_simple) - - print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") - temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(temporal_domain_simple2) - } - - # TEMPORAL_DO × INTERVAL interaction (if significant) - if("TEMPORAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:INTERVAL"] < 0.05) { - print("\n=== TEMPORAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - temporal_interval_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(temporal_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TEMPORAL_DO:") - temporal_interval_simple <- pairs(temporal_interval_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_interval_simple) - - print("\nSimple Effects of TEMPORAL_DO within each INTERVAL:") - temporal_interval_simple2 <- pairs(temporal_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(temporal_interval_simple2) - } - - # INTERVAL_DO × TIME interaction (if significant) - if("INTERVAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:TIME"] < 0.05) { - print("\n=== INTERVAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - interval_do_time_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * TIME) - print("Estimated Marginal Means:") - print(interval_do_time_emmeans) - - print("\nSimple Effects of TIME within each INTERVAL_DO:") - interval_do_time_simple <- pairs(interval_do_time_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_time_simple) - - print("\nSimple Effects of INTERVAL_DO within each TIME:") - interval_do_time_simple2 <- pairs(interval_do_time_emmeans, by = "TIME", adjust = "bonferroni") - print(interval_do_time_simple2) - } - - # INTERVAL_DO × DOMAIN interaction (if significant) - if("INTERVAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:DOMAIN"] < 0.05) { - print("\n=== INTERVAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - interval_do_domain_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(interval_do_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each INTERVAL_DO:") - interval_do_domain_simple <- pairs(interval_do_domain_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_domain_simple) - - print("\nSimple Effects of INTERVAL_DO within each DOMAIN:") - interval_do_domain_simple2 <- pairs(interval_do_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(interval_do_domain_simple2) - } - - # INTERVAL_DO × INTERVAL interaction (if significant) 
- if("INTERVAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:INTERVAL"] < 0.05) { - print("\n=== INTERVAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - interval_do_interval_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(interval_do_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each INTERVAL_DO:") - interval_do_interval_simple <- pairs(interval_do_interval_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_interval_simple) - - print("\nSimple Effects of INTERVAL_DO within each INTERVAL:") - interval_do_interval_simple2 <- pairs(interval_do_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(interval_do_interval_simple2) - } - -} else { - print("No significant interactions found.") - print("All interaction p-values:") - interaction_effects <- anova_output[grepl(":", anova_output$Effect), ] - print(interaction_effects[, c("Effect", "p")]) -} - - - -# ============================================================================= -# INTERACTION PLOT: TEMPORAL_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - - - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for TEMPORAL_DO × TIME × INTERVAL -emm_3way <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way <- emm_3way %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Emmeans + Error Bars) -# 
============================================================================= - -# Prepare emmeans data frame for TIME main effect -time_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with the 3-way interaction plot) -time_main_plot <- ggplot(time_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251007184520.r b/.history/eohi2/mixed anova - domain means_20251007184520.r deleted file mode 100644 index 455fe84..0000000 --- a/.history/eohi2/mixed anova - domain means_20251007184520.r +++ /dev/null @@ -1,941 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. 
-# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN", - "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - - -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, 
na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - - -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - - -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - - -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors - - -# Check what values the between-subjects factors actually have - -print(unique(long_data_clean$TEMPORAL_DO)) - -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination - - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination - - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) 
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - - - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity - -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect 
== "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - - - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) - -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans - - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME - -time_emmeans <- emmeans(aov_model, ~ TIME) - -print(time_emmeans) - -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN - -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) - -print(domain_emmeans) - -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL - -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) - -print(interval_emmeans) - -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO - -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") 
-print(temporal_contrasts) - -# Main effect of INTERVAL_DO - -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - - - -# Main Effect of TIME (if significant) - -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) - -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) - -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) - -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) - -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) - -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - 
print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - - - -# First, identify which interactions are significant -significant_interactions <- anova_output[anova_output$p < 0.05 & grepl(":", anova_output$Effect), ] - -if(nrow(significant_interactions) > 0) { - print("Significant interactions found:") - print(significant_interactions[, c("Effect", "p")]) - - # TIME × DOMAIN interaction (if significant) - if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { - print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") - time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) - print("Estimated Marginal Means:") - print(time_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TIME:") - time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") - print(time_domain_simple) - - print("\nSimple Effects of TIME within each DOMAIN:") - time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(time_domain_simple2) - } - - # TIME × INTERVAL interaction (if significant) - if("TIME:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL"] < 0.05) { - print("\n=== TIME × INTERVAL INTERACTION (SIGNIFICANT) ===") - time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) - print("Estimated Marginal Means:") - print(time_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TIME:") - time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") - print(time_interval_simple) - - print("\nSimple Effects of TIME within each INTERVAL:") - time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(time_interval_simple2) - } - - # DOMAIN × INTERVAL interaction (if significant) - if("DOMAIN:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "DOMAIN:INTERVAL"] < 0.05) { - print("\n=== DOMAIN × INTERVAL INTERACTION (SIGNIFICANT) ===") - domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) - print("Estimated Marginal Means:") - print(domain_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each DOMAIN:") - domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(domain_interval_simple) - - print("\nSimple Effects of DOMAIN within each INTERVAL:") - domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(domain_interval_simple2) - } - - # TEMPORAL_DO × TIME interaction (if significant) - if("TEMPORAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:TIME"] < 0.05) { - print("\n=== TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - print("Estimated Marginal Means:") - print(temporal_time_emmeans) - - print("\nSimple Effects of TIME within each TEMPORAL_DO:") - temporal_time_simple <- 
pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_time_simple) - - print("\nSimple Effects of TEMPORAL_DO within each TIME:") - temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") - print(temporal_time_simple2) - } - - # TEMPORAL_DO × DOMAIN interaction (if significant) - if("TEMPORAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:DOMAIN"] < 0.05) { - print("\n=== TEMPORAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(temporal_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") - temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_domain_simple) - - print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") - temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(temporal_domain_simple2) - } - - # TEMPORAL_DO × INTERVAL interaction (if significant) - if("TEMPORAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:INTERVAL"] < 0.05) { - print("\n=== TEMPORAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - temporal_interval_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(temporal_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TEMPORAL_DO:") - temporal_interval_simple <- pairs(temporal_interval_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_interval_simple) - - print("\nSimple Effects of TEMPORAL_DO within each INTERVAL:") - temporal_interval_simple2 <- pairs(temporal_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(temporal_interval_simple2) - } - - # INTERVAL_DO × TIME interaction (if significant) - if("INTERVAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:TIME"] < 0.05) { - print("\n=== INTERVAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - interval_do_time_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * TIME) - print("Estimated Marginal Means:") - print(interval_do_time_emmeans) - - print("\nSimple Effects of TIME within each INTERVAL_DO:") - interval_do_time_simple <- pairs(interval_do_time_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_time_simple) - - print("\nSimple Effects of INTERVAL_DO within each TIME:") - interval_do_time_simple2 <- pairs(interval_do_time_emmeans, by = "TIME", adjust = "bonferroni") - print(interval_do_time_simple2) - } - - # INTERVAL_DO × DOMAIN interaction (if significant) - if("INTERVAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:DOMAIN"] < 0.05) { - print("\n=== INTERVAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - interval_do_domain_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(interval_do_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each INTERVAL_DO:") - interval_do_domain_simple <- pairs(interval_do_domain_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_domain_simple) - - print("\nSimple Effects of INTERVAL_DO within each DOMAIN:") - interval_do_domain_simple2 <- pairs(interval_do_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(interval_do_domain_simple2) - } - - # INTERVAL_DO × INTERVAL interaction (if significant) 
- if("INTERVAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:INTERVAL"] < 0.05) { - print("\n=== INTERVAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - interval_do_interval_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(interval_do_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each INTERVAL_DO:") - interval_do_interval_simple <- pairs(interval_do_interval_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_interval_simple) - - print("\nSimple Effects of INTERVAL_DO within each INTERVAL:") - interval_do_interval_simple2 <- pairs(interval_do_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(interval_do_interval_simple2) - } - - # THREE-WAY INTERACTION: TIME × INTERVAL × TEMPORAL_DO - if("TIME:INTERVAL:TEMPORAL_DO" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL:TEMPORAL_DO"] < 0.05) { - print("\n=== TIME × INTERVAL × TEMPORAL_DO THREE-WAY INTERACTION (SIGNIFICANT) ===") - three_way_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL * TEMPORAL_DO) - print("Estimated Marginal Means:") - print(three_way_emmeans) - - print("\nPast vs Future contrasts within each INTERVAL × TEMPORAL_DO combination:") - three_way_contrasts <- pairs(three_way_emmeans, by = c("INTERVAL", "TEMPORAL_DO"), adjust = "bonferroni") - print(three_way_contrasts) - - # Calculate Cohen's d for significant Past vs Future contrasts - three_way_contrasts_df <- as.data.frame(three_way_contrasts) - significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - - if(nrow(significant_three_way) > 0) { - print("\n=== COHEN'S D FOR SIGNIFICANT TIME CONTRASTS IN THREE-WAY INTERACTION ===") - print("Significant Past vs Future contrasts (p < 0.05):") - print(significant_three_way) - - print("\nCohen's d calculations for significant Past vs Future contrasts:") - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - interval_level <- as.character(comparison$INTERVAL) - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - - # Get data for Past and Future within this INTERVAL × TEMPORAL_DO combination - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$INTERVAL == interval_level & - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$INTERVAL == interval_level & - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("INTERVAL = %s, TEMPORAL_DO = %s:\n", interval_level, temporal_do_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } - } else { - cat("No significant Past vs Future contrasts found within any INTERVAL × TEMPORAL_DO combination.\n") - } - } else { - print("\n=== TIME × INTERVAL × TEMPORAL_DO THREE-WAY INTERACTION ===") - 
if("TIME:INTERVAL:TEMPORAL_DO" %in% anova_output$Effect) { - p_value <- anova_output$p[anova_output$Effect == "TIME:INTERVAL:TEMPORAL_DO"] - print(sprintf("Three-way interaction not significant: p = %.6f", p_value)) - } else { - print("Three-way interaction not found in ANOVA results") - } - } - -} else { - print("No significant interactions found.") - print("All interaction p-values:") - interaction_effects <- anova_output[grepl(":", anova_output$Effect), ] - print(interaction_effects[, c("Effect", "p")]) -} - - - -# ============================================================================= -# INTERACTION PLOT: TEMPORAL_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - - - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for TEMPORAL_DO × TIME × INTERVAL -emm_3way <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way <- emm_3way %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Emmeans + Error Bars) -# ============================================================================= - -# Prepare emmeans data frame for TIME main effect -time_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with the 3-way interaction plot) -time_main_plot <- 
ggplot(time_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251007184536.r b/.history/eohi2/mixed anova - domain means_20251007184536.r deleted file mode 100644 index 455fe84..0000000 --- a/.history/eohi2/mixed anova - domain means_20251007184536.r +++ /dev/null @@ -1,941 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. 
-# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN", - "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - - -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, 
na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - - -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - - -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - - -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors - - -# Check what values the between-subjects factors actually have - -print(unique(long_data_clean$TEMPORAL_DO)) - -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination - - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination - - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) 
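# Minimal sketch (assumption: the conventional rule-of-thumb cutoff of ~3 for
# roughly equal group sizes; the script above only prints the raw ratios).
# flag_fmax() marks Hartley F-max ratios that suggest heterogeneous variances.
flag_fmax <- function(ratios, cutoff = 3) {
  data.frame(f_max_ratio = ratios, flagged = ratios > cutoff)
}
# e.g. flag_fmax(observed_temporal_ratios$f_max_ratio)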
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - - - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity - -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect 
== "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - - - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) - -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans - - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME - -time_emmeans <- emmeans(aov_model, ~ TIME) - -print(time_emmeans) - -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN - -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) - -print(domain_emmeans) - -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL - -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) - -print(interval_emmeans) - -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO - -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") 
-print(temporal_contrasts) - -# Main effect of INTERVAL_DO - -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - - - -# Main Effect of TIME (if significant) - -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) - -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) - -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) - -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) - -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) - -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - 
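# Caveat (comment added here, not in the deleted script): cohen.d() treats the
# two vectors as independent samples, and in this pooled long format each
# participant contributes several rows per factor level, so n1/n2 are row
# counts rather than participant counts. With equal-length, participant-aligned
# vectors, effsize also supports a paired estimate:
#   cohen.d(data1, data2, paired = TRUE)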
print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - - - -# First, identify which interactions are significant -significant_interactions <- anova_output[anova_output$p < 0.05 & grepl(":", anova_output$Effect), ] - -if(nrow(significant_interactions) > 0) { - print("Significant interactions found:") - print(significant_interactions[, c("Effect", "p")]) - - # TIME × DOMAIN interaction (if significant) - if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { - print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") - time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) - print("Estimated Marginal Means:") - print(time_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TIME:") - time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") - print(time_domain_simple) - - print("\nSimple Effects of TIME within each DOMAIN:") - time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(time_domain_simple2) - } - - # TIME × INTERVAL interaction (if significant) - if("TIME:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL"] < 0.05) { - print("\n=== TIME × INTERVAL INTERACTION (SIGNIFICANT) ===") - time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) - print("Estimated Marginal Means:") - print(time_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TIME:") - time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") - print(time_interval_simple) - - print("\nSimple Effects of TIME within each INTERVAL:") - time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(time_interval_simple2) - } - - # DOMAIN × INTERVAL interaction (if significant) - if("DOMAIN:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "DOMAIN:INTERVAL"] < 0.05) { - print("\n=== DOMAIN × INTERVAL INTERACTION (SIGNIFICANT) ===") - domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) - print("Estimated Marginal Means:") - print(domain_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each DOMAIN:") - domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(domain_interval_simple) - - print("\nSimple Effects of DOMAIN within each INTERVAL:") - domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(domain_interval_simple2) - } - - # TEMPORAL_DO × TIME interaction (if significant) - if("TEMPORAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:TIME"] < 0.05) { - print("\n=== TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - print("Estimated Marginal Means:") - print(temporal_time_emmeans) - - print("\nSimple Effects of TIME within each TEMPORAL_DO:") - temporal_time_simple <- 
pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_time_simple) - - print("\nSimple Effects of TEMPORAL_DO within each TIME:") - temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") - print(temporal_time_simple2) - } - - # TEMPORAL_DO × DOMAIN interaction (if significant) - if("TEMPORAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:DOMAIN"] < 0.05) { - print("\n=== TEMPORAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(temporal_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") - temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_domain_simple) - - print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") - temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(temporal_domain_simple2) - } - - # TEMPORAL_DO × INTERVAL interaction (if significant) - if("TEMPORAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:INTERVAL"] < 0.05) { - print("\n=== TEMPORAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - temporal_interval_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(temporal_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TEMPORAL_DO:") - temporal_interval_simple <- pairs(temporal_interval_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_interval_simple) - - print("\nSimple Effects of TEMPORAL_DO within each INTERVAL:") - temporal_interval_simple2 <- pairs(temporal_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(temporal_interval_simple2) - } - - # INTERVAL_DO × TIME interaction (if significant) - if("INTERVAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:TIME"] < 0.05) { - print("\n=== INTERVAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - interval_do_time_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * TIME) - print("Estimated Marginal Means:") - print(interval_do_time_emmeans) - - print("\nSimple Effects of TIME within each INTERVAL_DO:") - interval_do_time_simple <- pairs(interval_do_time_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_time_simple) - - print("\nSimple Effects of INTERVAL_DO within each TIME:") - interval_do_time_simple2 <- pairs(interval_do_time_emmeans, by = "TIME", adjust = "bonferroni") - print(interval_do_time_simple2) - } - - # INTERVAL_DO × DOMAIN interaction (if significant) - if("INTERVAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:DOMAIN"] < 0.05) { - print("\n=== INTERVAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - interval_do_domain_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(interval_do_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each INTERVAL_DO:") - interval_do_domain_simple <- pairs(interval_do_domain_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_domain_simple) - - print("\nSimple Effects of INTERVAL_DO within each DOMAIN:") - interval_do_domain_simple2 <- pairs(interval_do_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(interval_do_domain_simple2) - } - - # INTERVAL_DO × INTERVAL interaction (if significant) 
- if("INTERVAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:INTERVAL"] < 0.05) { - print("\n=== INTERVAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - interval_do_interval_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(interval_do_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each INTERVAL_DO:") - interval_do_interval_simple <- pairs(interval_do_interval_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_interval_simple) - - print("\nSimple Effects of INTERVAL_DO within each INTERVAL:") - interval_do_interval_simple2 <- pairs(interval_do_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(interval_do_interval_simple2) - } - - # THREE-WAY INTERACTION: TIME × INTERVAL × TEMPORAL_DO - if("TIME:INTERVAL:TEMPORAL_DO" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL:TEMPORAL_DO"] < 0.05) { - print("\n=== TIME × INTERVAL × TEMPORAL_DO THREE-WAY INTERACTION (SIGNIFICANT) ===") - three_way_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL * TEMPORAL_DO) - print("Estimated Marginal Means:") - print(three_way_emmeans) - - print("\nPast vs Future contrasts within each INTERVAL × TEMPORAL_DO combination:") - three_way_contrasts <- pairs(three_way_emmeans, by = c("INTERVAL", "TEMPORAL_DO"), adjust = "bonferroni") - print(three_way_contrasts) - - # Calculate Cohen's d for significant Past vs Future contrasts - three_way_contrasts_df <- as.data.frame(three_way_contrasts) - significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - - if(nrow(significant_three_way) > 0) { - print("\n=== COHEN'S D FOR SIGNIFICANT TIME CONTRASTS IN THREE-WAY INTERACTION ===") - print("Significant Past vs Future contrasts (p < 0.05):") - print(significant_three_way) - - print("\nCohen's d calculations for significant Past vs Future contrasts:") - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - interval_level <- as.character(comparison$INTERVAL) - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - - # Get data for Past and Future within this INTERVAL × TEMPORAL_DO combination - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$INTERVAL == interval_level & - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$INTERVAL == interval_level & - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("INTERVAL = %s, TEMPORAL_DO = %s:\n", interval_level, temporal_do_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } - } else { - cat("No significant Past vs Future contrasts found within any INTERVAL × TEMPORAL_DO combination.\n") - } - } else { - print("\n=== TIME × INTERVAL × TEMPORAL_DO THREE-WAY INTERACTION ===") - 
if("TIME:INTERVAL:TEMPORAL_DO" %in% anova_output$Effect) { - p_value <- anova_output$p[anova_output$Effect == "TIME:INTERVAL:TEMPORAL_DO"] - print(sprintf("Three-way interaction not significant: p = %.6f", p_value)) - } else { - print("Three-way interaction not found in ANOVA results") - } - } - -} else { - print("No significant interactions found.") - print("All interaction p-values:") - interaction_effects <- anova_output[grepl(":", anova_output$Effect), ] - print(interaction_effects[, c("Effect", "p")]) -} - - - -# ============================================================================= -# INTERACTION PLOT: TEMPORAL_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - - - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for TEMPORAL_DO × TIME × INTERVAL -emm_3way <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way <- emm_3way %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Emmeans + Error Bars) -# ============================================================================= - -# Prepare emmeans data frame for TIME main effect -time_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with the 3-way interaction plot) -time_main_plot <- 
ggplot(time_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file
- if("INTERVAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:INTERVAL"] < 0.05) { - print("\n=== INTERVAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - interval_do_interval_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(interval_do_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each INTERVAL_DO:") - interval_do_interval_simple <- pairs(interval_do_interval_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_interval_simple) - - print("\nSimple Effects of INTERVAL_DO within each INTERVAL:") - interval_do_interval_simple2 <- pairs(interval_do_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(interval_do_interval_simple2) - } - - # THREE-WAY INTERACTION: TIME × INTERVAL × TEMPORAL_DO - if("TIME:INTERVAL:TEMPORAL_DO" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL:TEMPORAL_DO"] < 0.05) { - print("\n=== TIME × INTERVAL × TEMPORAL_DO THREE-WAY INTERACTION (SIGNIFICANT) ===") - three_way_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL * TEMPORAL_DO) - print("Estimated Marginal Means:") - print(three_way_emmeans) - - print("\nPast vs Future contrasts within each INTERVAL × TEMPORAL_DO combination:") - three_way_contrasts <- pairs(three_way_emmeans, by = c("INTERVAL", "TEMPORAL_DO"), adjust = "bonferroni") - print(three_way_contrasts) - - # Calculate Cohen's d for significant Past vs Future contrasts - three_way_contrasts_df <- as.data.frame(three_way_contrasts) - significant_three_way <- three_way_contrasts_df[three_way_contrasts_df$p.value < 0.05, ] - - if(nrow(significant_three_way) > 0) { - print("\n=== COHEN'S D FOR SIGNIFICANT TIME CONTRASTS IN THREE-WAY INTERACTION ===") - print("Significant Past vs Future contrasts (p < 0.05):") - print(significant_three_way) - - print("\nCohen's d calculations for significant Past vs Future contrasts:") - - for(i in seq_len(nrow(significant_three_way))) { - comparison <- significant_three_way[i, ] - - # Extract the grouping variables - interval_level <- as.character(comparison$INTERVAL) - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - - # Get data for Past and Future within this INTERVAL × TEMPORAL_DO combination - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$INTERVAL == interval_level & - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$INTERVAL == interval_level & - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("INTERVAL = %s, TEMPORAL_DO = %s:\n", interval_level, temporal_do_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } - } else { - cat("No significant Past vs Future contrasts found within any INTERVAL × TEMPORAL_DO combination.\n") - } - } else { - print("\n=== TIME × INTERVAL × TEMPORAL_DO THREE-WAY INTERACTION ===") - 
if("TIME:INTERVAL:TEMPORAL_DO" %in% anova_output$Effect) { - p_value <- anova_output$p[anova_output$Effect == "TIME:INTERVAL:TEMPORAL_DO"] - print(sprintf("Three-way interaction not significant: p = %.6f", p_value)) - } else { - print("Three-way interaction not found in ANOVA results") - } - } - -} else { - print("No significant interactions found.") - print("All interaction p-values:") - interaction_effects <- anova_output[grepl(":", anova_output$Effect), ] - print(interaction_effects[, c("Effect", "p")]) -} - - - -# ============================================================================= -# INTERACTION PLOT: TEMPORAL_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - - - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for TEMPORAL_DO × TIME × INTERVAL -emm_3way <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way <- emm_3way %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Emmeans + Error Bars) -# ============================================================================= - -# Prepare emmeans data frame for TIME main effect -time_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with the 3-way interaction plot) -time_main_plot <- 
ggplot(time_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251007184820.r b/.history/eohi2/mixed anova - domain means_20251007184820.r deleted file mode 100644 index d25c210..0000000 --- a/.history/eohi2/mixed anova - domain means_20251007184820.r +++ /dev/null @@ -1,923 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. 
-# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN", - "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - - -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, 
na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - - -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - - -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - - -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors - - -# Check what values the between-subjects factors actually have - -print(unique(long_data_clean$TEMPORAL_DO)) - -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination - - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination - - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) 
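# --- Editor's sketch, not part of the original script: calculate_hartley_ratio()
# defined above can also summarise variance heterogeneity across all four
# TEMPORAL_DO × INTERVAL_DO cells at once; a ratio near 1 suggests homogeneous
# variances, while a common rule of thumb treats ratios well above ~3 as a concern.
cell_variances <- long_data_clean %>%
  group_by(TEMPORAL_DO, INTERVAL_DO) %>%
  summarise(cell_var = var(MEAN_DIFFERENCE, na.rm = TRUE), .groups = "drop")
print(calculate_hartley_ratio(cell_variances$cell_var))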
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - - - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity - -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect 
== "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - - - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) - -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans - - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME - -time_emmeans <- emmeans(aov_model, ~ TIME) - -print(time_emmeans) - -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN - -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) - -print(domain_emmeans) - -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL - -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) - -print(interval_emmeans) - -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO - -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") 
-print(temporal_contrasts) - -# Main effect of INTERVAL_DO - -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - - - -# Main Effect of TIME (if significant) - -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) - -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) - -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) - -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) - -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) - -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - 
print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - - - -# First, identify which interactions are significant -significant_interactions <- anova_output[anova_output$p < 0.05 & grepl(":", anova_output$Effect), ] - -if(nrow(significant_interactions) > 0) { - print("Significant interactions found:") - print(significant_interactions[, c("Effect", "p")]) - - # TIME × DOMAIN interaction (if significant) - if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { - print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") - time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) - print("Estimated Marginal Means:") - print(time_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TIME:") - time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") - print(time_domain_simple) - - print("\nSimple Effects of TIME within each DOMAIN:") - time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(time_domain_simple2) - } - - # TIME × INTERVAL interaction (if significant) - if("TIME:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL"] < 0.05) { - print("\n=== TIME × INTERVAL INTERACTION (SIGNIFICANT) ===") - time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) - print("Estimated Marginal Means:") - print(time_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TIME:") - time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") - print(time_interval_simple) - - print("\nSimple Effects of TIME within each INTERVAL:") - time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(time_interval_simple2) - } - - # DOMAIN × INTERVAL interaction (if significant) - if("DOMAIN:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "DOMAIN:INTERVAL"] < 0.05) { - print("\n=== DOMAIN × INTERVAL INTERACTION (SIGNIFICANT) ===") - domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) - print("Estimated Marginal Means:") - print(domain_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each DOMAIN:") - domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(domain_interval_simple) - - print("\nSimple Effects of DOMAIN within each INTERVAL:") - domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(domain_interval_simple2) - } - - # TEMPORAL_DO × TIME interaction (if significant) - if("TEMPORAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:TIME"] < 0.05) { - print("\n=== TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - print("Estimated Marginal Means:") - print(temporal_time_emmeans) - - print("\nSimple Effects of TIME within each TEMPORAL_DO:") - temporal_time_simple <- 
pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_time_simple) - - print("\nSimple Effects of TEMPORAL_DO within each TIME:") - temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") - print(temporal_time_simple2) - } - - # TEMPORAL_DO × DOMAIN interaction (if significant) - if("TEMPORAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:DOMAIN"] < 0.05) { - print("\n=== TEMPORAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(temporal_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") - temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_domain_simple) - - print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") - temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(temporal_domain_simple2) - } - - # TEMPORAL_DO × INTERVAL interaction (if significant) - if("TEMPORAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:INTERVAL"] < 0.05) { - print("\n=== TEMPORAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - temporal_interval_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(temporal_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TEMPORAL_DO:") - temporal_interval_simple <- pairs(temporal_interval_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_interval_simple) - - print("\nSimple Effects of TEMPORAL_DO within each INTERVAL:") - temporal_interval_simple2 <- pairs(temporal_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(temporal_interval_simple2) - } - - # INTERVAL_DO × TIME interaction (if significant) - if("INTERVAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:TIME"] < 0.05) { - print("\n=== INTERVAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - interval_do_time_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * TIME) - print("Estimated Marginal Means:") - print(interval_do_time_emmeans) - - print("\nSimple Effects of TIME within each INTERVAL_DO:") - interval_do_time_simple <- pairs(interval_do_time_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_time_simple) - - print("\nSimple Effects of INTERVAL_DO within each TIME:") - interval_do_time_simple2 <- pairs(interval_do_time_emmeans, by = "TIME", adjust = "bonferroni") - print(interval_do_time_simple2) - } - - # INTERVAL_DO × DOMAIN interaction (if significant) - if("INTERVAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:DOMAIN"] < 0.05) { - print("\n=== INTERVAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - interval_do_domain_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(interval_do_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each INTERVAL_DO:") - interval_do_domain_simple <- pairs(interval_do_domain_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_domain_simple) - - print("\nSimple Effects of INTERVAL_DO within each DOMAIN:") - interval_do_domain_simple2 <- pairs(interval_do_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(interval_do_domain_simple2) - } - - # INTERVAL_DO × INTERVAL interaction (if significant) 
- if("INTERVAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:INTERVAL"] < 0.05) { - print("\n=== INTERVAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - interval_do_interval_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(interval_do_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each INTERVAL_DO:") - interval_do_interval_simple <- pairs(interval_do_interval_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_interval_simple) - - print("\nSimple Effects of INTERVAL_DO within each INTERVAL:") - interval_do_interval_simple2 <- pairs(interval_do_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(interval_do_interval_simple2) - } - - # THREE-WAY INTERACTION: TIME × INTERVAL × TEMPORAL_DO - print("\n=== TIME × INTERVAL × TEMPORAL_DO THREE-WAY INTERACTION ===") - three_way_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL * TEMPORAL_DO) - print("Estimated Marginal Means:") - print(three_way_emmeans) - - print("\nPast vs Future contrasts within each INTERVAL × TEMPORAL_DO combination:") - three_way_contrasts <- pairs(three_way_emmeans, by = c("INTERVAL", "TEMPORAL_DO"), adjust = "bonferroni") - print(three_way_contrasts) - - # Calculate Cohen's d for Past vs Future contrasts - three_way_contrasts_df <- as.data.frame(three_way_contrasts) - - print("\n=== COHEN'S D FOR TIME CONTRASTS IN THREE-WAY INTERACTION ===") - print("Past vs Future contrasts for all INTERVAL × TEMPORAL_DO combinations:") - - for(i in seq_len(nrow(three_way_contrasts_df))) { - comparison <- three_way_contrasts_df[i, ] - - # Extract the grouping variables - interval_level <- as.character(comparison$INTERVAL) - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - - # Get data for Past and Future within this INTERVAL × TEMPORAL_DO combination - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$INTERVAL == interval_level & - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$INTERVAL == interval_level & - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("INTERVAL = %s, TEMPORAL_DO = %s:\n", interval_level, temporal_do_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } - -} else { - print("No significant interactions found.") - print("All interaction p-values:") - interaction_effects <- anova_output[grepl(":", anova_output$Effect), ] - print(interaction_effects[, c("Effect", "p")]) -} - - - -# ============================================================================= -# INTERACTION PLOT: TEMPORAL_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - - - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for TEMPORAL_DO × TIME × INTERVAL -emm_3way <- 
emmeans(aov_model, ~ TEMPORAL_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way <- emm_3way %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Emmeans + Error Bars) -# ============================================================================= - -# Prepare emmeans data frame for TIME main effect -time_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with the 3-way interaction plot) -time_main_plot <- ggplot(time_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), 
- panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251007184823.r b/.history/eohi2/mixed anova - domain means_20251007184823.r deleted file mode 100644 index d25c210..0000000 --- a/.history/eohi2/mixed anova - domain means_20251007184823.r +++ /dev/null @@ -1,923 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. -# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN", - "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - - -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = 
c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - - -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - - -print(outlier_summary) - -# 3. 
Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - - -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors - - -# Check what values the between-subjects factors actually have - -print(unique(long_data_clean$TEMPORAL_DO)) - -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination - - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination - - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", 
length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - - - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity - -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == 
effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - - - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) - -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans - - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME - -time_emmeans <- emmeans(aov_model, ~ TIME) - -print(time_emmeans) - -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN - -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) - 
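# [Editor's note - hedged sketch, not part of the deleted snapshot above.]
# The snapshot loads afex "for aov_ez" but never calls it, and it notes that
# emmeans needs an aov object rather than ezANOVA output. An afex_aov object
# would serve both purposes at once: it carries the sphericity-corrected tests
# and can be passed to emmeans() directly. Assuming the same long_data_clean
# layout used throughout this script:
afex_model <- afex::aov_ez(
  id      = "ResponseId",
  dv      = "MEAN_DIFFERENCE",
  data    = long_data_clean,
  between = c("TEMPORAL_DO", "INTERVAL_DO"),
  within  = c("TIME", "DOMAIN", "INTERVAL"),
  type    = 3
)
print(anova(afex_model, correction = "GG"))  # Greenhouse-Geisser-corrected table
# emmeans(afex_model, ~ TIME) etc. would then replace the hand-built aov_model.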
-print(domain_emmeans) - -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL - -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) - -print(interval_emmeans) - -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO - -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO - -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - - - -# Main Effect of TIME (if significant) - -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) - -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) - -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) - -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) - -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) - -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- 
as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - - - -# First, identify which interactions are significant -significant_interactions <- anova_output[anova_output$p < 0.05 & grepl(":", anova_output$Effect), ] - -if(nrow(significant_interactions) > 0) { - print("Significant interactions found:") - print(significant_interactions[, c("Effect", "p")]) - - # TIME × DOMAIN interaction (if significant) - if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { - print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") - time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) - print("Estimated Marginal Means:") - print(time_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TIME:") - time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") - print(time_domain_simple) - - print("\nSimple Effects of TIME within each DOMAIN:") - time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(time_domain_simple2) - } - - # TIME × INTERVAL interaction (if significant) - if("TIME:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL"] < 0.05) { - print("\n=== TIME × INTERVAL INTERACTION (SIGNIFICANT) ===") - time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) - print("Estimated Marginal Means:") - print(time_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TIME:") - time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") - print(time_interval_simple) - - print("\nSimple Effects of TIME within each INTERVAL:") - time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(time_interval_simple2) - } - - # DOMAIN × INTERVAL interaction (if significant) - if("DOMAIN:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "DOMAIN:INTERVAL"] < 0.05) { - print("\n=== DOMAIN × INTERVAL INTERACTION (SIGNIFICANT) ===") - domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) - print("Estimated Marginal Means:") - print(domain_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each DOMAIN:") - domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(domain_interval_simple) - - print("\nSimple Effects of DOMAIN within each INTERVAL:") - domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(domain_interval_simple2) - } - - # 
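# [Editor's note - hedged sketch, not part of the deleted snapshot above.]
# The cohen.d() calls in the preceding section compare pooled Past vs Future
# observations as if they were independent samples, which ignores the
# repeated-measures pairing. A paired d on per-participant means may be more
# defensible; a minimal version, assuming every participant contributes both
# TIME levels:
participant_means <- long_data_clean %>%
  group_by(ResponseId, TIME) %>%
  summarise(m = mean(MEAN_DIFFERENCE), .groups = "drop") %>%
  pivot_wider(names_from = TIME, values_from = m)
time_d_paired <- effsize::cohen.d(participant_means$Past,
                                  participant_means$Future,
                                  paired = TRUE, na.rm = TRUE)
print(time_d_paired)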
TEMPORAL_DO × TIME interaction (if significant) - if("TEMPORAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:TIME"] < 0.05) { - print("\n=== TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - print("Estimated Marginal Means:") - print(temporal_time_emmeans) - - print("\nSimple Effects of TIME within each TEMPORAL_DO:") - temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_time_simple) - - print("\nSimple Effects of TEMPORAL_DO within each TIME:") - temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") - print(temporal_time_simple2) - } - - # TEMPORAL_DO × DOMAIN interaction (if significant) - if("TEMPORAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:DOMAIN"] < 0.05) { - print("\n=== TEMPORAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(temporal_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") - temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_domain_simple) - - print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") - temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(temporal_domain_simple2) - } - - # TEMPORAL_DO × INTERVAL interaction (if significant) - if("TEMPORAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:INTERVAL"] < 0.05) { - print("\n=== TEMPORAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - temporal_interval_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(temporal_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TEMPORAL_DO:") - temporal_interval_simple <- pairs(temporal_interval_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_interval_simple) - - print("\nSimple Effects of TEMPORAL_DO within each INTERVAL:") - temporal_interval_simple2 <- pairs(temporal_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(temporal_interval_simple2) - } - - # INTERVAL_DO × TIME interaction (if significant) - if("INTERVAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:TIME"] < 0.05) { - print("\n=== INTERVAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - interval_do_time_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * TIME) - print("Estimated Marginal Means:") - print(interval_do_time_emmeans) - - print("\nSimple Effects of TIME within each INTERVAL_DO:") - interval_do_time_simple <- pairs(interval_do_time_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_time_simple) - - print("\nSimple Effects of INTERVAL_DO within each TIME:") - interval_do_time_simple2 <- pairs(interval_do_time_emmeans, by = "TIME", adjust = "bonferroni") - print(interval_do_time_simple2) - } - - # INTERVAL_DO × DOMAIN interaction (if significant) - if("INTERVAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:DOMAIN"] < 0.05) { - print("\n=== INTERVAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - interval_do_domain_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(interval_do_domain_emmeans) - - 
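# [Editor's note - hedged sketch, not part of the deleted snapshot above.]
# For any of the two-way cells being unpacked in these blocks, emmeans::emmip()
# draws the same estimated marginal means as a one-line interaction plot, which
# is a quick visual cross-check on the printed simple-effects tables, e.g.:
print(emmip(aov_model, INTERVAL_DO ~ DOMAIN, CIs = TRUE))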
print("\nSimple Effects of DOMAIN within each INTERVAL_DO:") - interval_do_domain_simple <- pairs(interval_do_domain_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_domain_simple) - - print("\nSimple Effects of INTERVAL_DO within each DOMAIN:") - interval_do_domain_simple2 <- pairs(interval_do_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(interval_do_domain_simple2) - } - - # INTERVAL_DO × INTERVAL interaction (if significant) - if("INTERVAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:INTERVAL"] < 0.05) { - print("\n=== INTERVAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - interval_do_interval_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(interval_do_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each INTERVAL_DO:") - interval_do_interval_simple <- pairs(interval_do_interval_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_interval_simple) - - print("\nSimple Effects of INTERVAL_DO within each INTERVAL:") - interval_do_interval_simple2 <- pairs(interval_do_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(interval_do_interval_simple2) - } - - # THREE-WAY INTERACTION: TIME × INTERVAL × TEMPORAL_DO - print("\n=== TIME × INTERVAL × TEMPORAL_DO THREE-WAY INTERACTION ===") - three_way_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL * TEMPORAL_DO) - print("Estimated Marginal Means:") - print(three_way_emmeans) - - print("\nPast vs Future contrasts within each INTERVAL × TEMPORAL_DO combination:") - three_way_contrasts <- pairs(three_way_emmeans, by = c("INTERVAL", "TEMPORAL_DO"), adjust = "bonferroni") - print(three_way_contrasts) - - # Calculate Cohen's d for Past vs Future contrasts - three_way_contrasts_df <- as.data.frame(three_way_contrasts) - - print("\n=== COHEN'S D FOR TIME CONTRASTS IN THREE-WAY INTERACTION ===") - print("Past vs Future contrasts for all INTERVAL × TEMPORAL_DO combinations:") - - for(i in seq_len(nrow(three_way_contrasts_df))) { - comparison <- three_way_contrasts_df[i, ] - - # Extract the grouping variables - interval_level <- as.character(comparison$INTERVAL) - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - - # Get data for Past and Future within this INTERVAL × TEMPORAL_DO combination - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$INTERVAL == interval_level & - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$INTERVAL == interval_level & - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("INTERVAL = %s, TEMPORAL_DO = %s:\n", interval_level, temporal_do_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } - -} else { - print("No significant interactions found.") - print("All interaction p-values:") - interaction_effects <- anova_output[grepl(":", 
anova_output$Effect), ] - print(interaction_effects[, c("Effect", "p")]) -} - - - -# ============================================================================= -# INTERACTION PLOT: TEMPORAL_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - - - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for TEMPORAL_DO × TIME × INTERVAL -emm_3way <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way <- emm_3way %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Emmeans + Error Bars) -# ============================================================================= - -# Prepare emmeans data frame for TIME main effect -time_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with the 3-way interaction plot) -time_main_plot <- ggplot(time_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = 
"Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251007184824.r b/.history/eohi2/mixed anova - domain means_20251007184824.r deleted file mode 100644 index d25c210..0000000 --- a/.history/eohi2/mixed anova - domain means_20251007184824.r +++ /dev/null @@ -1,923 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. -# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN", - "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - - -print(domain_mapping) - -# Efficient data pivoting using pivot_longer 
-long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - - -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - - -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - - -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_interval) - -# 5. 
Hartley's F-max test for between-subjects factors - - -# Check what values the between-subjects factors actually have - -print(unique(long_data_clean$TEMPORAL_DO)) - -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination - - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination - - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - - - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = 
ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity - -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, 
dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - - - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) - -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans - - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME - -time_emmeans <- emmeans(aov_model, ~ TIME) - -print(time_emmeans) - -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN - -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) - -print(domain_emmeans) - -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL - -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) - -print(interval_emmeans) - -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO - -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO - -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - - - -# Main Effect of TIME (if significant) - -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) - -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: 
%.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) - -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) - -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) - -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) - -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - - - -# First, identify which interactions are significant -significant_interactions <- anova_output[anova_output$p < 0.05 & grepl(":", anova_output$Effect), ] - -if(nrow(significant_interactions) > 0) { - print("Significant interactions found:") - print(significant_interactions[, c("Effect", "p")]) - - # TIME × DOMAIN interaction (if significant) - if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { - print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") - 
time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) - print("Estimated Marginal Means:") - print(time_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TIME:") - time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") - print(time_domain_simple) - - print("\nSimple Effects of TIME within each DOMAIN:") - time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(time_domain_simple2) - } - - # TIME × INTERVAL interaction (if significant) - if("TIME:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL"] < 0.05) { - print("\n=== TIME × INTERVAL INTERACTION (SIGNIFICANT) ===") - time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) - print("Estimated Marginal Means:") - print(time_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TIME:") - time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") - print(time_interval_simple) - - print("\nSimple Effects of TIME within each INTERVAL:") - time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(time_interval_simple2) - } - - # DOMAIN × INTERVAL interaction (if significant) - if("DOMAIN:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "DOMAIN:INTERVAL"] < 0.05) { - print("\n=== DOMAIN × INTERVAL INTERACTION (SIGNIFICANT) ===") - domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) - print("Estimated Marginal Means:") - print(domain_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each DOMAIN:") - domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(domain_interval_simple) - - print("\nSimple Effects of DOMAIN within each INTERVAL:") - domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(domain_interval_simple2) - } - - # TEMPORAL_DO × TIME interaction (if significant) - if("TEMPORAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:TIME"] < 0.05) { - print("\n=== TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - print("Estimated Marginal Means:") - print(temporal_time_emmeans) - - print("\nSimple Effects of TIME within each TEMPORAL_DO:") - temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_time_simple) - - print("\nSimple Effects of TEMPORAL_DO within each TIME:") - temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") - print(temporal_time_simple2) - } - - # TEMPORAL_DO × DOMAIN interaction (if significant) - if("TEMPORAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:DOMAIN"] < 0.05) { - print("\n=== TEMPORAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(temporal_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") - temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_domain_simple) - - print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") - temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(temporal_domain_simple2) - } - - # TEMPORAL_DO × INTERVAL 
interaction (if significant) - if("TEMPORAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:INTERVAL"] < 0.05) { - print("\n=== TEMPORAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - temporal_interval_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(temporal_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TEMPORAL_DO:") - temporal_interval_simple <- pairs(temporal_interval_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_interval_simple) - - print("\nSimple Effects of TEMPORAL_DO within each INTERVAL:") - temporal_interval_simple2 <- pairs(temporal_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(temporal_interval_simple2) - } - - # INTERVAL_DO × TIME interaction (if significant) - if("INTERVAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:TIME"] < 0.05) { - print("\n=== INTERVAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - interval_do_time_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * TIME) - print("Estimated Marginal Means:") - print(interval_do_time_emmeans) - - print("\nSimple Effects of TIME within each INTERVAL_DO:") - interval_do_time_simple <- pairs(interval_do_time_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_time_simple) - - print("\nSimple Effects of INTERVAL_DO within each TIME:") - interval_do_time_simple2 <- pairs(interval_do_time_emmeans, by = "TIME", adjust = "bonferroni") - print(interval_do_time_simple2) - } - - # INTERVAL_DO × DOMAIN interaction (if significant) - if("INTERVAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:DOMAIN"] < 0.05) { - print("\n=== INTERVAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - interval_do_domain_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(interval_do_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each INTERVAL_DO:") - interval_do_domain_simple <- pairs(interval_do_domain_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_domain_simple) - - print("\nSimple Effects of INTERVAL_DO within each DOMAIN:") - interval_do_domain_simple2 <- pairs(interval_do_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(interval_do_domain_simple2) - } - - # INTERVAL_DO × INTERVAL interaction (if significant) - if("INTERVAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:INTERVAL"] < 0.05) { - print("\n=== INTERVAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - interval_do_interval_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(interval_do_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each INTERVAL_DO:") - interval_do_interval_simple <- pairs(interval_do_interval_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_interval_simple) - - print("\nSimple Effects of INTERVAL_DO within each INTERVAL:") - interval_do_interval_simple2 <- pairs(interval_do_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(interval_do_interval_simple2) - } - - # THREE-WAY INTERACTION: TIME × INTERVAL × TEMPORAL_DO - print("\n=== TIME × INTERVAL × TEMPORAL_DO THREE-WAY INTERACTION ===") - three_way_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL * TEMPORAL_DO) - print("Estimated Marginal Means:") - print(three_way_emmeans) - - print("\nPast vs Future contrasts within each 
INTERVAL × TEMPORAL_DO combination:") - three_way_contrasts <- pairs(three_way_emmeans, by = c("INTERVAL", "TEMPORAL_DO"), adjust = "bonferroni") - print(three_way_contrasts) - - # Calculate Cohen's d for Past vs Future contrasts - three_way_contrasts_df <- as.data.frame(three_way_contrasts) - - print("\n=== COHEN'S D FOR TIME CONTRASTS IN THREE-WAY INTERACTION ===") - print("Past vs Future contrasts for all INTERVAL × TEMPORAL_DO combinations:") - - for(i in seq_len(nrow(three_way_contrasts_df))) { - comparison <- three_way_contrasts_df[i, ] - - # Extract the grouping variables - interval_level <- as.character(comparison$INTERVAL) - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - - # Get data for Past and Future within this INTERVAL × TEMPORAL_DO combination - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$INTERVAL == interval_level & - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$INTERVAL == interval_level & - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("INTERVAL = %s, TEMPORAL_DO = %s:\n", interval_level, temporal_do_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } - -} else { - print("No significant interactions found.") - print("All interaction p-values:") - interaction_effects <- anova_output[grepl(":", anova_output$Effect), ] - print(interaction_effects[, c("Effect", "p")]) -} - - - -# ============================================================================= -# INTERACTION PLOT: TEMPORAL_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - - - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for TEMPORAL_DO × TIME × INTERVAL -emm_3way <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way <- emm_3way %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME × INTERVAL Interaction (Estimated 
Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Emmeans + Error Bars) -# ============================================================================= - -# Prepare emmeans data frame for TIME main effect -time_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with the 3-way interaction plot) -time_main_plot <- ggplot(time_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251007185023.r b/.history/eohi2/mixed anova - domain means_20251007185023.r deleted file mode 100644 index d25c210..0000000 --- a/.history/eohi2/mixed anova - domain means_20251007185023.r +++ /dev/null @@ -1,923 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. 
-# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN", - "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - - -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, 
na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - - -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - - -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - - -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors - - -# Check what values the between-subjects factors actually have - -print(unique(long_data_clean$TEMPORAL_DO)) - -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination - - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination - - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) 
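# Editor's note: the three Levene blocks above call leveneTest() twice per cell
# and rely on it resolving MEAN_DIFFERENCE and the grouping factor from
# summarise()'s data mask. A minimal sketch of an equivalent single-call helper
# with an explicit data= argument, assuming long_data_clean and the packages
# loaded at the top of this script (levene_by_group is an illustrative name):

levene_by_group <- function(df, group_vars, rhs) {
  df %>%
    group_by(across(all_of(group_vars))) %>%
    group_modify(~ {
      lt <- leveneTest(reformulate(rhs, response = "MEAN_DIFFERENCE"), data = .x)
      tibble(levene_F = lt$`F value`[1], levene_p = lt$`Pr(>F)`[1])
    }) %>%
    ungroup()
}

# The same three homogeneity checks as above:
# levene_by_group(long_data_clean, c("DOMAIN", "INTERVAL"), "TIME")
# levene_by_group(long_data_clean, c("TIME", "INTERVAL"), "DOMAIN")
# levene_by_group(long_data_clean, c("TIME", "DOMAIN"), "INTERVAL")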
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - - - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity - -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect 
== "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - - - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) - -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans - - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME - -time_emmeans <- emmeans(aov_model, ~ TIME) - -print(time_emmeans) - -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN - -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) - -print(domain_emmeans) - -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL - -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) - -print(interval_emmeans) - -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO - -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") 
-print(temporal_contrasts) - -# Main effect of INTERVAL_DO - -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - - - -# Main Effect of TIME (if significant) - -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) - -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) - -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) - -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) - -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) - -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - 
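      # Editor's note: as with TIME above, this cohen.d() call treats the two
      # INTERVAL levels being contrasted as independent groups even though
      # INTERVAL is a within-subject factor; the paired sketch earlier in this
      # script is the corresponding alternative.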
print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - - - -# First, identify which interactions are significant -significant_interactions <- anova_output[anova_output$p < 0.05 & grepl(":", anova_output$Effect), ] - -if(nrow(significant_interactions) > 0) { - print("Significant interactions found:") - print(significant_interactions[, c("Effect", "p")]) - - # TIME × DOMAIN interaction (if significant) - if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { - print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") - time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) - print("Estimated Marginal Means:") - print(time_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TIME:") - time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") - print(time_domain_simple) - - print("\nSimple Effects of TIME within each DOMAIN:") - time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(time_domain_simple2) - } - - # TIME × INTERVAL interaction (if significant) - if("TIME:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL"] < 0.05) { - print("\n=== TIME × INTERVAL INTERACTION (SIGNIFICANT) ===") - time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) - print("Estimated Marginal Means:") - print(time_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TIME:") - time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") - print(time_interval_simple) - - print("\nSimple Effects of TIME within each INTERVAL:") - time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(time_interval_simple2) - } - - # DOMAIN × INTERVAL interaction (if significant) - if("DOMAIN:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "DOMAIN:INTERVAL"] < 0.05) { - print("\n=== DOMAIN × INTERVAL INTERACTION (SIGNIFICANT) ===") - domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) - print("Estimated Marginal Means:") - print(domain_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each DOMAIN:") - domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(domain_interval_simple) - - print("\nSimple Effects of DOMAIN within each INTERVAL:") - domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(domain_interval_simple2) - } - - # TEMPORAL_DO × TIME interaction (if significant) - if("TEMPORAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:TIME"] < 0.05) { - print("\n=== TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - print("Estimated Marginal Means:") - print(temporal_time_emmeans) - - print("\nSimple Effects of TIME within each TEMPORAL_DO:") - temporal_time_simple <- 
pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_time_simple) - - print("\nSimple Effects of TEMPORAL_DO within each TIME:") - temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") - print(temporal_time_simple2) - } - - # TEMPORAL_DO × DOMAIN interaction (if significant) - if("TEMPORAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:DOMAIN"] < 0.05) { - print("\n=== TEMPORAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(temporal_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") - temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_domain_simple) - - print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") - temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(temporal_domain_simple2) - } - - # TEMPORAL_DO × INTERVAL interaction (if significant) - if("TEMPORAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:INTERVAL"] < 0.05) { - print("\n=== TEMPORAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - temporal_interval_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(temporal_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TEMPORAL_DO:") - temporal_interval_simple <- pairs(temporal_interval_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_interval_simple) - - print("\nSimple Effects of TEMPORAL_DO within each INTERVAL:") - temporal_interval_simple2 <- pairs(temporal_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(temporal_interval_simple2) - } - - # INTERVAL_DO × TIME interaction (if significant) - if("INTERVAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:TIME"] < 0.05) { - print("\n=== INTERVAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - interval_do_time_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * TIME) - print("Estimated Marginal Means:") - print(interval_do_time_emmeans) - - print("\nSimple Effects of TIME within each INTERVAL_DO:") - interval_do_time_simple <- pairs(interval_do_time_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_time_simple) - - print("\nSimple Effects of INTERVAL_DO within each TIME:") - interval_do_time_simple2 <- pairs(interval_do_time_emmeans, by = "TIME", adjust = "bonferroni") - print(interval_do_time_simple2) - } - - # INTERVAL_DO × DOMAIN interaction (if significant) - if("INTERVAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:DOMAIN"] < 0.05) { - print("\n=== INTERVAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - interval_do_domain_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(interval_do_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each INTERVAL_DO:") - interval_do_domain_simple <- pairs(interval_do_domain_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_domain_simple) - - print("\nSimple Effects of INTERVAL_DO within each DOMAIN:") - interval_do_domain_simple2 <- pairs(interval_do_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(interval_do_domain_simple2) - } - - # INTERVAL_DO × INTERVAL interaction (if significant) 
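# Editor's note: the two-way interaction blocks above and below are structurally
# identical. A compact sketch of the same exploration driven off anova_output,
# assuming aov_model and anova_output as defined earlier (explore_interaction is
# an illustrative name). cat() is used for the banners because print("\n...")
# displays the escape literally instead of emitting a newline:

explore_interaction <- function(model, f1, f2) {
  emm <- emmeans(model, reformulate(paste(f1, f2, sep = " * ")))
  print(emm)                                         # estimated marginal means
  print(pairs(emm, by = f1, adjust = "bonferroni"))  # simple effects of f2 within f1
  print(pairs(emm, by = f2, adjust = "bonferroni"))  # simple effects of f1 within f2
}

two_way <- anova_output$Effect[grepl("^[^:]+:[^:]+$", anova_output$Effect)]
for (eff in two_way) {
  if (anova_output$p[anova_output$Effect == eff] < 0.05) {
    fs <- strsplit(eff, ":")[[1]]
    cat(sprintf("\n=== %s × %s INTERACTION (SIGNIFICANT) ===\n", fs[1], fs[2]))
    explore_interaction(aov_model, fs[1], fs[2])
  }
}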
- if("INTERVAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:INTERVAL"] < 0.05) { - print("\n=== INTERVAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - interval_do_interval_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(interval_do_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each INTERVAL_DO:") - interval_do_interval_simple <- pairs(interval_do_interval_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_interval_simple) - - print("\nSimple Effects of INTERVAL_DO within each INTERVAL:") - interval_do_interval_simple2 <- pairs(interval_do_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(interval_do_interval_simple2) - } - - # THREE-WAY INTERACTION: TIME × INTERVAL × TEMPORAL_DO - print("\n=== TIME × INTERVAL × TEMPORAL_DO THREE-WAY INTERACTION ===") - three_way_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL * TEMPORAL_DO) - print("Estimated Marginal Means:") - print(three_way_emmeans) - - print("\nPast vs Future contrasts within each INTERVAL × TEMPORAL_DO combination:") - three_way_contrasts <- pairs(three_way_emmeans, by = c("INTERVAL", "TEMPORAL_DO"), adjust = "bonferroni") - print(three_way_contrasts) - - # Calculate Cohen's d for Past vs Future contrasts - three_way_contrasts_df <- as.data.frame(three_way_contrasts) - - print("\n=== COHEN'S D FOR TIME CONTRASTS IN THREE-WAY INTERACTION ===") - print("Past vs Future contrasts for all INTERVAL × TEMPORAL_DO combinations:") - - for(i in seq_len(nrow(three_way_contrasts_df))) { - comparison <- three_way_contrasts_df[i, ] - - # Extract the grouping variables - interval_level <- as.character(comparison$INTERVAL) - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - - # Get data for Past and Future within this INTERVAL × TEMPORAL_DO combination - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$INTERVAL == interval_level & - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$INTERVAL == interval_level & - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("INTERVAL = %s, TEMPORAL_DO = %s:\n", interval_level, temporal_do_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } - -} else { - print("No significant interactions found.") - print("All interaction p-values:") - interaction_effects <- anova_output[grepl(":", anova_output$Effect), ] - print(interaction_effects[, c("Effect", "p")]) -} - - - -# ============================================================================= -# INTERACTION PLOT: TEMPORAL_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - - - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for TEMPORAL_DO × TIME × INTERVAL -emm_3way <- 
emmeans(aov_model, ~ TEMPORAL_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way <- emm_3way %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Emmeans + Error Bars) -# ============================================================================= - -# Prepare emmeans data frame for TIME main effect -time_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with the 3-way interaction plot) -time_main_plot <- ggplot(time_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), 
- panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251007185544.r b/.history/eohi2/mixed anova - domain means_20251007185544.r deleted file mode 100644 index d25c210..0000000 --- a/.history/eohi2/mixed anova - domain means_20251007185544.r +++ /dev/null @@ -1,923 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. -# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN", - "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - - -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = 
c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - - -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - - -print(outlier_summary) - -# 3. 
Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - - -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors - - -# Check what values the between-subjects factors actually have - -print(unique(long_data_clean$TEMPORAL_DO)) - -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination - - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination - - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", 
length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - - - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity - -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == 
effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - - - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) - -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans - - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME - -time_emmeans <- emmeans(aov_model, ~ TIME) - -print(time_emmeans) - -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN - -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) - 
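# Editor's note: afex is loaded at the top of this script "for aov_ez" but never
# called. As a cross-check on the aov()/Error() model used for emmeans here, a
# minimal sketch fitting the same design with aov_ez on the long data (afex
# applies type-3 sums of squares and Greenhouse-Geisser correction by default,
# and emmeans() accepts its result):

afex_model <- aov_ez(
  id      = "ResponseId",
  dv      = "MEAN_DIFFERENCE",
  data    = long_data_clean,
  between = c("TEMPORAL_DO", "INTERVAL_DO"),
  within  = c("TIME", "DOMAIN", "INTERVAL")
)
# anova(afex_model)            # sphericity-corrected omnibus table
# emmeans(afex_model, ~ TIME)  # comparable post-hoc machinery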
-print(domain_emmeans) - -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL - -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) - -print(interval_emmeans) - -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO - -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO - -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - - - -# Main Effect of TIME (if significant) - -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) - -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) - -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) - -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) - -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) - -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- 
as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - - - -# First, identify which interactions are significant -significant_interactions <- anova_output[anova_output$p < 0.05 & grepl(":", anova_output$Effect), ] - -if(nrow(significant_interactions) > 0) { - print("Significant interactions found:") - print(significant_interactions[, c("Effect", "p")]) - - # TIME × DOMAIN interaction (if significant) - if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { - print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") - time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) - print("Estimated Marginal Means:") - print(time_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TIME:") - time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") - print(time_domain_simple) - - print("\nSimple Effects of TIME within each DOMAIN:") - time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(time_domain_simple2) - } - - # TIME × INTERVAL interaction (if significant) - if("TIME:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL"] < 0.05) { - print("\n=== TIME × INTERVAL INTERACTION (SIGNIFICANT) ===") - time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) - print("Estimated Marginal Means:") - print(time_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TIME:") - time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") - print(time_interval_simple) - - print("\nSimple Effects of TIME within each INTERVAL:") - time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(time_interval_simple2) - } - - # DOMAIN × INTERVAL interaction (if significant) - if("DOMAIN:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "DOMAIN:INTERVAL"] < 0.05) { - print("\n=== DOMAIN × INTERVAL INTERACTION (SIGNIFICANT) ===") - domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) - print("Estimated Marginal Means:") - print(domain_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each DOMAIN:") - domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(domain_interval_simple) - - print("\nSimple Effects of DOMAIN within each INTERVAL:") - domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(domain_interval_simple2) - } - - # 
TEMPORAL_DO × TIME interaction (if significant) - if("TEMPORAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:TIME"] < 0.05) { - print("\n=== TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - print("Estimated Marginal Means:") - print(temporal_time_emmeans) - - print("\nSimple Effects of TIME within each TEMPORAL_DO:") - temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_time_simple) - - print("\nSimple Effects of TEMPORAL_DO within each TIME:") - temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") - print(temporal_time_simple2) - } - - # TEMPORAL_DO × DOMAIN interaction (if significant) - if("TEMPORAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:DOMAIN"] < 0.05) { - print("\n=== TEMPORAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(temporal_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") - temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_domain_simple) - - print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") - temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(temporal_domain_simple2) - } - - # TEMPORAL_DO × INTERVAL interaction (if significant) - if("TEMPORAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:INTERVAL"] < 0.05) { - print("\n=== TEMPORAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - temporal_interval_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(temporal_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TEMPORAL_DO:") - temporal_interval_simple <- pairs(temporal_interval_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_interval_simple) - - print("\nSimple Effects of TEMPORAL_DO within each INTERVAL:") - temporal_interval_simple2 <- pairs(temporal_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(temporal_interval_simple2) - } - - # INTERVAL_DO × TIME interaction (if significant) - if("INTERVAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:TIME"] < 0.05) { - print("\n=== INTERVAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - interval_do_time_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * TIME) - print("Estimated Marginal Means:") - print(interval_do_time_emmeans) - - print("\nSimple Effects of TIME within each INTERVAL_DO:") - interval_do_time_simple <- pairs(interval_do_time_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_time_simple) - - print("\nSimple Effects of INTERVAL_DO within each TIME:") - interval_do_time_simple2 <- pairs(interval_do_time_emmeans, by = "TIME", adjust = "bonferroni") - print(interval_do_time_simple2) - } - - # INTERVAL_DO × DOMAIN interaction (if significant) - if("INTERVAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:DOMAIN"] < 0.05) { - print("\n=== INTERVAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - interval_do_domain_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(interval_do_domain_emmeans) - - 
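# --- Editor's sketch: the two-way interaction blocks in this section all follow
# one template (print the EMMs, then simple effects in both directions), so a
# small helper could replace the repetition. explore_two_way() is a hypothetical
# name, not part of the original script; it assumes aov_model and emmeans as
# defined above.
explore_two_way <- function(model, f1, f2) {
  emm <- emmeans(model, as.formula(paste("~", f1, "*", f2)))
  print(emm)                                          # estimated marginal means
  print(pairs(emm, by = f1, adjust = "bonferroni"))   # simple effects of f2 within f1
  print(pairs(emm, by = f2, adjust = "bonferroni"))   # simple effects of f1 within f2
  invisible(emm)
}
# Usage, e.g.: explore_two_way(aov_model, "INTERVAL_DO", "DOMAIN")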
print("\nSimple Effects of DOMAIN within each INTERVAL_DO:") - interval_do_domain_simple <- pairs(interval_do_domain_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_domain_simple) - - print("\nSimple Effects of INTERVAL_DO within each DOMAIN:") - interval_do_domain_simple2 <- pairs(interval_do_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(interval_do_domain_simple2) - } - - # INTERVAL_DO × INTERVAL interaction (if significant) - if("INTERVAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:INTERVAL"] < 0.05) { - print("\n=== INTERVAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - interval_do_interval_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(interval_do_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each INTERVAL_DO:") - interval_do_interval_simple <- pairs(interval_do_interval_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_interval_simple) - - print("\nSimple Effects of INTERVAL_DO within each INTERVAL:") - interval_do_interval_simple2 <- pairs(interval_do_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(interval_do_interval_simple2) - } - - # THREE-WAY INTERACTION: TIME × INTERVAL × TEMPORAL_DO - print("\n=== TIME × INTERVAL × TEMPORAL_DO THREE-WAY INTERACTION ===") - three_way_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL * TEMPORAL_DO) - print("Estimated Marginal Means:") - print(three_way_emmeans) - - print("\nPast vs Future contrasts within each INTERVAL × TEMPORAL_DO combination:") - three_way_contrasts <- pairs(three_way_emmeans, by = c("INTERVAL", "TEMPORAL_DO"), adjust = "bonferroni") - print(three_way_contrasts) - - # Calculate Cohen's d for Past vs Future contrasts - three_way_contrasts_df <- as.data.frame(three_way_contrasts) - - print("\n=== COHEN'S D FOR TIME CONTRASTS IN THREE-WAY INTERACTION ===") - print("Past vs Future contrasts for all INTERVAL × TEMPORAL_DO combinations:") - - for(i in seq_len(nrow(three_way_contrasts_df))) { - comparison <- three_way_contrasts_df[i, ] - - # Extract the grouping variables - interval_level <- as.character(comparison$INTERVAL) - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - - # Get data for Past and Future within this INTERVAL × TEMPORAL_DO combination - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$INTERVAL == interval_level & - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$INTERVAL == interval_level & - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("INTERVAL = %s, TEMPORAL_DO = %s:\n", interval_level, temporal_do_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } - -} else { - print("No significant interactions found.") - print("All interaction p-values:") - interaction_effects <- anova_output[grepl(":", 
anova_output$Effect), ] - print(interaction_effects[, c("Effect", "p")]) -} - - - -# ============================================================================= -# INTERACTION PLOT: TEMPORAL_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - - - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for TEMPORAL_DO × TIME × INTERVAL -emm_3way <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way <- emm_3way %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Emmeans + Error Bars) -# ============================================================================= - -# Prepare emmeans data frame for TIME main effect -time_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with the 3-way interaction plot) -time_main_plot <- ggplot(time_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = 
"Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251008192902.r b/.history/eohi2/mixed anova - domain means_20251008192902.r deleted file mode 100644 index d25c210..0000000 --- a/.history/eohi2/mixed anova - domain means_20251008192902.r +++ /dev/null @@ -1,923 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. -# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN", - "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - - -print(domain_mapping) - -# Efficient data pivoting using pivot_longer 
-long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - - -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - - -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - - -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_interval) - -# 5. 
Hartley's F-max test for between-subjects factors - - -# Check what values the between-subjects factors actually have - -print(unique(long_data_clean$TEMPORAL_DO)) - -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination - - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination - - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced (each participant should contribute exactly one observation per cell) -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance == 1)) { - print("Design is balanced: each participant has exactly one observation for every TIME × DOMAIN × INTERVAL combination") -} else { - print("Warning: Design is unbalanced (missing or duplicated cells)") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - - - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = 
ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity - -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, 
dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - - - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) - -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans - - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME - -time_emmeans <- emmeans(aov_model, ~ TIME) - -print(time_emmeans) - -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN - -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) - -print(domain_emmeans) - -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL - -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) - -print(interval_emmeans) - -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO - -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO - -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - - - -# Main Effect of TIME (if significant) - -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) - -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: 
%.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) - -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) - -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) - -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) - -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - - - -# First, identify which interactions are significant -significant_interactions <- anova_output[anova_output$p < 0.05 & grepl(":", anova_output$Effect), ] - -if(nrow(significant_interactions) > 0) { - print("Significant interactions found:") - print(significant_interactions[, c("Effect", "p")]) - - # TIME × DOMAIN interaction (if significant) - if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { - print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") - 
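# --- Editor's aside (sketch, version-dependent): emmeans::joint_tests() can
# report the simple-effect F-tests directly, complementing the pairwise
# contrasts printed below; whether it handles this aovlist model depends on the
# installed emmeans version, so treat it as a suggestion rather than the
# script's method.
joint_tests(aov_model, by = "TIME")   # F-test of DOMAIN within each TIME level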
time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) - print("Estimated Marginal Means:") - print(time_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TIME:") - time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") - print(time_domain_simple) - - print("\nSimple Effects of TIME within each DOMAIN:") - time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(time_domain_simple2) - } - - # TIME × INTERVAL interaction (if significant) - if("TIME:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL"] < 0.05) { - print("\n=== TIME × INTERVAL INTERACTION (SIGNIFICANT) ===") - time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) - print("Estimated Marginal Means:") - print(time_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TIME:") - time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") - print(time_interval_simple) - - print("\nSimple Effects of TIME within each INTERVAL:") - time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(time_interval_simple2) - } - - # DOMAIN × INTERVAL interaction (if significant) - if("DOMAIN:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "DOMAIN:INTERVAL"] < 0.05) { - print("\n=== DOMAIN × INTERVAL INTERACTION (SIGNIFICANT) ===") - domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) - print("Estimated Marginal Means:") - print(domain_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each DOMAIN:") - domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(domain_interval_simple) - - print("\nSimple Effects of DOMAIN within each INTERVAL:") - domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(domain_interval_simple2) - } - - # TEMPORAL_DO × TIME interaction (if significant) - if("TEMPORAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:TIME"] < 0.05) { - print("\n=== TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - print("Estimated Marginal Means:") - print(temporal_time_emmeans) - - print("\nSimple Effects of TIME within each TEMPORAL_DO:") - temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_time_simple) - - print("\nSimple Effects of TEMPORAL_DO within each TIME:") - temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") - print(temporal_time_simple2) - } - - # TEMPORAL_DO × DOMAIN interaction (if significant) - if("TEMPORAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:DOMAIN"] < 0.05) { - print("\n=== TEMPORAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(temporal_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") - temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_domain_simple) - - print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") - temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(temporal_domain_simple2) - } - - # TEMPORAL_DO × INTERVAL 
interaction (if significant) - if("TEMPORAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:INTERVAL"] < 0.05) { - print("\n=== TEMPORAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - temporal_interval_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(temporal_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TEMPORAL_DO:") - temporal_interval_simple <- pairs(temporal_interval_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_interval_simple) - - print("\nSimple Effects of TEMPORAL_DO within each INTERVAL:") - temporal_interval_simple2 <- pairs(temporal_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(temporal_interval_simple2) - } - - # INTERVAL_DO × TIME interaction (if significant) - if("INTERVAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:TIME"] < 0.05) { - print("\n=== INTERVAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - interval_do_time_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * TIME) - print("Estimated Marginal Means:") - print(interval_do_time_emmeans) - - print("\nSimple Effects of TIME within each INTERVAL_DO:") - interval_do_time_simple <- pairs(interval_do_time_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_time_simple) - - print("\nSimple Effects of INTERVAL_DO within each TIME:") - interval_do_time_simple2 <- pairs(interval_do_time_emmeans, by = "TIME", adjust = "bonferroni") - print(interval_do_time_simple2) - } - - # INTERVAL_DO × DOMAIN interaction (if significant) - if("INTERVAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:DOMAIN"] < 0.05) { - print("\n=== INTERVAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - interval_do_domain_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(interval_do_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each INTERVAL_DO:") - interval_do_domain_simple <- pairs(interval_do_domain_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_domain_simple) - - print("\nSimple Effects of INTERVAL_DO within each DOMAIN:") - interval_do_domain_simple2 <- pairs(interval_do_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(interval_do_domain_simple2) - } - - # INTERVAL_DO × INTERVAL interaction (if significant) - if("INTERVAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:INTERVAL"] < 0.05) { - print("\n=== INTERVAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - interval_do_interval_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(interval_do_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each INTERVAL_DO:") - interval_do_interval_simple <- pairs(interval_do_interval_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_interval_simple) - - print("\nSimple Effects of INTERVAL_DO within each INTERVAL:") - interval_do_interval_simple2 <- pairs(interval_do_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(interval_do_interval_simple2) - } - - # THREE-WAY INTERACTION: TIME × INTERVAL × TEMPORAL_DO - print("\n=== TIME × INTERVAL × TEMPORAL_DO THREE-WAY INTERACTION ===") - three_way_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL * TEMPORAL_DO) - print("Estimated Marginal Means:") - print(three_way_emmeans) - - print("\nPast vs Future contrasts within each 
INTERVAL × TEMPORAL_DO combination:") - three_way_contrasts <- pairs(three_way_emmeans, by = c("INTERVAL", "TEMPORAL_DO"), adjust = "bonferroni") - print(three_way_contrasts) - - # Calculate Cohen's d for Past vs Future contrasts - three_way_contrasts_df <- as.data.frame(three_way_contrasts) - - print("\n=== COHEN'S D FOR TIME CONTRASTS IN THREE-WAY INTERACTION ===") - print("Past vs Future contrasts for all INTERVAL × TEMPORAL_DO combinations:") - - for(i in seq_len(nrow(three_way_contrasts_df))) { - comparison <- three_way_contrasts_df[i, ] - - # Extract the grouping variables - interval_level <- as.character(comparison$INTERVAL) - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - - # Get data for Past and Future within this INTERVAL × TEMPORAL_DO combination - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$INTERVAL == interval_level & - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$INTERVAL == interval_level & - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("INTERVAL = %s, TEMPORAL_DO = %s:\n", interval_level, temporal_do_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } - -} else { - print("No significant interactions found.") - print("All interaction p-values:") - interaction_effects <- anova_output[grepl(":", anova_output$Effect), ] - print(interaction_effects[, c("Effect", "p")]) -} - - - -# ============================================================================= -# INTERACTION PLOT: TEMPORAL_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - - - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for TEMPORAL_DO × TIME × INTERVAL -emm_3way <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way <- emm_3way %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1) + - labs( - x = "Order", - y = "Mean absolute difference from present", - title = "TEMPORAL_DO × TIME × INTERVAL Interaction (Estimated 
Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Emmeans + Error Bars) -# ============================================================================= - -# Prepare emmeans data frame for TIME main effect -time_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with the 3-way interaction plot) -time_main_plot <- ggplot(time_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251008192910.r b/.history/eohi2/mixed anova - domain means_20251008192910.r deleted file mode 100644 index 91f7c4a..0000000 --- a/.history/eohi2/mixed anova - domain means_20251008192910.r +++ /dev/null @@ -1,927 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. 
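# Editor's note on the variable names listed here: the "X" prefix in
# required_vars below (e.g. X5.10past_pref_MEAN) is added by read.csv()'s
# default check.names = TRUE, because R names cannot begin with a digit;
# make.names("5.10past_pref_MEAN") returns "X5.10past_pref_MEAN".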
-# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN", - "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - - -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, 
na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - - -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - - -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - - -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors - - -# Check what values the between-subjects factors actually have - -print(unique(long_data_clean$TEMPORAL_DO)) - -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination - - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination - - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) 
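# --- Editorial sketch (hedged), following up on the Hartley F-max ratios
# computed earlier in the assumption-testing section. Hartley's test has its
# own critical-value tables, but a common rule of thumb for roughly equal
# group sizes treats ratios above ~3 as a warning sign. The cutoff of 3 below
# is an assumption for illustration, not part of the original analysis.
fmax_cutoff <- 3  # assumed rule-of-thumb threshold, not a formal critical value

flagged_temporal <- observed_temporal_ratios %>%
  filter(f_max_ratio > fmax_cutoff)  # cells where TEMPORAL_DO group variances diverge
print(flagged_temporal)

flagged_interval <- observed_interval_ratios %>%
  filter(f_max_ratio > fmax_cutoff)  # cells where INTERVAL_DO group variances diverge
print(flagged_interval)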
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - - - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity - -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect 
== "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - - - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) - -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans - - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME - -time_emmeans <- emmeans(aov_model, ~ TIME) - -print(time_emmeans) - -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN - -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) - -print(domain_emmeans) - -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL - -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) - -print(interval_emmeans) - -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO - -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") 
-print(temporal_contrasts) - -# Main effect of INTERVAL_DO - -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - - - -# Main Effect of TIME (if significant) - -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) - -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) - -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) - -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) - -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) - -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - 
print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - - - -# First, identify which interactions are significant -significant_interactions <- anova_output[anova_output$p < 0.05 & grepl(":", anova_output$Effect), ] - -if(nrow(significant_interactions) > 0) { - print("Significant interactions found:") - print(significant_interactions[, c("Effect", "p")]) - - # TIME × DOMAIN interaction (if significant) - if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { - print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") - time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) - print("Estimated Marginal Means:") - print(time_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TIME:") - time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") - print(time_domain_simple) - - print("\nSimple Effects of TIME within each DOMAIN:") - time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(time_domain_simple2) - } - - # TIME × INTERVAL interaction (if significant) - if("TIME:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL"] < 0.05) { - print("\n=== TIME × INTERVAL INTERACTION (SIGNIFICANT) ===") - time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) - print("Estimated Marginal Means:") - print(time_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TIME:") - time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") - print(time_interval_simple) - - print("\nSimple Effects of TIME within each INTERVAL:") - time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(time_interval_simple2) - } - - # DOMAIN × INTERVAL interaction (if significant) - if("DOMAIN:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "DOMAIN:INTERVAL"] < 0.05) { - print("\n=== DOMAIN × INTERVAL INTERACTION (SIGNIFICANT) ===") - domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) - print("Estimated Marginal Means:") - print(domain_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each DOMAIN:") - domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(domain_interval_simple) - - print("\nSimple Effects of DOMAIN within each INTERVAL:") - domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(domain_interval_simple2) - } - - # TEMPORAL_DO × TIME interaction (if significant) - if("TEMPORAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:TIME"] < 0.05) { - print("\n=== TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - print("Estimated Marginal Means:") - print(temporal_time_emmeans) - - print("\nSimple Effects of TIME within each TEMPORAL_DO:") - temporal_time_simple <- 
pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_time_simple) - - print("\nSimple Effects of TEMPORAL_DO within each TIME:") - temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") - print(temporal_time_simple2) - } - - # TEMPORAL_DO × DOMAIN interaction (if significant) - if("TEMPORAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:DOMAIN"] < 0.05) { - print("\n=== TEMPORAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(temporal_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") - temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_domain_simple) - - print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") - temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(temporal_domain_simple2) - } - - # TEMPORAL_DO × INTERVAL interaction (if significant) - if("TEMPORAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:INTERVAL"] < 0.05) { - print("\n=== TEMPORAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - temporal_interval_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(temporal_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TEMPORAL_DO:") - temporal_interval_simple <- pairs(temporal_interval_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_interval_simple) - - print("\nSimple Effects of TEMPORAL_DO within each INTERVAL:") - temporal_interval_simple2 <- pairs(temporal_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(temporal_interval_simple2) - } - - # INTERVAL_DO × TIME interaction (if significant) - if("INTERVAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:TIME"] < 0.05) { - print("\n=== INTERVAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - interval_do_time_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * TIME) - print("Estimated Marginal Means:") - print(interval_do_time_emmeans) - - print("\nSimple Effects of TIME within each INTERVAL_DO:") - interval_do_time_simple <- pairs(interval_do_time_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_time_simple) - - print("\nSimple Effects of INTERVAL_DO within each TIME:") - interval_do_time_simple2 <- pairs(interval_do_time_emmeans, by = "TIME", adjust = "bonferroni") - print(interval_do_time_simple2) - } - - # INTERVAL_DO × DOMAIN interaction (if significant) - if("INTERVAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:DOMAIN"] < 0.05) { - print("\n=== INTERVAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - interval_do_domain_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(interval_do_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each INTERVAL_DO:") - interval_do_domain_simple <- pairs(interval_do_domain_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_domain_simple) - - print("\nSimple Effects of INTERVAL_DO within each DOMAIN:") - interval_do_domain_simple2 <- pairs(interval_do_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(interval_do_domain_simple2) - } - - # INTERVAL_DO × INTERVAL interaction (if significant) 
- if("INTERVAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:INTERVAL"] < 0.05) { - print("\n=== INTERVAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - interval_do_interval_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(interval_do_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each INTERVAL_DO:") - interval_do_interval_simple <- pairs(interval_do_interval_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_interval_simple) - - print("\nSimple Effects of INTERVAL_DO within each INTERVAL:") - interval_do_interval_simple2 <- pairs(interval_do_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(interval_do_interval_simple2) - } - - # THREE-WAY INTERACTION: TIME × INTERVAL × TEMPORAL_DO - print("\n=== TIME × INTERVAL × TEMPORAL_DO THREE-WAY INTERACTION ===") - three_way_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL * TEMPORAL_DO) - print("Estimated Marginal Means:") - print(three_way_emmeans) - - print("\nPast vs Future contrasts within each INTERVAL × TEMPORAL_DO combination:") - three_way_contrasts <- pairs(three_way_emmeans, by = c("INTERVAL", "TEMPORAL_DO"), adjust = "bonferroni") - print(three_way_contrasts) - - # Calculate Cohen's d for Past vs Future contrasts - three_way_contrasts_df <- as.data.frame(three_way_contrasts) - - print("\n=== COHEN'S D FOR TIME CONTRASTS IN THREE-WAY INTERACTION ===") - print("Past vs Future contrasts for all INTERVAL × TEMPORAL_DO combinations:") - - for(i in seq_len(nrow(three_way_contrasts_df))) { - comparison <- three_way_contrasts_df[i, ] - - # Extract the grouping variables - interval_level <- as.character(comparison$INTERVAL) - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - - # Get data for Past and Future within this INTERVAL × TEMPORAL_DO combination - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$INTERVAL == interval_level & - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$INTERVAL == interval_level & - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("INTERVAL = %s, TEMPORAL_DO = %s:\n", interval_level, temporal_do_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } - -} else { - print("No significant interactions found.") - print("All interaction p-values:") - interaction_effects <- anova_output[grepl(":", anova_output$Effect), ] - print(interaction_effects[, c("Effect", "p")]) -} - - - -# ============================================================================= -# INTERACTION PLOT: TEMPORAL_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - - - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for TEMPORAL_DO × TIME × INTERVAL -emm_3way <- 
emmeans(aov_model, ~ TEMPORAL_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way <- emm_3way %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1, labeller = labeller(INTERVAL = c( - "5" = "Present v. 5 Years", - "10" = "Present v. 10 Years", - "5_10" = "5 Years v. 10 Years" - ))) + - labs( - x = "Order", - y = "Mean absolute deviation", - title = "TEMPORAL_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Emmeans + Error Bars) -# ============================================================================= - -# Prepare emmeans data frame for TIME main effect -time_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with the 3-way interaction plot) -time_main_plot <- ggplot(time_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust 
= 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251008192916.r b/.history/eohi2/mixed anova - domain means_20251008192916.r deleted file mode 100644 index 91f7c4a..0000000 --- a/.history/eohi2/mixed anova - domain means_20251008192916.r +++ /dev/null @@ -1,927 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. -# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN", - "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - - -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - 
DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - - -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - - -print(outlier_summary) - -# 3. 
Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - - -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors - - -# Check what values the between-subjects factors actually have - -print(unique(long_data_clean$TEMPORAL_DO)) - -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination - - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination - - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", 
length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - - - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity - -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == 
effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - - - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) - -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans - - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME - -time_emmeans <- emmeans(aov_model, ~ TIME) - -print(time_emmeans) - -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN - -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) - 
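# --- Editorial sketch (hedged): the cohen.d() calls further below compare the
# Past and Future observations as if they were independent samples, pooling
# across domains and intervals. Because TIME is a within-subjects factor, a
# paired effect size (Cohen's d_z on per-participant Past minus Future means)
# is one alternative; this sketch is an illustration, not part of the
# original script.
time_diffs <- long_data_clean %>%
  group_by(ResponseId, TIME) %>%
  summarise(m = mean(MEAN_DIFFERENCE), .groups = "drop") %>%  # one mean per participant per TIME level
  pivot_wider(names_from = TIME, values_from = m) %>%
  mutate(diff = Past - Future)  # within-person Past-Future difference

d_z <- mean(time_diffs$diff, na.rm = TRUE) / sd(time_diffs$diff, na.rm = TRUE)
print(sprintf("Paired Cohen's d_z for TIME (averaged over DOMAIN and INTERVAL): %.5f", d_z))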
-print(domain_emmeans) - -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL - -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) - -print(interval_emmeans) - -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO - -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO - -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - - - -# Main Effect of TIME (if significant) - -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) - -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: %.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) - -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) - -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) - -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) - -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- 
as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - - - -# First, identify which interactions are significant -significant_interactions <- anova_output[anova_output$p < 0.05 & grepl(":", anova_output$Effect), ] - -if(nrow(significant_interactions) > 0) { - print("Significant interactions found:") - print(significant_interactions[, c("Effect", "p")]) - - # TIME × DOMAIN interaction (if significant) - if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { - print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") - time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) - print("Estimated Marginal Means:") - print(time_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TIME:") - time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") - print(time_domain_simple) - - print("\nSimple Effects of TIME within each DOMAIN:") - time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(time_domain_simple2) - } - - # TIME × INTERVAL interaction (if significant) - if("TIME:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL"] < 0.05) { - print("\n=== TIME × INTERVAL INTERACTION (SIGNIFICANT) ===") - time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) - print("Estimated Marginal Means:") - print(time_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TIME:") - time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") - print(time_interval_simple) - - print("\nSimple Effects of TIME within each INTERVAL:") - time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(time_interval_simple2) - } - - # DOMAIN × INTERVAL interaction (if significant) - if("DOMAIN:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "DOMAIN:INTERVAL"] < 0.05) { - print("\n=== DOMAIN × INTERVAL INTERACTION (SIGNIFICANT) ===") - domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) - print("Estimated Marginal Means:") - print(domain_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each DOMAIN:") - domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(domain_interval_simple) - - print("\nSimple Effects of DOMAIN within each INTERVAL:") - domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(domain_interval_simple2) - } - - # 
TEMPORAL_DO × TIME interaction (if significant) - if("TEMPORAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:TIME"] < 0.05) { - print("\n=== TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - print("Estimated Marginal Means:") - print(temporal_time_emmeans) - - print("\nSimple Effects of TIME within each TEMPORAL_DO:") - temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_time_simple) - - print("\nSimple Effects of TEMPORAL_DO within each TIME:") - temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") - print(temporal_time_simple2) - } - - # TEMPORAL_DO × DOMAIN interaction (if significant) - if("TEMPORAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:DOMAIN"] < 0.05) { - print("\n=== TEMPORAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(temporal_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") - temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_domain_simple) - - print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") - temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(temporal_domain_simple2) - } - - # TEMPORAL_DO × INTERVAL interaction (if significant) - if("TEMPORAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:INTERVAL"] < 0.05) { - print("\n=== TEMPORAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - temporal_interval_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(temporal_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TEMPORAL_DO:") - temporal_interval_simple <- pairs(temporal_interval_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_interval_simple) - - print("\nSimple Effects of TEMPORAL_DO within each INTERVAL:") - temporal_interval_simple2 <- pairs(temporal_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(temporal_interval_simple2) - } - - # INTERVAL_DO × TIME interaction (if significant) - if("INTERVAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:TIME"] < 0.05) { - print("\n=== INTERVAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - interval_do_time_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * TIME) - print("Estimated Marginal Means:") - print(interval_do_time_emmeans) - - print("\nSimple Effects of TIME within each INTERVAL_DO:") - interval_do_time_simple <- pairs(interval_do_time_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_time_simple) - - print("\nSimple Effects of INTERVAL_DO within each TIME:") - interval_do_time_simple2 <- pairs(interval_do_time_emmeans, by = "TIME", adjust = "bonferroni") - print(interval_do_time_simple2) - } - - # INTERVAL_DO × DOMAIN interaction (if significant) - if("INTERVAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:DOMAIN"] < 0.05) { - print("\n=== INTERVAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - interval_do_domain_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(interval_do_domain_emmeans) - - 
print("\nSimple Effects of DOMAIN within each INTERVAL_DO:") - interval_do_domain_simple <- pairs(interval_do_domain_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_domain_simple) - - print("\nSimple Effects of INTERVAL_DO within each DOMAIN:") - interval_do_domain_simple2 <- pairs(interval_do_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(interval_do_domain_simple2) - } - - # INTERVAL_DO × INTERVAL interaction (if significant) - if("INTERVAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:INTERVAL"] < 0.05) { - print("\n=== INTERVAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - interval_do_interval_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(interval_do_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each INTERVAL_DO:") - interval_do_interval_simple <- pairs(interval_do_interval_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_interval_simple) - - print("\nSimple Effects of INTERVAL_DO within each INTERVAL:") - interval_do_interval_simple2 <- pairs(interval_do_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(interval_do_interval_simple2) - } - - # THREE-WAY INTERACTION: TIME × INTERVAL × TEMPORAL_DO - print("\n=== TIME × INTERVAL × TEMPORAL_DO THREE-WAY INTERACTION ===") - three_way_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL * TEMPORAL_DO) - print("Estimated Marginal Means:") - print(three_way_emmeans) - - print("\nPast vs Future contrasts within each INTERVAL × TEMPORAL_DO combination:") - three_way_contrasts <- pairs(three_way_emmeans, by = c("INTERVAL", "TEMPORAL_DO"), adjust = "bonferroni") - print(three_way_contrasts) - - # Calculate Cohen's d for Past vs Future contrasts - three_way_contrasts_df <- as.data.frame(three_way_contrasts) - - print("\n=== COHEN'S D FOR TIME CONTRASTS IN THREE-WAY INTERACTION ===") - print("Past vs Future contrasts for all INTERVAL × TEMPORAL_DO combinations:") - - for(i in seq_len(nrow(three_way_contrasts_df))) { - comparison <- three_way_contrasts_df[i, ] - - # Extract the grouping variables - interval_level <- as.character(comparison$INTERVAL) - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - - # Get data for Past and Future within this INTERVAL × TEMPORAL_DO combination - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$INTERVAL == interval_level & - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$INTERVAL == interval_level & - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("INTERVAL = %s, TEMPORAL_DO = %s:\n", interval_level, temporal_do_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } - -} else { - print("No significant interactions found.") - print("All interaction p-values:") - interaction_effects <- anova_output[grepl(":", 
anova_output$Effect), ] - print(interaction_effects[, c("Effect", "p")]) -} - - - -# ============================================================================= -# INTERACTION PLOT: TEMPORAL_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - - - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for TEMPORAL_DO × TIME × INTERVAL -emm_3way <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way <- emm_3way %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1, labeller = labeller(INTERVAL = c( - "5" = "Present v. 5 Years", - "10" = "Present v. 10 Years", - "5_10" = "5 Years v. 10 Years" - ))) + - labs( - x = "Order", - y = "Mean absolute deviation", - title = "TEMPORAL_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Emmeans + Error Bars) -# ============================================================================= - -# Prepare emmeans data frame for TIME main effect -time_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with the 3-way interaction plot) -time_main_plot <- ggplot(time_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute 
difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251008192921.r b/.history/eohi2/mixed anova - domain means_20251008192921.r deleted file mode 100644 index 91f7c4a..0000000 --- a/.history/eohi2/mixed anova - domain means_20251008192921.r +++ /dev/null @@ -1,927 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. -# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN", - "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), 
rep("5_10", 3)), - stringsAsFactors = FALSE -) - - -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - - -print(missing_summary) - -# 2. 
Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - - -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - - -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_interval) - -# 5. 
Hartley's F-max test for between-subjects factors - - -# Check what values the between-subjects factors actually have - -print(unique(long_data_clean$TEMPORAL_DO)) - -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination - - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination - - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) -print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - - - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = 
ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity - -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect == "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, 
dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - - - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) - -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans - - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME - -time_emmeans <- emmeans(aov_model, ~ TIME) - -print(time_emmeans) - -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN - -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) - -print(domain_emmeans) - -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL - -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) - -print(interval_emmeans) - -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO - -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") -print(temporal_contrasts) - -# Main effect of INTERVAL_DO - -interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO) -interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni") -print(interval_do_contrasts) - -# ============================================================================= -# COHEN'S D FOR MAIN EFFECTS -# ============================================================================= - - - -# Main Effect of TIME (if significant) - -time_main_contrast <- pairs(time_emmeans, adjust = "none") -time_main_df <- as.data.frame(time_main_contrast) - -print(time_main_df) - -# Calculate Cohen's d for TIME main effect -if(nrow(time_main_df) > 0) { - print("\nCohen's d for TIME main effect:") - time_past_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - time_cohens_d <- cohen.d(time_past_data, time_future_data) - print(sprintf("Past vs Future: n1 = %d, n2 = %d", length(time_past_data), length(time_future_data))) - print(sprintf("Cohen's d: 
%.5f", time_cohens_d$estimate)) - print(sprintf("Effect size interpretation: %s", time_cohens_d$magnitude)) - print(sprintf("p-value: %.5f", time_main_df$p.value[1])) -} - -# Main Effect of DOMAIN (if significant) - -domain_main_contrast <- pairs(domain_emmeans, adjust = "bonferroni") -domain_main_df <- as.data.frame(domain_main_contrast) - -print(domain_main_df) - -# Calculate Cohen's d for significant DOMAIN contrasts -significant_domain <- domain_main_df[domain_main_df$p.value < 0.05, ] -if(nrow(significant_domain) > 0) { - print("\nCohen's d for significant DOMAIN contrasts:") - for(i in seq_len(nrow(significant_domain))) { - contrast_name <- as.character(significant_domain$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$DOMAIN == level2] - - if(length(data1) > 0 && length(data2) > 0) { - domain_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", domain_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", domain_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_domain$p.value[i])) - print("") - } - } - } -} - -# Main Effect of INTERVAL (if significant) - -interval_main_contrast <- pairs(interval_emmeans, adjust = "bonferroni") -interval_main_df <- as.data.frame(interval_main_contrast) - -print(interval_main_df) - -# Calculate Cohen's d for significant INTERVAL contrasts -significant_interval <- interval_main_df[interval_main_df$p.value < 0.05, ] -if(nrow(significant_interval) > 0) { - print("\nCohen's d for significant INTERVAL contrasts:") - for(i in seq_len(nrow(significant_interval))) { - contrast_name <- as.character(significant_interval$contrast[i]) - contrast_parts <- strsplit(contrast_name, " - ")[[1]] - if(length(contrast_parts) == 2) { - level1 <- trimws(contrast_parts[1]) - level2 <- trimws(contrast_parts[2]) - - data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level1] - data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$INTERVAL == level2] - - if(length(data1) > 0 && length(data2) > 0) { - interval_cohens_d <- cohen.d(data1, data2) - print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - - - -# First, identify which interactions are significant -significant_interactions <- anova_output[anova_output$p < 0.05 & grepl(":", anova_output$Effect), ] - -if(nrow(significant_interactions) > 0) { - print("Significant interactions found:") - print(significant_interactions[, c("Effect", "p")]) - - # TIME × DOMAIN interaction (if significant) - if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { - print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") - 
time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) - print("Estimated Marginal Means:") - print(time_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TIME:") - time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") - print(time_domain_simple) - - print("\nSimple Effects of TIME within each DOMAIN:") - time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(time_domain_simple2) - } - - # TIME × INTERVAL interaction (if significant) - if("TIME:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL"] < 0.05) { - print("\n=== TIME × INTERVAL INTERACTION (SIGNIFICANT) ===") - time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) - print("Estimated Marginal Means:") - print(time_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TIME:") - time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") - print(time_interval_simple) - - print("\nSimple Effects of TIME within each INTERVAL:") - time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(time_interval_simple2) - } - - # DOMAIN × INTERVAL interaction (if significant) - if("DOMAIN:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "DOMAIN:INTERVAL"] < 0.05) { - print("\n=== DOMAIN × INTERVAL INTERACTION (SIGNIFICANT) ===") - domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) - print("Estimated Marginal Means:") - print(domain_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each DOMAIN:") - domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(domain_interval_simple) - - print("\nSimple Effects of DOMAIN within each INTERVAL:") - domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(domain_interval_simple2) - } - - # TEMPORAL_DO × TIME interaction (if significant) - if("TEMPORAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:TIME"] < 0.05) { - print("\n=== TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - print("Estimated Marginal Means:") - print(temporal_time_emmeans) - - print("\nSimple Effects of TIME within each TEMPORAL_DO:") - temporal_time_simple <- pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_time_simple) - - print("\nSimple Effects of TEMPORAL_DO within each TIME:") - temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") - print(temporal_time_simple2) - } - - # TEMPORAL_DO × DOMAIN interaction (if significant) - if("TEMPORAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:DOMAIN"] < 0.05) { - print("\n=== TEMPORAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(temporal_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") - temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_domain_simple) - - print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") - temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(temporal_domain_simple2) - } - - # TEMPORAL_DO × INTERVAL 
interaction (if significant) - if("TEMPORAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:INTERVAL"] < 0.05) { - print("\n=== TEMPORAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - temporal_interval_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(temporal_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TEMPORAL_DO:") - temporal_interval_simple <- pairs(temporal_interval_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_interval_simple) - - print("\nSimple Effects of TEMPORAL_DO within each INTERVAL:") - temporal_interval_simple2 <- pairs(temporal_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(temporal_interval_simple2) - } - - # INTERVAL_DO × TIME interaction (if significant) - if("INTERVAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:TIME"] < 0.05) { - print("\n=== INTERVAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - interval_do_time_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * TIME) - print("Estimated Marginal Means:") - print(interval_do_time_emmeans) - - print("\nSimple Effects of TIME within each INTERVAL_DO:") - interval_do_time_simple <- pairs(interval_do_time_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_time_simple) - - print("\nSimple Effects of INTERVAL_DO within each TIME:") - interval_do_time_simple2 <- pairs(interval_do_time_emmeans, by = "TIME", adjust = "bonferroni") - print(interval_do_time_simple2) - } - - # INTERVAL_DO × DOMAIN interaction (if significant) - if("INTERVAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:DOMAIN"] < 0.05) { - print("\n=== INTERVAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - interval_do_domain_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(interval_do_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each INTERVAL_DO:") - interval_do_domain_simple <- pairs(interval_do_domain_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_domain_simple) - - print("\nSimple Effects of INTERVAL_DO within each DOMAIN:") - interval_do_domain_simple2 <- pairs(interval_do_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(interval_do_domain_simple2) - } - - # INTERVAL_DO × INTERVAL interaction (if significant) - if("INTERVAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:INTERVAL"] < 0.05) { - print("\n=== INTERVAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - interval_do_interval_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(interval_do_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each INTERVAL_DO:") - interval_do_interval_simple <- pairs(interval_do_interval_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_interval_simple) - - print("\nSimple Effects of INTERVAL_DO within each INTERVAL:") - interval_do_interval_simple2 <- pairs(interval_do_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(interval_do_interval_simple2) - } - - # THREE-WAY INTERACTION: TIME × INTERVAL × TEMPORAL_DO - print("\n=== TIME × INTERVAL × TEMPORAL_DO THREE-WAY INTERACTION ===") - three_way_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL * TEMPORAL_DO) - print("Estimated Marginal Means:") - print(three_way_emmeans) - - print("\nPast vs Future contrasts within each 
INTERVAL × TEMPORAL_DO combination:") - three_way_contrasts <- pairs(three_way_emmeans, by = c("INTERVAL", "TEMPORAL_DO"), adjust = "bonferroni") - print(three_way_contrasts) - - # Calculate Cohen's d for Past vs Future contrasts - three_way_contrasts_df <- as.data.frame(three_way_contrasts) - - print("\n=== COHEN'S D FOR TIME CONTRASTS IN THREE-WAY INTERACTION ===") - print("Past vs Future contrasts for all INTERVAL × TEMPORAL_DO combinations:") - - for(i in seq_len(nrow(three_way_contrasts_df))) { - comparison <- three_way_contrasts_df[i, ] - - # Extract the grouping variables - interval_level <- as.character(comparison$INTERVAL) - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - - # Get data for Past and Future within this INTERVAL × TEMPORAL_DO combination - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$INTERVAL == interval_level & - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$INTERVAL == interval_level & - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("INTERVAL = %s, TEMPORAL_DO = %s:\n", interval_level, temporal_do_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } - -} else { - print("No significant interactions found.") - print("All interaction p-values:") - interaction_effects <- anova_output[grepl(":", anova_output$Effect), ] - print(interaction_effects[, c("Effect", "p")]) -} - - - -# ============================================================================= -# INTERACTION PLOT: TEMPORAL_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - - - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for TEMPORAL_DO × TIME × INTERVAL -emm_3way <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * INTERVAL) - -# Prepare emmeans data frame -emmeans_3way <- emm_3way %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")), - TIME = factor(TIME, levels = c("Past", "Future")), - INTERVAL = factor(INTERVAL), - x_pos = as.numeric(TEMPORAL_DO), - time_offset = (as.numeric(TIME) - 1.5) * 0.2, - x_dodged = x_pos + time_offset - ) - -# Create 3-way interaction plot with facets -interaction_plot_3way <- ggplot(emmeans_3way) + - geom_errorbar( - aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.1, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - facet_wrap(~INTERVAL, nrow = 1, labeller = labeller(INTERVAL = c( - "5" = "Present v. 5 Years", - "10" = "Present v. 10 Years", - "5_10" = "5 Years v. 
10 Years" - ))) + - labs( - x = "Order", - y = "Mean absolute deviation", - title = "TEMPORAL_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)" - ) + - scale_x_continuous( - breaks = c(1, 2), - labels = c("Past First", "Future First"), - limits = c(0.5, 2.5) - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5), - strip.text = element_text(size = 11, face = "bold") - ) - -print(interaction_plot_3way) - -# ============================================================================= -# MAIN EFFECT PLOT: TIME (Emmeans + Error Bars) -# ============================================================================= - -# Prepare emmeans data frame for TIME main effect -time_emm_df <- time_emmeans %>% - as.data.frame() %>% - filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>% - rename( - ci_lower = lower.CL, - ci_upper = upper.CL, - plot_mean = emmean - ) %>% - mutate( - TIME = factor(TIME, levels = c("Past", "Future")) - ) - -# Create TIME main-effect plot (style aligned with the 3-way interaction plot) -time_main_plot <- ggplot(time_emm_df) + - geom_errorbar( - aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME), - width = 0.15, - linewidth = 1, - alpha = 0.8 - ) + - geom_point( - aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME), - size = 5, - stroke = 1.2, - color = "black" - ) + - labs( - x = "Time", - y = "Mean absolute difference from present", - title = "Main Effect of TIME (Estimated Marginal Means)" - ) + - scale_color_manual(name = "Temporal Direction", values = time_colors) + - scale_fill_manual(name = "Temporal Direction", values = time_colors) + - scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) + - theme_minimal(base_size = 13) + - theme( - axis.text = element_text(size = 11), - axis.title = element_text(size = 12), - plot.title = element_text(size = 14, hjust = 0.5), - legend.position = "right", - legend.title = element_text(size = 11), - legend.text = element_text(size = 10), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank(), - panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5) - ) - -print(time_main_plot) \ No newline at end of file diff --git a/.history/eohi2/mixed anova - domain means_20251008192926.r b/.history/eohi2/mixed anova - domain means_20251008192926.r deleted file mode 100644 index 91f7c4a..0000000 --- a/.history/eohi2/mixed anova - domain means_20251008192926.r +++ /dev/null @@ -1,927 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means - EOHI2 -# EOHI Experiment Data Analysis - Domain Level Analysis with INTERVAL factor -# Variables: NPast_5_pref_MEAN, NPast_5_pers_MEAN, NPast_5_val_MEAN, etc. -# NFut_5_pref_MEAN, NFut_5_pers_MEAN, NFut_5_val_MEAN, etc. -# NPast_10_pref_MEAN, NPast_10_pers_MEAN, NPast_10_val_MEAN, etc. -# NFut_10_pref_MEAN, NFut_10_pers_MEAN, NFut_10_val_MEAN, etc. 
-# 5.10past_pref_MEAN, 5.10past_pers_MEAN, 5.10past_val_MEAN -# 5.10fut_pref_MEAN, 5.10fut_pers_MEAN, 5.10fut_val_MEAN - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(afex) # For aov_ez (cleaner ANOVA output) -library(nortest) # For normality tests -library(emmeans) # For post-hoc comparisons -library(purrr) # For map functions -library(effsize) # For Cohen's d calculations -library(effectsize) # For effect size calculations - -# Global options to remove scientific notation -options(scipen = 999) - -# Set contrasts to sum for mixed ANOVA (necessary for proper interpretation) -options(contrasts = c("contr.sum", "contr.poly")) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -data <- read.csv("eohi2.csv") - -# Display basic information about the dataset -print(paste("Dataset dimensions:", paste(dim(data), collapse = " x"))) -print(paste("Number of participants:", length(unique(data$ResponseId)))) - -# Verify the specific variables we need -required_vars <- c("NPast_5_pref_MEAN", "NPast_5_pers_MEAN", "NPast_5_val_MEAN", - "NPast_10_pref_MEAN", "NPast_10_pers_MEAN", "NPast_10_val_MEAN", - "NFut_5_pref_MEAN", "NFut_5_pers_MEAN", "NFut_5_val_MEAN", - "NFut_10_pref_MEAN", "NFut_10_pers_MEAN", "NFut_10_val_MEAN", - "X5.10past_pref_MEAN", "X5.10past_pers_MEAN", "X5.10past_val_MEAN", - "X5.10fut_pref_MEAN", "X5.10fut_pers_MEAN", "X5.10fut_val_MEAN") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping with TIME, DOMAIN, and INTERVAL factors -domain_mapping <- data.frame( - variable = required_vars, - time = c(rep("Past", 3), rep("Past", 3), rep("Future", 3), rep("Future", 3), - rep("Past", 3), rep("Future", 3)), - domain = rep(c("Preferences", "Personality", "Values"), 6), - interval = c(rep("5", 3), rep("10", 3), rep("5", 3), rep("10", 3), - rep("5_10", 3), rep("5_10", 3)), - stringsAsFactors = FALSE -) - - -print(domain_mapping) - -# Efficient data pivoting using pivot_longer -long_data <- data %>% - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, all_of(required_vars)) %>% - pivot_longer( - cols = all_of(required_vars), - names_to = "variable", - values_to = "MEAN_DIFFERENCE" - ) %>% - left_join(domain_mapping, by = "variable") %>% - # Convert to factors with proper levels - mutate( - TIME = factor(time, levels = c("Past", "Future")), - DOMAIN = factor(domain, levels = c("Preferences", "Personality", "Values")), - INTERVAL = factor(interval, levels = c("5", "10", "5_10")), - ResponseId = as.factor(ResponseId), - TEMPORAL_DO = as.factor(TEMPORAL_DO), - INTERVAL_DO = as.factor(INTERVAL_DO) - ) %>% - # Select final columns and remove any rows with missing values - select(ResponseId, TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL, MEAN_DIFFERENCE) %>% - filter(!is.na(MEAN_DIFFERENCE)) - -print(paste("Long data dimensions:", paste(dim(long_data), collapse = " x"))) -print(paste("Number of participants:", length(unique(long_data$ResponseId)))) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME, DOMAIN, and INTERVAL -desc_stats <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, 
na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_between <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO, TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_by_between) - -# Summary by between-subjects factors only -desc_stats_between_only <- long_data %>% - group_by(TEMPORAL_DO, INTERVAL_DO) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - - -print(desc_stats_between_only) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", paste(dim(long_data_clean), collapse = " x"))) - -# 1. Missing values check -missing_summary <- long_data %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - - -print(missing_summary) - -# 2. Outlier detection -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - - -print(outlier_summary) - -# 3. Anderson-Darling normality test -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - n = n(), - ad_statistic = ad.test(.data$MEAN_DIFFERENCE)$statistic, - ad_p_value = ad.test(.data$MEAN_DIFFERENCE)$p.value, - .groups = 'drop' - ) - - -# Round only the numeric columns -normality_results_rounded <- normality_results %>% - mutate(across(where(is.numeric), ~ round(.x, 5))) -print(normality_results_rounded) - -# 4. 
Homogeneity of variance (Levene's test) -# Test homogeneity across TIME within each DOMAIN × INTERVAL combination -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ TIME)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME × INTERVAL combination -homogeneity_domain <- long_data_clean %>% - group_by(TIME, INTERVAL) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_domain) - -# Test homogeneity across INTERVAL within each TIME × DOMAIN combination -homogeneity_interval <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - levene_F = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`F value`[1], - levene_p = leveneTest(MEAN_DIFFERENCE ~ INTERVAL)$`Pr(>F)`[1], - .groups = 'drop' - ) - - -print(homogeneity_interval) - -# 5. Hartley's F-max test for between-subjects factors - - -# Check what values the between-subjects factors actually have - -print(unique(long_data_clean$TEMPORAL_DO)) - -print(unique(long_data_clean$INTERVAL_DO)) - -# Function to calculate Hartley's F-max ratio -calculate_hartley_ratio <- function(variances) { - max(variances, na.rm = TRUE) / min(variances, na.rm = TRUE) -} - -# Hartley's F-max test across TEMPORAL_DO within each TIME × DOMAIN × INTERVAL combination - - -observed_temporal_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, INTERVAL) %>% - summarise( - # Calculate variances for each TEMPORAL_DO level within this combination - past_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "01PAST"], na.rm = TRUE), - fut_var = var(MEAN_DIFFERENCE[TEMPORAL_DO == "02FUT"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(past_var, fut_var) / min(past_var, fut_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, INTERVAL, past_var, fut_var, f_max_ratio) - -print(observed_temporal_ratios) - -# Hartley's F-max test across INTERVAL_DO within each TIME × DOMAIN × TEMPORAL_DO combination - - -observed_interval_ratios <- long_data_clean %>% - group_by(TIME, DOMAIN, TEMPORAL_DO) %>% - summarise( - # Calculate variances for each INTERVAL_DO level within this combination - int5_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "5"], na.rm = TRUE), - int10_var = var(MEAN_DIFFERENCE[INTERVAL_DO == "10"], na.rm = TRUE), - # Calculate F-max ratio - f_max_ratio = max(int5_var, int10_var) / min(int5_var, int10_var), - .groups = 'drop' - ) %>% - select(TIME, DOMAIN, TEMPORAL_DO, int5_var, int10_var, f_max_ratio) - -print(observed_interval_ratios) - -# ============================================================================= -# MIXED ANOVA ANALYSIS -# ============================================================================= - -# Check data dimensions and structure -print(paste("Data size for ANOVA:", nrow(long_data_clean), "rows")) -print(paste("Number of participants:", length(unique(long_data_clean$ResponseId)))) -print(paste("Design factors: TIME (", length(levels(long_data_clean$TIME)), "), DOMAIN (", - length(levels(long_data_clean$DOMAIN)), "), INTERVAL (", - length(levels(long_data_clean$INTERVAL)), "), TEMPORAL_DO (", - length(levels(long_data_clean$TEMPORAL_DO)), "), INTERVAL_DO (", - length(levels(long_data_clean$INTERVAL_DO)), ")", sep = "")) - -# Check for complete cases -complete_cases <- sum(complete.cases(long_data_clean)) 
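Worth noting: calculate_hartley_ratio() is defined in these scripts but never called; the F-max ratios are recomputed inline with max()/min(). A sketch of how the helper could replace the inline arithmetic, using the same grouping columns as the script (the tapply() split is the only new piece):

long_data_clean %>%
  dplyr::group_by(TIME, DOMAIN, INTERVAL) %>%
  dplyr::summarise(
    # variance of MEAN_DIFFERENCE within each TEMPORAL_DO level, then max/min
    f_max_ratio = calculate_hartley_ratio(
      tapply(MEAN_DIFFERENCE, TEMPORAL_DO, var, na.rm = TRUE)
    ),
    .groups = "drop"
  )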
-print(paste("Complete cases:", complete_cases, "out of", nrow(long_data_clean))) - -# Check if design is balanced -design_balance <- table(long_data_clean$ResponseId, long_data_clean$TIME, long_data_clean$DOMAIN, long_data_clean$INTERVAL) -if(all(design_balance %in% c(0, 1))) { - print("Design is balanced: each participant has data for all TIME × DOMAIN × INTERVAL combinations") -} else { - print("Warning: Design is unbalanced") - print(summary(as.vector(design_balance))) -} - -# ============================================================================= -# MIXED ANOVA WITH SPHERICITY CORRECTIONS -# ============================================================================= - - - -# Mixed ANOVA using ezANOVA with automatic sphericity corrections -# Between-subjects: TEMPORAL_DO (2 levels: 01PAST, 02FUT) × INTERVAL_DO (2 levels: 5, 10) -# Within-subjects: TIME (2 levels: Past, Future) × DOMAIN (3 levels: Preferences, Personality, Values) × INTERVAL (3 levels: 5, 10, 5_10) - -mixed_anova_model <- ezANOVA(data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = ResponseId, - between = .(TEMPORAL_DO, INTERVAL_DO), - within = .(TIME, DOMAIN, INTERVAL), - type = 3, - detailed = TRUE) - - -anova_output <- mixed_anova_model$ANOVA -rownames(anova_output) <- NULL # Reset row numbers to be sequential -print(anova_output) - -# Show Mauchly's test for sphericity - -print(mixed_anova_model$Mauchly) - -# Show sphericity-corrected results (Greenhouse-Geisser and Huynh-Feldt) -if(!is.null(mixed_anova_model$`Sphericity Corrections`)) { - print("\nGreenhouse-Geisser and Huynh-Feldt Corrections:") - print(mixed_anova_model$`Sphericity Corrections`) - - # Extract and display corrected degrees of freedom - print("\n=== CORRECTED DEGREES OF FREEDOM ===") - - sphericity_corr <- mixed_anova_model$`Sphericity Corrections` - anova_table <- mixed_anova_model$ANOVA - - corrected_df <- data.frame( - Effect = sphericity_corr$Effect, - Original_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)], - Original_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)], - GG_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - GG_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$GGe, - HF_DFn = anova_table$DFn[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - HF_DFd = anova_table$DFd[match(sphericity_corr$Effect, anova_table$Effect)] * sphericity_corr$HFe, - GG_epsilon = sphericity_corr$GGe, - HF_epsilon = sphericity_corr$HFe - ) - - print(corrected_df) - - print("\n=== CORRECTED F-TESTS ===") - - # Between-subjects effects (no sphericity corrections needed) - print("\nBETWEEN-SUBJECTS EFFECTS:") - between_effects <- c("TEMPORAL_DO", "INTERVAL_DO", "TEMPORAL_DO:INTERVAL_DO") - for(effect in between_effects) { - if(effect %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == effect] - dfn <- anova_table$DFn[anova_table$Effect == effect] - dfd <- anova_table$DFd[anova_table$Effect == effect] - p_value <- anova_table$p[anova_table$Effect == effect] - - print(sprintf("%s: F(%d, %d) = %.3f, p = %.6f", effect, dfn, dfd, f_value, p_value)) - } - } - - # Within-subjects effects (sphericity corrections where applicable) - print("\nWITHIN-SUBJECTS EFFECTS:") - - # TIME main effect (2 levels, sphericity automatically satisfied) - if("TIME" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "TIME"] - dfn <- anova_table$DFn[anova_table$Effect 
== "TIME"] - dfd <- anova_table$DFd[anova_table$Effect == "TIME"] - p_value <- anova_table$p[anova_table$Effect == "TIME"] - print(sprintf("TIME: F(%d, %d) = %.3f, p = %.6f (2 levels, sphericity satisfied)", dfn, dfd, f_value, p_value)) - } - - # DOMAIN main effect (3 levels, needs sphericity correction) - if("DOMAIN" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "DOMAIN"] - dfn <- anova_table$DFn[anova_table$Effect == "DOMAIN"] - dfd <- anova_table$DFd[anova_table$Effect == "DOMAIN"] - p_value <- anova_table$p[anova_table$Effect == "DOMAIN"] - print(sprintf("DOMAIN: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # INTERVAL main effect (3 levels, needs sphericity correction) - if("INTERVAL" %in% anova_table$Effect) { - f_value <- anova_table$F[anova_table$Effect == "INTERVAL"] - dfn <- anova_table$DFn[anova_table$Effect == "INTERVAL"] - dfd <- anova_table$DFd[anova_table$Effect == "INTERVAL"] - p_value <- anova_table$p[anova_table$Effect == "INTERVAL"] - print(sprintf("INTERVAL: F(%d, %d) = %.3f, p = %.6f", dfn, dfd, f_value, p_value)) - } - - # Interactions with sphericity corrections - print("\nINTERACTIONS WITH SPHERICITY CORRECTIONS:") - for(i in seq_len(nrow(corrected_df))) { - effect <- corrected_df$Effect[i] - f_value <- anova_table$F[match(effect, anova_table$Effect)] - - print(sprintf("\n%s:", effect)) - print(sprintf(" Original: F(%d, %d) = %.3f", - corrected_df$Original_DFn[i], corrected_df$Original_DFd[i], f_value)) - print(sprintf(" GG-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$GG_DFn[i], corrected_df$GG_DFd[i], f_value, sphericity_corr$`p[GG]`[i])) - print(sprintf(" HF-corrected: F(%.2f, %.2f) = %.3f, p = %.6f", - corrected_df$HF_DFn[i], corrected_df$HF_DFd[i], f_value, sphericity_corr$`p[HF]`[i])) - } -} else { - print("\nNote: Sphericity corrections not needed (sphericity assumption met)") -} - -# ============================================================================= -# EFFECT SIZES (GENERALIZED ETA SQUARED) -# ============================================================================= - - - -# Extract generalized eta squared from ezANOVA (already calculated) -effect_sizes <- mixed_anova_model$ANOVA[, c("Effect", "ges")] -effect_sizes$ges <- round(effect_sizes$ges, 5) - -print(effect_sizes) - -# ============================================================================= -# POST-HOC COMPARISONS -# ============================================================================= - -# Post-hoc comparisons using emmeans - - -# Create aov model for emmeans (emmeans requires aov object, not ezANOVA output) -aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL + Error(ResponseId/(TIME * DOMAIN * INTERVAL)), - data = long_data_clean) - -# Main effect of TIME - -time_emmeans <- emmeans(aov_model, ~ TIME) - -print(time_emmeans) - -time_contrasts <- pairs(time_emmeans, adjust = "bonferroni") -print(time_contrasts) - -# Main effect of DOMAIN - -domain_emmeans <- emmeans(aov_model, ~ DOMAIN) - -print(domain_emmeans) - -domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni") -print(domain_contrasts) - -# Main effect of INTERVAL - -interval_emmeans <- emmeans(aov_model, ~ INTERVAL) - -print(interval_emmeans) - -interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni") -print(interval_contrasts) - -# Main effect of TEMPORAL_DO - -temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO) -temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni") 
-# =============================================================================
-# POST-HOC COMPARISONS
-# =============================================================================
-
-# Post-hoc comparisons using emmeans. emmeans needs an aov object rather than the
-# ezANOVA output, so refit the same design with aov().
-aov_model <- aov(MEAN_DIFFERENCE ~ TEMPORAL_DO * INTERVAL_DO * TIME * DOMAIN * INTERVAL +
-                   Error(ResponseId/(TIME * DOMAIN * INTERVAL)),
-                 data = long_data_clean)
-
-# Main effect of TIME
-time_emmeans <- emmeans(aov_model, ~ TIME)
-print(time_emmeans)
-time_contrasts <- pairs(time_emmeans, adjust = "bonferroni")
-print(time_contrasts)
-
-# Main effect of DOMAIN
-domain_emmeans <- emmeans(aov_model, ~ DOMAIN)
-print(domain_emmeans)
-domain_contrasts <- pairs(domain_emmeans, adjust = "bonferroni")
-print(domain_contrasts)
-
-# Main effect of INTERVAL
-interval_emmeans <- emmeans(aov_model, ~ INTERVAL)
-print(interval_emmeans)
-interval_contrasts <- pairs(interval_emmeans, adjust = "bonferroni")
-print(interval_contrasts)
-
-# Main effect of TEMPORAL_DO
-temporal_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO)
-temporal_contrasts <- pairs(temporal_emmeans, adjust = "bonferroni")
-print(temporal_contrasts)
-
-# Main effect of INTERVAL_DO
-interval_do_emmeans <- emmeans(aov_model, ~ INTERVAL_DO)
-interval_do_contrasts <- pairs(interval_do_emmeans, adjust = "bonferroni")
-print(interval_do_contrasts)
-
-# =============================================================================
-# COHEN'S D FOR MAIN EFFECTS
-# =============================================================================
-
-# Helper: Cohen's d (effsize::cohen.d) for each significant pairwise contrast of a
-# factor, pooling observations across the other factors
-report_cohens_d <- function(contrast_df, factor_name) {
-  significant <- contrast_df[contrast_df$p.value < 0.05, ]
-  if(nrow(significant) == 0) return(invisible(NULL))
-  cat(sprintf("\nCohen's d for significant %s contrasts:\n", factor_name))
-  for(i in seq_len(nrow(significant))) {
-    contrast_name  <- as.character(significant$contrast[i])
-    contrast_parts <- trimws(strsplit(contrast_name, " - ")[[1]])
-    if(length(contrast_parts) != 2) next
-    data1 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean[[factor_name]] == contrast_parts[1]]
-    data2 <- long_data_clean$MEAN_DIFFERENCE[long_data_clean[[factor_name]] == contrast_parts[2]]
-    if(length(data1) > 0 && length(data2) > 0) {
-      d <- cohen.d(data1, data2)
-      cat(sprintf("Comparison: %s\n", contrast_name))
-      cat(sprintf("  n1 = %d, n2 = %d\n", length(data1), length(data2)))
-      cat(sprintf("  Cohen's d: %.5f\n", d$estimate))
-      cat(sprintf("  Effect size interpretation: %s\n", d$magnitude))
-      cat(sprintf("  p-value: %.5f\n\n", significant$p.value[i]))
-    }
-  }
-}
-
-# Main effect of TIME (two levels, so a single unadjusted contrast)
-time_main_df <- as.data.frame(pairs(time_emmeans, adjust = "none"))
-print(time_main_df)
-if(nrow(time_main_df) > 0) {
-  cat("\nCohen's d for TIME main effect:\n")
-  time_past_data   <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"]
-  time_future_data <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"]
-  time_cohens_d <- cohen.d(time_past_data, time_future_data)
-  cat(sprintf("Past vs Future: n1 = %d, n2 = %d\n", length(time_past_data), length(time_future_data)))
-  cat(sprintf("Cohen's d: %.5f\n", time_cohens_d$estimate))
-  cat(sprintf("Effect size interpretation: %s\n", time_cohens_d$magnitude))
-  cat(sprintf("p-value: %.5f\n", time_main_df$p.value[1]))
-}
-
-# Main effect of DOMAIN
-domain_main_df <- as.data.frame(pairs(domain_emmeans, adjust = "bonferroni"))
-print(domain_main_df)
-report_cohens_d(domain_main_df, "DOMAIN")
-
-# Main effect of INTERVAL
-interval_main_df <- as.data.frame(pairs(interval_emmeans, adjust = "bonferroni"))
-print(interval_main_df)
-report_cohens_d(interval_main_df, "INTERVAL")
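# Aside (not part of the original script): cohen.d() above treats the Past and
# Future observations as independent samples, but TIME is a within-subjects factor.
# effsize::cohen.d() also supports a paired version; a sketch assuming each
# participant contributes one Past and one Future mean (aggregated first, with
# tapply aligning both vectors by ResponseId):
past_means <- tapply(long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"],
                     long_data_clean$ResponseId[long_data_clean$TIME == "Past"], mean)
fut_means  <- tapply(long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"],
                     long_data_clean$ResponseId[long_data_clean$TIME == "Future"], mean)
cohen.d(as.numeric(past_means), as.numeric(fut_means), paired = TRUE)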
print(sprintf("Comparison: %s", contrast_name)) - print(sprintf(" n1 = %d, n2 = %d", length(data1), length(data2))) - print(sprintf(" Cohen's d: %.5f", interval_cohens_d$estimate)) - print(sprintf(" Effect size interpretation: %s", interval_cohens_d$magnitude)) - print(sprintf(" p-value: %.5f", significant_interval$p.value[i])) - print("") - } - } - } -} - -# ============================================================================= -# INTERACTION EXPLORATIONS (if significant) -# ============================================================================= - - - -# First, identify which interactions are significant -significant_interactions <- anova_output[anova_output$p < 0.05 & grepl(":", anova_output$Effect), ] - -if(nrow(significant_interactions) > 0) { - print("Significant interactions found:") - print(significant_interactions[, c("Effect", "p")]) - - # TIME × DOMAIN interaction (if significant) - if("TIME:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:DOMAIN"] < 0.05) { - print("\n=== TIME × DOMAIN INTERACTION (SIGNIFICANT) ===") - time_domain_emmeans <- emmeans(aov_model, ~ TIME * DOMAIN) - print("Estimated Marginal Means:") - print(time_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TIME:") - time_domain_simple <- pairs(time_domain_emmeans, by = "TIME", adjust = "bonferroni") - print(time_domain_simple) - - print("\nSimple Effects of TIME within each DOMAIN:") - time_domain_simple2 <- pairs(time_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(time_domain_simple2) - } - - # TIME × INTERVAL interaction (if significant) - if("TIME:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TIME:INTERVAL"] < 0.05) { - print("\n=== TIME × INTERVAL INTERACTION (SIGNIFICANT) ===") - time_interval_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL) - print("Estimated Marginal Means:") - print(time_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TIME:") - time_interval_simple <- pairs(time_interval_emmeans, by = "TIME", adjust = "bonferroni") - print(time_interval_simple) - - print("\nSimple Effects of TIME within each INTERVAL:") - time_interval_simple2 <- pairs(time_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(time_interval_simple2) - } - - # DOMAIN × INTERVAL interaction (if significant) - if("DOMAIN:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "DOMAIN:INTERVAL"] < 0.05) { - print("\n=== DOMAIN × INTERVAL INTERACTION (SIGNIFICANT) ===") - domain_interval_emmeans <- emmeans(aov_model, ~ DOMAIN * INTERVAL) - print("Estimated Marginal Means:") - print(domain_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each DOMAIN:") - domain_interval_simple <- pairs(domain_interval_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(domain_interval_simple) - - print("\nSimple Effects of DOMAIN within each INTERVAL:") - domain_interval_simple2 <- pairs(domain_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(domain_interval_simple2) - } - - # TEMPORAL_DO × TIME interaction (if significant) - if("TEMPORAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:TIME"] < 0.05) { - print("\n=== TEMPORAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - temporal_time_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * TIME) - print("Estimated Marginal Means:") - print(temporal_time_emmeans) - - print("\nSimple Effects of TIME within each TEMPORAL_DO:") - temporal_time_simple <- 
pairs(temporal_time_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_time_simple) - - print("\nSimple Effects of TEMPORAL_DO within each TIME:") - temporal_time_simple2 <- pairs(temporal_time_emmeans, by = "TIME", adjust = "bonferroni") - print(temporal_time_simple2) - } - - # TEMPORAL_DO × DOMAIN interaction (if significant) - if("TEMPORAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:DOMAIN"] < 0.05) { - print("\n=== TEMPORAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - temporal_domain_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(temporal_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each TEMPORAL_DO:") - temporal_domain_simple <- pairs(temporal_domain_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_domain_simple) - - print("\nSimple Effects of TEMPORAL_DO within each DOMAIN:") - temporal_domain_simple2 <- pairs(temporal_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(temporal_domain_simple2) - } - - # TEMPORAL_DO × INTERVAL interaction (if significant) - if("TEMPORAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "TEMPORAL_DO:INTERVAL"] < 0.05) { - print("\n=== TEMPORAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - temporal_interval_emmeans <- emmeans(aov_model, ~ TEMPORAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(temporal_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each TEMPORAL_DO:") - temporal_interval_simple <- pairs(temporal_interval_emmeans, by = "TEMPORAL_DO", adjust = "bonferroni") - print(temporal_interval_simple) - - print("\nSimple Effects of TEMPORAL_DO within each INTERVAL:") - temporal_interval_simple2 <- pairs(temporal_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(temporal_interval_simple2) - } - - # INTERVAL_DO × TIME interaction (if significant) - if("INTERVAL_DO:TIME" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:TIME"] < 0.05) { - print("\n=== INTERVAL_DO × TIME INTERACTION (SIGNIFICANT) ===") - interval_do_time_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * TIME) - print("Estimated Marginal Means:") - print(interval_do_time_emmeans) - - print("\nSimple Effects of TIME within each INTERVAL_DO:") - interval_do_time_simple <- pairs(interval_do_time_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_time_simple) - - print("\nSimple Effects of INTERVAL_DO within each TIME:") - interval_do_time_simple2 <- pairs(interval_do_time_emmeans, by = "TIME", adjust = "bonferroni") - print(interval_do_time_simple2) - } - - # INTERVAL_DO × DOMAIN interaction (if significant) - if("INTERVAL_DO:DOMAIN" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:DOMAIN"] < 0.05) { - print("\n=== INTERVAL_DO × DOMAIN INTERACTION (SIGNIFICANT) ===") - interval_do_domain_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * DOMAIN) - print("Estimated Marginal Means:") - print(interval_do_domain_emmeans) - - print("\nSimple Effects of DOMAIN within each INTERVAL_DO:") - interval_do_domain_simple <- pairs(interval_do_domain_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_domain_simple) - - print("\nSimple Effects of INTERVAL_DO within each DOMAIN:") - interval_do_domain_simple2 <- pairs(interval_do_domain_emmeans, by = "DOMAIN", adjust = "bonferroni") - print(interval_do_domain_simple2) - } - - # INTERVAL_DO × INTERVAL interaction (if significant) 
- if("INTERVAL_DO:INTERVAL" %in% anova_output$Effect && anova_output$p[anova_output$Effect == "INTERVAL_DO:INTERVAL"] < 0.05) { - print("\n=== INTERVAL_DO × INTERVAL INTERACTION (SIGNIFICANT) ===") - interval_do_interval_emmeans <- emmeans(aov_model, ~ INTERVAL_DO * INTERVAL) - print("Estimated Marginal Means:") - print(interval_do_interval_emmeans) - - print("\nSimple Effects of INTERVAL within each INTERVAL_DO:") - interval_do_interval_simple <- pairs(interval_do_interval_emmeans, by = "INTERVAL_DO", adjust = "bonferroni") - print(interval_do_interval_simple) - - print("\nSimple Effects of INTERVAL_DO within each INTERVAL:") - interval_do_interval_simple2 <- pairs(interval_do_interval_emmeans, by = "INTERVAL", adjust = "bonferroni") - print(interval_do_interval_simple2) - } - - # THREE-WAY INTERACTION: TIME × INTERVAL × TEMPORAL_DO - print("\n=== TIME × INTERVAL × TEMPORAL_DO THREE-WAY INTERACTION ===") - three_way_emmeans <- emmeans(aov_model, ~ TIME * INTERVAL * TEMPORAL_DO) - print("Estimated Marginal Means:") - print(three_way_emmeans) - - print("\nPast vs Future contrasts within each INTERVAL × TEMPORAL_DO combination:") - three_way_contrasts <- pairs(three_way_emmeans, by = c("INTERVAL", "TEMPORAL_DO"), adjust = "bonferroni") - print(three_way_contrasts) - - # Calculate Cohen's d for Past vs Future contrasts - three_way_contrasts_df <- as.data.frame(three_way_contrasts) - - print("\n=== COHEN'S D FOR TIME CONTRASTS IN THREE-WAY INTERACTION ===") - print("Past vs Future contrasts for all INTERVAL × TEMPORAL_DO combinations:") - - for(i in seq_len(nrow(three_way_contrasts_df))) { - comparison <- three_way_contrasts_df[i, ] - - # Extract the grouping variables - interval_level <- as.character(comparison$INTERVAL) - temporal_do_level <- as.character(comparison$TEMPORAL_DO) - - # Get data for Past and Future within this INTERVAL × TEMPORAL_DO combination - past_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$INTERVAL == interval_level & - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$TIME == "Past" - ] - - future_data <- long_data_clean$MEAN_DIFFERENCE[ - long_data_clean$INTERVAL == interval_level & - long_data_clean$TEMPORAL_DO == temporal_do_level & - long_data_clean$TIME == "Future" - ] - - if(length(past_data) > 0 && length(future_data) > 0) { - # Calculate Cohen's d using effsize package - cohens_d_result <- cohen.d(past_data, future_data) - - cat(sprintf("INTERVAL = %s, TEMPORAL_DO = %s:\n", interval_level, temporal_do_level)) - cat(sprintf(" Past vs Future comparison\n")) - cat(sprintf(" n_Past = %d, n_Future = %d\n", length(past_data), length(future_data))) - cat(sprintf(" Cohen's d: %.5f\n", cohens_d_result$estimate)) - cat(sprintf(" Effect size interpretation: %s\n", cohens_d_result$magnitude)) - cat(sprintf(" p-value: %.5f\n", comparison$p.value)) - cat(sprintf(" Estimated difference: %.5f\n", comparison$estimate)) - cat("\n") - } - } - -} else { - print("No significant interactions found.") - print("All interaction p-values:") - interaction_effects <- anova_output[grepl(":", anova_output$Effect), ] - print(interaction_effects[, c("Effect", "p")]) -} - - - -# ============================================================================= -# INTERACTION PLOT: TEMPORAL_DO × TIME × INTERVAL (Emmeans only) -# ============================================================================= - - - -# Define color palette for TIME -time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F") - -# Create emmeans for TEMPORAL_DO × TIME × INTERVAL -emm_3way <- 
-# =============================================================================
-# INTERACTION PLOT: TEMPORAL_DO × TIME × INTERVAL (Emmeans only)
-# =============================================================================
-
-# Define color palette for TIME
-time_colors <- c("Past" = "#648FFF", "Future" = "#DC267F")
-
-# Create emmeans for TEMPORAL_DO × TIME × INTERVAL
-emm_3way <- emmeans(aov_model, ~ TEMPORAL_DO * TIME * INTERVAL)
-
-# Prepare emmeans data frame
-emmeans_3way <- emm_3way %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower  = lower.CL,
-    ci_upper  = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(
-    TEMPORAL_DO = factor(TEMPORAL_DO, levels = c("01PAST", "02FUT")),
-    TIME        = factor(TIME, levels = c("Past", "Future")),
-    INTERVAL    = factor(INTERVAL),
-    x_pos       = as.numeric(TEMPORAL_DO),
-    time_offset = (as.numeric(TIME) - 1.5) * 0.2,
-    x_dodged    = x_pos + time_offset
-  )
-
-# Create 3-way interaction plot with facets
-interaction_plot_3way <- ggplot(emmeans_3way) +
-  geom_errorbar(
-    aes(x = x_dodged, ymin = ci_lower, ymax = ci_upper, color = TIME),
-    width = 0.1,
-    linewidth = 1,
-    alpha = 0.8
-  ) +
-  geom_point(
-    aes(x = x_dodged, y = plot_mean, fill = TIME, shape = TIME),
-    size = 5,
-    stroke = 1.2,
-    color = "black"
-  ) +
-  facet_wrap(~INTERVAL, nrow = 1, labeller = labeller(INTERVAL = c(
-    "5"    = "Present v. 5 Years",
-    "10"   = "Present v. 10 Years",
-    "5_10" = "5 Years v. 10 Years"
-  ))) +
-  labs(
-    x = "Order",
-    y = "Mean absolute deviation",
-    title = "TEMPORAL_DO × TIME × INTERVAL Interaction (Estimated Marginal Means)"
-  ) +
-  scale_x_continuous(
-    breaks = c(1, 2),
-    labels = c("Past First", "Future First"),
-    limits = c(0.5, 2.5)
-  ) +
-  scale_color_manual(name = "Temporal Direction", values = time_colors) +
-  scale_fill_manual(name = "Temporal Direction", values = time_colors) +
-  scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5),
-    strip.text = element_text(size = 11, face = "bold")
-  )
-
-print(interaction_plot_3way)
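# Aside (not part of the original script): the manual x_dodged offset above can
# also be expressed with ggplot2's position_dodge(), keeping TEMPORAL_DO as a
# discrete axis. A sketch of the equivalent layers, using the same prepared data:
# ggplot(emmeans_3way, aes(x = TEMPORAL_DO, y = plot_mean, color = TIME)) +
#   geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper),
#                 width = 0.1, position = position_dodge(width = 0.4)) +
#   geom_point(position = position_dodge(width = 0.4), size = 5) +
#   facet_wrap(~INTERVAL, nrow = 1)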
-# =============================================================================
-# MAIN EFFECT PLOT: TIME (Emmeans + Error Bars)
-# =============================================================================
-
-# Prepare emmeans data frame for TIME main effect
-time_emm_df <- time_emmeans %>%
-  as.data.frame() %>%
-  filter(!is.na(lower.CL) & !is.na(upper.CL) & !is.na(emmean)) %>%
-  rename(
-    ci_lower  = lower.CL,
-    ci_upper  = upper.CL,
-    plot_mean = emmean
-  ) %>%
-  mutate(TIME = factor(TIME, levels = c("Past", "Future")))
-
-# Create TIME main-effect plot (style aligned with the 3-way interaction plot)
-time_main_plot <- ggplot(time_emm_df) +
-  geom_errorbar(
-    aes(x = TIME, ymin = ci_lower, ymax = ci_upper, color = TIME),
-    width = 0.15,
-    linewidth = 1,
-    alpha = 0.8
-  ) +
-  geom_point(
-    aes(x = TIME, y = plot_mean, fill = TIME, shape = TIME),
-    size = 5,
-    stroke = 1.2,
-    color = "black"
-  ) +
-  labs(
-    x = "Time",
-    y = "Mean absolute difference from present",
-    title = "Main Effect of TIME (Estimated Marginal Means)"
-  ) +
-  scale_color_manual(name = "Temporal Direction", values = time_colors) +
-  scale_fill_manual(name = "Temporal Direction", values = time_colors) +
-  scale_shape_manual(name = "Temporal Direction", values = c(21, 22)) +
-  theme_minimal(base_size = 13) +
-  theme(
-    axis.text = element_text(size = 11),
-    axis.title = element_text(size = 12),
-    plot.title = element_text(size = 14, hjust = 0.5),
-    legend.position = "right",
-    legend.title = element_text(size = 11),
-    legend.text = element_text(size = 10),
-    panel.grid.major.x = element_blank(),
-    panel.grid.minor = element_blank(),
-    panel.border = element_rect(color = "gray80", fill = NA, linewidth = 0.5)
-  )
-
-print(time_main_plot)
\ No newline at end of file
diff --git a/.history/eohi2/recode_likert_items_20251001085552.r b/.history/eohi2/recode_likert_items_20251001085552.r
deleted file mode 100644
index ea1e3ca..0000000
--- a/.history/eohi2/recode_likert_items_20251001085552.r
+++ /dev/null
@@ -1,93 +0,0 @@
-# Script to combine and recode Likert scale items in eohi2.csv
-# Combines 01 and 02 versions of items, then recodes text to numeric values
-
-# Load necessary library
-library(dplyr)
-
-# Read the data
-df <- read.csv("eohi2/eohi2.csv", stringsAsFactors = FALSE)
-
-# Define the mapping function
-recode_likert <- function(x) {
-  case_when(
-    tolower(x) == "strongly disagree" ~ -3,
-    tolower(x) == "disagree" ~ -2,
-    tolower(x) == "somewhat disagree" ~ -1,
-    tolower(x) == "neither agree nor disagree" ~ 0,
-    tolower(x) == "somewhat agree" ~ 1,
-    tolower(x) == "agree" ~ 2,
-    tolower(x) == "strongly agree" ~ 3,
-    TRUE ~ NA_real_
-  )
-}
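# Aside (not part of the original script): a quick sanity check of the recoder.
# Matching is case-insensitive via tolower(), and anything unrecognized
# (including empty strings) maps to NA:
recode_likert(c("Agree", "STRONGLY disagree", "Somewhat agree", "", "N/A"))
# expected: 2 -3 1 NA NA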
"02fut10PrefItem_1", "02fut10PrefItem_2", "02fut10PrefItem_3", "02fut10PrefItem_4", "02fut10PrefItem_5", - "02fut10PersItem_1", "02fut10PersItem_2", "02fut10PersItem_3", "02fut10PersItem_4", "02fut10PersItem_5", - "02fut10ValItem_1", "02fut10ValItem_2", "02fut10ValItem_3", "02fut10ValItem_4", "02fut10ValItem_5" -) - -# Define target column names -target_cols <- c( - "past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel", - "past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex", - "past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice", - "past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel", - "past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex", - "past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice", - "fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel", - "fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex", - "fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice", - "fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel", - "fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex", - "fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice" -) - -# Process each pair of columns -for (i in 1:60) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Coalesce: take value from col_A if present, otherwise from col_B - combined <- ifelse(!is.na(df[[col_A]]) & df[[col_A]] != "", - df[[col_A]], - df[[col_B]]) - - # Recode to numeric - df[[target_col]] <- recode_likert(combined) - - # Print progress - cat("Processed:", target_col, "\n") -} - -# Save the modified dataframe back to CSV -write.csv(df, "eohi2/eohi2.csv", row.names = FALSE) - -cat("\nProcessing complete! 
diff --git a/.history/eohi2/recode_likert_items_20251001085904.r b/.history/eohi2/recode_likert_items_20251001085904.r
deleted file mode 100644
index adc1262..0000000
--- a/.history/eohi2/recode_likert_items_20251001085904.r
+++ /dev/null
@@ -1,95 +0,0 @@
-# Script to combine and recode Likert scale items in eohi2.csv
-# Combines 01 and 02 versions of items, then recodes text to numeric values
-
-# Load necessary library
-library(dplyr)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-# Read the data
-df <- read.csv("eohi2.csv", stringsAsFactors = FALSE)
-
-# Define the mapping function
-recode_likert <- function(x) {
-  case_when(
-    tolower(x) == "strongly disagree" ~ -3,
-    tolower(x) == "disagree" ~ -2,
-    tolower(x) == "somewhat disagree" ~ -1,
-    tolower(x) == "neither agree nor disagree" ~ 0,
-    tolower(x) == "somewhat agree" ~ 1,
-    tolower(x) == "agree" ~ 2,
-    tolower(x) == "strongly agree" ~ 3,
-    TRUE ~ NA_real_
-  )
-}
-
-# Define source column pairs (Set A and Set B)
-source_cols_A <- c(
-  "01past5PrefItem_1", "01past5PrefItem_2", "01past5PrefItem_3", "01past5PrefItem_4", "01past5PrefItem_5",
-  "01past5PersItem_1", "01past5PersItem_2", "01past5PersItem_3", "01past5PersItem_4", "01past5PersItem_5",
-  "01past5ValItem_1", "01past5ValItem_2", "01past5ValItem_3", "01past5ValItem_4", "01past5ValItem_5",
-  "01past10PrefItem_1", "01past10PrefItem_2", "01past10PrefItem_3", "01past10PrefItem_4", "01past10PrefItem_5",
-  "01past10PersItem_1", "01past10PersItem_2", "01past10PersItem_3", "01past10PersItem_4", "01past10PersItem_5",
-  "01past10ValItem_1", "01past10ValItem_2", "01past10ValItem_3", "01past10ValItem_4", "01past10ValItem_5",
-  "01fut5PrefItem_1", "01fut5PrefItem_2", "01fut5PrefItem_3", "01fut5PrefItem_4", "01fut5PrefItem_5",
-  "01fut5PersItem_1", "01fut5PersItem_2", "01fut5PersItem_3", "01fut5PersItem_4", "01fut5PersItem_5",
-  "01fut5ValItem_1", "01fut5ValItem_2", "01fut5ValItem_3", "01fut5ValItem_4", "01fut5ValItem_5",
-  "01fut10PrefItem_1", "01fut10PrefItem_2", "01fut10PrefItem_3", "01fut10PrefItem_4", "01fut10PrefItem_5",
-  "01fut10PersItem_1", "01fut10PersItem_2", "01fut10PersItem_3", "01fut10PersItem_4", "01fut10PersItem_5",
-  "01fut10ValItem_1", "01fut10ValItem_2", "01fut10ValItem_3", "01fut10ValItem_4", "01fut10ValItem_5"
-)
"02fut10ValItem_3", "02fut10ValItem_4", "02fut10ValItem_5" -) - -# Define target column names -target_cols <- c( - "past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel", - "past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex", - "past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice", - "past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel", - "past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex", - "past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice", - "fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel", - "fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex", - "fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice", - "fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel", - "fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex", - "fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice" -) - -# Process each pair of columns -for (i in 1:60) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Coalesce: take value from col_A if present, otherwise from col_B - combined <- ifelse(!is.na(df[[col_A]]) & df[[col_A]] != "", - df[[col_A]], - df[[col_B]]) - - # Recode to numeric - df[[target_col]] <- recode_likert(combined) - - # Print progress - cat("Processed:", target_col, "\n") -} - -# Save the modified dataframe back to CSV -write.csv(df, "eohi2/eohi3.csv", row.names = FALSE) - -cat("\nProcessing complete! 
diff --git a/.history/eohi2/recode_likert_items_20251001085909.r b/.history/eohi2/recode_likert_items_20251001085909.r
deleted file mode 100644
index 8a9a5a9..0000000
--- a/.history/eohi2/recode_likert_items_20251001085909.r
+++ /dev/null
@@ -1,95 +0,0 @@
-# Script to combine and recode Likert scale items in eohi2.csv
-# Combines 01 and 02 versions of items, then recodes text to numeric values
-
-# Load necessary library
-library(dplyr)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-# Read the data
-df <- read.csv("eohi2.csv", stringsAsFactors = FALSE)
-
-# Define the mapping function
-recode_likert <- function(x) {
-  case_when(
-    tolower(x) == "strongly disagree" ~ -3,
-    tolower(x) == "disagree" ~ -2,
-    tolower(x) == "somewhat disagree" ~ -1,
-    tolower(x) == "neither agree nor disagree" ~ 0,
-    tolower(x) == "somewhat agree" ~ 1,
-    tolower(x) == "agree" ~ 2,
-    tolower(x) == "strongly agree" ~ 3,
-    TRUE ~ NA_real_
-  )
-}
-
-# Define source column pairs (Set A and Set B)
-source_cols_A <- c(
-  "01past5PrefItem_1", "01past5PrefItem_2", "01past5PrefItem_3", "01past5PrefItem_4", "01past5PrefItem_5",
-  "01past5PersItem_1", "01past5PersItem_2", "01past5PersItem_3", "01past5PersItem_4", "01past5PersItem_5",
-  "01past5ValItem_1", "01past5ValItem_2", "01past5ValItem_3", "01past5ValItem_4", "01past5ValItem_5",
-  "01past10PrefItem_1", "01past10PrefItem_2", "01past10PrefItem_3", "01past10PrefItem_4", "01past10PrefItem_5",
-  "01past10PersItem_1", "01past10PersItem_2", "01past10PersItem_3", "01past10PersItem_4", "01past10PersItem_5",
-  "01past10ValItem_1", "01past10ValItem_2", "01past10ValItem_3", "01past10ValItem_4", "01past10ValItem_5",
-  "01fut5PrefItem_1", "01fut5PrefItem_2", "01fut5PrefItem_3", "01fut5PrefItem_4", "01fut5PrefItem_5",
-  "01fut5PersItem_1", "01fut5PersItem_2", "01fut5PersItem_3", "01fut5PersItem_4", "01fut5PersItem_5",
-  "01fut5ValItem_1", "01fut5ValItem_2", "01fut5ValItem_3", "01fut5ValItem_4", "01fut5ValItem_5",
-  "01fut10PrefItem_1", "01fut10PrefItem_2", "01fut10PrefItem_3", "01fut10PrefItem_4", "01fut10PrefItem_5",
-  "01fut10PersItem_1", "01fut10PersItem_2", "01fut10PersItem_3", "01fut10PersItem_4", "01fut10PersItem_5",
-  "01fut10ValItem_1", "01fut10ValItem_2", "01fut10ValItem_3", "01fut10ValItem_4", "01fut10ValItem_5"
-)
"02fut10ValItem_3", "02fut10ValItem_4", "02fut10ValItem_5" -) - -# Define target column names -target_cols <- c( - "past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel", - "past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex", - "past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice", - "past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel", - "past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex", - "past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice", - "fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel", - "fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex", - "fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice", - "fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel", - "fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex", - "fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice" -) - -# Process each pair of columns -for (i in 1:60) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Coalesce: take value from col_A if present, otherwise from col_B - combined <- ifelse(!is.na(df[[col_A]]) & df[[col_A]] != "", - df[[col_A]], - df[[col_B]]) - - # Recode to numeric - df[[target_col]] <- recode_likert(combined) - - # Print progress - cat("Processed:", target_col, "\n") -} - -# Save the modified dataframe back to CSV -write.csv(df, "eohi3.csv", row.names = FALSE) - -cat("\nProcessing complete! 
diff --git a/.history/eohi2/recode_likert_items_20251001090613.r b/.history/eohi2/recode_likert_items_20251001090613.r
deleted file mode 100644
index 40a0bd3..0000000
--- a/.history/eohi2/recode_likert_items_20251001090613.r
+++ /dev/null
@@ -1,99 +0,0 @@
-# Script to combine and recode Likert scale items in eohi2.csv
-# Combines 01 and 02 versions of items, then recodes text to numeric values
-
-# Load necessary library
-library(dplyr)
-
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-# Read the data
-df <- read.csv("eohi2.csv", stringsAsFactors = FALSE)
-
-# Define the mapping function
-recode_likert <- function(x) {
-  case_when(
-    tolower(x) == "strongly disagree" ~ -3,
-    tolower(x) == "disagree" ~ -2,
-    tolower(x) == "somewhat disagree" ~ -1,
-    tolower(x) == "neither agree nor disagree" ~ 0,
-    tolower(x) == "somewhat agree" ~ 1,
-    tolower(x) == "agree" ~ 2,
-    tolower(x) == "strongly agree" ~ 3,
-    TRUE ~ NA_real_
-  )
-}
-
-# Define source column pairs (Set A and Set B)
-source_cols_A <- c(
-  "01past5PrefItem_1", "01past5PrefItem_2", "01past5PrefItem_3", "01past5PrefItem_4", "01past5PrefItem_5",
-  "01past5PersItem_1", "01past5PersItem_2", "01past5PersItem_3", "01past5PersItem_4", "01past5PersItem_5",
-  "01past5ValItem_1", "01past5ValItem_2", "01past5ValItem_3", "01past5ValItem_4", "01past5ValItem_5",
-  "01past10PrefItem_1", "01past10PrefItem_2", "01past10PrefItem_3", "01past10PrefItem_4", "01past10PrefItem_5",
-  "01past10PersItem_1", "01past10PersItem_2", "01past10PersItem_3", "01past10PersItem_4", "01past10PersItem_5",
-  "01past10ValItem_1", "01past10ValItem_2", "01past10ValItem_3", "01past10ValItem_4", "01past10ValItem_5",
-  "01fut5PrefItem_1", "01fut5PrefItem_2", "01fut5PrefItem_3", "01fut5PrefItem_4", "01fut5PrefItem_5",
-  "01fut5PersItem_1", "01fut5PersItem_2", "01fut5PersItem_3", "01fut5PersItem_4", "01fut5PersItem_5",
-  "01fut5ValItem_1", "01fut5ValItem_2", "01fut5ValItem_3", "01fut5ValItem_4", "01fut5ValItem_5",
-  "01fut10PrefItem_1", "01fut10PrefItem_2", "01fut10PrefItem_3", "01fut10PrefItem_4", "01fut10PrefItem_5",
-  "01fut10PersItem_1", "01fut10PersItem_2", "01fut10PersItem_3", "01fut10PersItem_4", "01fut10PersItem_5",
-  "01fut10ValItem_1", "01fut10ValItem_2", "01fut10ValItem_3", "01fut10ValItem_4", "01fut10ValItem_5"
-)
"02fut10ValItem_3", "02fut10ValItem_4", "02fut10ValItem_5" -) - -# Define target column names -target_cols <- c( - "past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel", - "past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex", - "past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice", - "past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel", - "past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex", - "past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice", - "fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel", - "fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex", - "fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice", - "fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel", - "fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex", - "fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice" -) - -# Process each pair of columns -for (i in 1:60) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values from columns, handling missing columns - vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df)) - vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df)) - - # Coalesce: take value from vals_A if present, otherwise from vals_B - combined <- ifelse(!is.na(vals_A) & vals_A != "", - vals_A, - vals_B) - - # Recode to numeric - df[[target_col]] <- recode_likert(combined) - - # Print progress - cat("Processed:", target_col, "\n") -} - -# Save the modified dataframe back to CSV -write.csv(df, "eohi3.csv", row.names = FALSE) - -cat("\nProcessing complete! 
60 new columns added to eohi3.csv\n") - diff --git a/.history/eohi2/recode_likert_items_20251001090620.r b/.history/eohi2/recode_likert_items_20251001090620.r deleted file mode 100644 index 40a0bd3..0000000 --- a/.history/eohi2/recode_likert_items_20251001090620.r +++ /dev/null @@ -1,99 +0,0 @@ -# Script to combine and recode Likert scale items in eohi2.csv -# Combines 01 and 02 versions of items, then recodes text to numeric values - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE) - -# Define the mapping function -recode_likert <- function(x) { - case_when( - tolower(x) == "strongly disagree" ~ -3, - tolower(x) == "disagree" ~ -2, - tolower(x) == "somewhat disagree" ~ -1, - tolower(x) == "neither agree nor disagree" ~ 0, - tolower(x) == "somewhat agree" ~ 1, - tolower(x) == "agree" ~ 2, - tolower(x) == "strongly agree" ~ 3, - TRUE ~ NA_real_ - ) -} - -# Define source column pairs (Set A and Set B) -source_cols_A <- c( - "01past5PrefItem_1", "01past5PrefItem_2", "01past5PrefItem_3", "01past5PrefItem_4", "01past5PrefItem_5", - "01past5PersItem_1", "01past5PersItem_2", "01past5PersItem_3", "01past5PersItem_4", "01past5PersItem_5", - "01past5ValItem_1", "01past5ValItem_2", "01past5ValItem_3", "01past5ValItem_4", "01past5ValItem_5", - "01past10PrefItem_1", "01past10PrefItem_2", "01past10PrefItem_3", "01past10PrefItem_4", "01past10PrefItem_5", - "01past10PersItem_1", "01past10PersItem_2", "01past10PersItem_3", "01past10PersItem_4", "01past10PersItem_5", - "01past10ValItem_1", "01past10ValItem_2", "01past10ValItem_3", "01past10ValItem_4", "01past10ValItem_5", - "01fut5PrefItem_1", "01fut5PrefItem_2", "01fut5PrefItem_3", "01fut5PrefItem_4", "01fut5PrefItem_5", - "01fut5PersItem_1", "01fut5PersItem_2", "01fut5PersItem_3", "01fut5PersItem_4", "01fut5PersItem_5", - "01fut5ValItem_1", "01fut5ValItem_2", "01fut5ValItem_3", "01fut5ValItem_4", "01fut5ValItem_5", - "01fut10PrefItem_1", "01fut10PrefItem_2", "01fut10PrefItem_3", "01fut10PrefItem_4", "01fut10PrefItem_5", - "01fut10PersItem_1", "01fut10PersItem_2", "01fut10PersItem_3", "01fut10PersItem_4", "01fut10PersItem_5", - "01fut10ValItem_1", "01fut10ValItem_2", "01fut10ValItem_3", "01fut10ValItem_4", "01fut10ValItem_5" -) - -source_cols_B <- c( - "02past5PrefItem_1", "02past5PrefItem_2", "02past5PrefItem_3", "02past5PrefItem_4", "02past5PrefItem_5", - "02past5PersItem_1", "02past5PersItem_2", "02past5PersItem_3", "02past5PersItem_4", "02past5PersItem_5", - "02past5ValItem_1", "02past5ValItem_2", "02past5ValItem_3", "02past5ValItem_4", "02past5ValItem_5", - "02past10PrefItem_1", "02past10PrefItem_2", "02past10PrefItem_3", "02past10PrefItem_4", "02past10PrefItem_5", - "02past10PersItem_1", "02past10PersItem_2", "02past10PersItem_3", "02past10PersItem_4", "02past10PersItem_5", - "02past10ValItem_1", "02past10ValItem_2", "02past10ValItem_3", "02past10ValItem_4", "02past10ValItem_5", - "02fut5PrefItem_1", "02fut5PrefItem_2", "02fut5PrefItem_3", "02fut5PrefItem_4", "02fut5PrefItem_5", - "02fut5PersItem_1", "02fut5PersItem_2", "02fut5PersItem_3", "02fut5PersItem_4", "02fut5PersItem_5", - "02fut5ValItem_1", "02fut5ValItem_2", "02fut5ValItem_3", "02fut5ValItem_4", "02fut5ValItem_5", - "02fut10PrefItem_1", "02fut10PrefItem_2", "02fut10PrefItem_3", "02fut10PrefItem_4", "02fut10PrefItem_5", - "02fut10PersItem_1", "02fut10PersItem_2", "02fut10PersItem_3", "02fut10PersItem_4", "02fut10PersItem_5", - "02fut10ValItem_1", "02fut10ValItem_2", 
"02fut10ValItem_3", "02fut10ValItem_4", "02fut10ValItem_5" -) - -# Define target column names -target_cols <- c( - "past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel", - "past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex", - "past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice", - "past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel", - "past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex", - "past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice", - "fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel", - "fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex", - "fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice", - "fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel", - "fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex", - "fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice" -) - -# Process each pair of columns -for (i in 1:60) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values from columns, handling missing columns - vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df)) - vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df)) - - # Coalesce: take value from vals_A if present, otherwise from vals_B - combined <- ifelse(!is.na(vals_A) & vals_A != "", - vals_A, - vals_B) - - # Recode to numeric - df[[target_col]] <- recode_likert(combined) - - # Print progress - cat("Processed:", target_col, "\n") -} - -# Save the modified dataframe back to CSV -write.csv(df, "eohi3.csv", row.names = FALSE) - -cat("\nProcessing complete! 
60 new columns added to eohi3.csv\n") - diff --git a/.history/eohi2/recode_likert_items_20251001090829.r b/.history/eohi2/recode_likert_items_20251001090829.r deleted file mode 100644 index 3a1025b..0000000 --- a/.history/eohi2/recode_likert_items_20251001090829.r +++ /dev/null @@ -1,164 +0,0 @@ -# Script to combine and recode Likert scale items in eohi2.csv -# Combines 01 and 02 versions of items, then recodes text to numeric values - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE) - -# Define the mapping function -recode_likert <- function(x) { - case_when( - tolower(x) == "strongly disagree" ~ -3, - tolower(x) == "disagree" ~ -2, - tolower(x) == "somewhat disagree" ~ -1, - tolower(x) == "neither agree nor disagree" ~ 0, - tolower(x) == "somewhat agree" ~ 1, - tolower(x) == "agree" ~ 2, - tolower(x) == "strongly agree" ~ 3, - TRUE ~ NA_real_ - ) -} - -# Define source column pairs (Set A and Set B) -source_cols_A <- c( - "01past5PrefItem_1", "01past5PrefItem_2", "01past5PrefItem_3", "01past5PrefItem_4", "01past5PrefItem_5", - "01past5PersItem_1", "01past5PersItem_2", "01past5PersItem_3", "01past5PersItem_4", "01past5PersItem_5", - "01past5ValItem_1", "01past5ValItem_2", "01past5ValItem_3", "01past5ValItem_4", "01past5ValItem_5", - "01past10PrefItem_1", "01past10PrefItem_2", "01past10PrefItem_3", "01past10PrefItem_4", "01past10PrefItem_5", - "01past10PersItem_1", "01past10PersItem_2", "01past10PersItem_3", "01past10PersItem_4", "01past10PersItem_5", - "01past10ValItem_1", "01past10ValItem_2", "01past10ValItem_3", "01past10ValItem_4", "01past10ValItem_5", - "01fut5PrefItem_1", "01fut5PrefItem_2", "01fut5PrefItem_3", "01fut5PrefItem_4", "01fut5PrefItem_5", - "01fut5PersItem_1", "01fut5PersItem_2", "01fut5PersItem_3", "01fut5PersItem_4", "01fut5PersItem_5", - "01fut5ValItem_1", "01fut5ValItem_2", "01fut5ValItem_3", "01fut5ValItem_4", "01fut5ValItem_5", - "01fut10PrefItem_1", "01fut10PrefItem_2", "01fut10PrefItem_3", "01fut10PrefItem_4", "01fut10PrefItem_5", - "01fut10PersItem_1", "01fut10PersItem_2", "01fut10PersItem_3", "01fut10PersItem_4", "01fut10PersItem_5", - "01fut10ValItem_1", "01fut10ValItem_2", "01fut10ValItem_3", "01fut10ValItem_4", "01fut10ValItem_5" -) - -source_cols_B <- c( - "02past5PrefItem_1", "02past5PrefItem_2", "02past5PrefItem_3", "02past5PrefItem_4", "02past5PrefItem_5", - "02past5PersItem_1", "02past5PersItem_2", "02past5PersItem_3", "02past5PersItem_4", "02past5PersItem_5", - "02past5ValItem_1", "02past5ValItem_2", "02past5ValItem_3", "02past5ValItem_4", "02past5ValItem_5", - "02past10PrefItem_1", "02past10PrefItem_2", "02past10PrefItem_3", "02past10PrefItem_4", "02past10PrefItem_5", - "02past10PersItem_1", "02past10PersItem_2", "02past10PersItem_3", "02past10PersItem_4", "02past10PersItem_5", - "02past10ValItem_1", "02past10ValItem_2", "02past10ValItem_3", "02past10ValItem_4", "02past10ValItem_5", - "02fut5PrefItem_1", "02fut5PrefItem_2", "02fut5PrefItem_3", "02fut5PrefItem_4", "02fut5PrefItem_5", - "02fut5PersItem_1", "02fut5PersItem_2", "02fut5PersItem_3", "02fut5PersItem_4", "02fut5PersItem_5", - "02fut5ValItem_1", "02fut5ValItem_2", "02fut5ValItem_3", "02fut5ValItem_4", "02fut5ValItem_5", - "02fut10PrefItem_1", "02fut10PrefItem_2", "02fut10PrefItem_3", "02fut10PrefItem_4", "02fut10PrefItem_5", - "02fut10PersItem_1", "02fut10PersItem_2", "02fut10PersItem_3", "02fut10PersItem_4", "02fut10PersItem_5", - "02fut10ValItem_1", "02fut10ValItem_2", 
"02fut10ValItem_3", "02fut10ValItem_4", "02fut10ValItem_5" -) - -# Define target column names -target_cols <- c( - "past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel", - "past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex", - "past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice", - "past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel", - "past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex", - "past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice", - "fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel", - "fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex", - "fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice", - "fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel", - "fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex", - "fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source A columns -missing_A <- source_cols_A[!source_cols_A %in% df_cols] -existing_A <- source_cols_A[source_cols_A %in% df_cols] - -cat("Source Set A:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_A), "columns\n") -cat(" Missing:", length(missing_A), "columns\n") - -if (length(missing_A) > 0) { - cat("\n Missing columns from Set A:\n") - for (col in missing_A) { - cat(" -", col, "\n") - } -} - -# Check Source B columns -missing_B <- source_cols_B[!source_cols_B %in% df_cols] -existing_B <- source_cols_B[source_cols_B %in% df_cols] - -cat("\nSource Set B:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_B), "columns\n") -cat(" Missing:", length(missing_B), "columns\n") - -if (length(missing_B) > 0) { - cat("\n Missing columns from Set B:\n") - for (col in missing_B) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_A) > 0 || length(missing_B) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - all_missing <- c(missing_A, missing_B) - for (miss_col in all_missing) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_A) > 30 || length(missing_B) > 30) { - stop("ERROR: Too many columns missing! 
Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Process each pair of columns -for (i in 1:60) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values from columns, handling missing columns - vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df)) - vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df)) - - # Coalesce: take value from vals_A if present, otherwise from vals_B - combined <- ifelse(!is.na(vals_A) & vals_A != "", - vals_A, - vals_B) - - # Recode to numeric - df[[target_col]] <- recode_likert(combined) - - # Print progress - cat("Processed:", target_col, "\n") -} - -# Save the modified dataframe back to CSV -write.csv(df, "eohi3.csv", row.names = FALSE) - -cat("\nProcessing complete! 60 new columns added to eohi3.csv\n") - diff --git a/.history/eohi2/recode_likert_items_20251001090840.r b/.history/eohi2/recode_likert_items_20251001090840.r deleted file mode 100644 index 3a1025b..0000000 --- a/.history/eohi2/recode_likert_items_20251001090840.r +++ /dev/null @@ -1,164 +0,0 @@ -# Script to combine and recode Likert scale items in eohi2.csv -# Combines 01 and 02 versions of items, then recodes text to numeric values - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE) - -# Define the mapping function -recode_likert <- function(x) { - case_when( - tolower(x) == "strongly disagree" ~ -3, - tolower(x) == "disagree" ~ -2, - tolower(x) == "somewhat disagree" ~ -1, - tolower(x) == "neither agree nor disagree" ~ 0, - tolower(x) == "somewhat agree" ~ 1, - tolower(x) == "agree" ~ 2, - tolower(x) == "strongly agree" ~ 3, - TRUE ~ NA_real_ - ) -} - -# Define source column pairs (Set A and Set B) -source_cols_A <- c( - "01past5PrefItem_1", "01past5PrefItem_2", "01past5PrefItem_3", "01past5PrefItem_4", "01past5PrefItem_5", - "01past5PersItem_1", "01past5PersItem_2", "01past5PersItem_3", "01past5PersItem_4", "01past5PersItem_5", - "01past5ValItem_1", "01past5ValItem_2", "01past5ValItem_3", "01past5ValItem_4", "01past5ValItem_5", - "01past10PrefItem_1", "01past10PrefItem_2", "01past10PrefItem_3", "01past10PrefItem_4", "01past10PrefItem_5", - "01past10PersItem_1", "01past10PersItem_2", "01past10PersItem_3", "01past10PersItem_4", "01past10PersItem_5", - "01past10ValItem_1", "01past10ValItem_2", "01past10ValItem_3", "01past10ValItem_4", "01past10ValItem_5", - "01fut5PrefItem_1", "01fut5PrefItem_2", "01fut5PrefItem_3", "01fut5PrefItem_4", "01fut5PrefItem_5", - "01fut5PersItem_1", "01fut5PersItem_2", "01fut5PersItem_3", "01fut5PersItem_4", "01fut5PersItem_5", - "01fut5ValItem_1", "01fut5ValItem_2", "01fut5ValItem_3", "01fut5ValItem_4", "01fut5ValItem_5", - "01fut10PrefItem_1", "01fut10PrefItem_2", "01fut10PrefItem_3", "01fut10PrefItem_4", "01fut10PrefItem_5", - "01fut10PersItem_1", "01fut10PersItem_2", "01fut10PersItem_3", "01fut10PersItem_4", "01fut10PersItem_5", - "01fut10ValItem_1", "01fut10ValItem_2", "01fut10ValItem_3", "01fut10ValItem_4", "01fut10ValItem_5" -) - -source_cols_B <- c( - "02past5PrefItem_1", "02past5PrefItem_2", "02past5PrefItem_3", "02past5PrefItem_4", "02past5PrefItem_5", - "02past5PersItem_1", "02past5PersItem_2", "02past5PersItem_3", "02past5PersItem_4", "02past5PersItem_5", - "02past5ValItem_1", "02past5ValItem_2", "02past5ValItem_3", "02past5ValItem_4", "02past5ValItem_5", - "02past10PrefItem_1", "02past10PrefItem_2", 
"02past10PrefItem_3", "02past10PrefItem_4", "02past10PrefItem_5", - "02past10PersItem_1", "02past10PersItem_2", "02past10PersItem_3", "02past10PersItem_4", "02past10PersItem_5", - "02past10ValItem_1", "02past10ValItem_2", "02past10ValItem_3", "02past10ValItem_4", "02past10ValItem_5", - "02fut5PrefItem_1", "02fut5PrefItem_2", "02fut5PrefItem_3", "02fut5PrefItem_4", "02fut5PrefItem_5", - "02fut5PersItem_1", "02fut5PersItem_2", "02fut5PersItem_3", "02fut5PersItem_4", "02fut5PersItem_5", - "02fut5ValItem_1", "02fut5ValItem_2", "02fut5ValItem_3", "02fut5ValItem_4", "02fut5ValItem_5", - "02fut10PrefItem_1", "02fut10PrefItem_2", "02fut10PrefItem_3", "02fut10PrefItem_4", "02fut10PrefItem_5", - "02fut10PersItem_1", "02fut10PersItem_2", "02fut10PersItem_3", "02fut10PersItem_4", "02fut10PersItem_5", - "02fut10ValItem_1", "02fut10ValItem_2", "02fut10ValItem_3", "02fut10ValItem_4", "02fut10ValItem_5" -) - -# Define target column names -target_cols <- c( - "past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel", - "past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex", - "past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice", - "past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel", - "past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex", - "past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice", - "fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel", - "fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex", - "fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice", - "fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel", - "fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex", - "fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source A columns -missing_A <- source_cols_A[!source_cols_A %in% df_cols] -existing_A <- source_cols_A[source_cols_A %in% df_cols] - -cat("Source Set A:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_A), "columns\n") -cat(" Missing:", length(missing_A), "columns\n") - -if (length(missing_A) > 0) { - cat("\n Missing columns from Set A:\n") - for (col in missing_A) { - cat(" -", col, "\n") - } -} - -# Check Source B columns -missing_B <- source_cols_B[!source_cols_B %in% df_cols] -existing_B <- source_cols_B[source_cols_B %in% df_cols] - -cat("\nSource Set B:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_B), "columns\n") -cat(" Missing:", length(missing_B), "columns\n") - -if (length(missing_B) > 0) { - cat("\n Missing columns from Set B:\n") - for (col in missing_B) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_A) > 0 || length(missing_B) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - all_missing 
<- c(missing_A, missing_B) - for (miss_col in all_missing) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_A) > 30 || length(missing_B) > 30) { - stop("ERROR: Too many columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Process each pair of columns -for (i in 1:60) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values from columns, handling missing columns - vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df)) - vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df)) - - # Coalesce: take value from vals_A if present, otherwise from vals_B - combined <- ifelse(!is.na(vals_A) & vals_A != "", - vals_A, - vals_B) - - # Recode to numeric - df[[target_col]] <- recode_likert(combined) - - # Print progress - cat("Processed:", target_col, "\n") -} - -# Save the modified dataframe back to CSV -write.csv(df, "eohi3.csv", row.names = FALSE) - -cat("\nProcessing complete! 60 new columns added to eohi3.csv\n") - diff --git a/.history/eohi2/recode_likert_items_20251001090919.r b/.history/eohi2/recode_likert_items_20251001090919.r deleted file mode 100644 index 3a1025b..0000000 --- a/.history/eohi2/recode_likert_items_20251001090919.r +++ /dev/null @@ -1,164 +0,0 @@ -# Script to combine and recode Likert scale items in eohi2.csv -# Combines 01 and 02 versions of items, then recodes text to numeric values - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE) - -# Define the mapping function -recode_likert <- function(x) { - case_when( - tolower(x) == "strongly disagree" ~ -3, - tolower(x) == "disagree" ~ -2, - tolower(x) == "somewhat disagree" ~ -1, - tolower(x) == "neither agree nor disagree" ~ 0, - tolower(x) == "somewhat agree" ~ 1, - tolower(x) == "agree" ~ 2, - tolower(x) == "strongly agree" ~ 3, - TRUE ~ NA_real_ - ) -} - -# Define source column pairs (Set A and Set B) -source_cols_A <- c( - "01past5PrefItem_1", "01past5PrefItem_2", "01past5PrefItem_3", "01past5PrefItem_4", "01past5PrefItem_5", - "01past5PersItem_1", "01past5PersItem_2", "01past5PersItem_3", "01past5PersItem_4", "01past5PersItem_5", - "01past5ValItem_1", "01past5ValItem_2", "01past5ValItem_3", "01past5ValItem_4", "01past5ValItem_5", - "01past10PrefItem_1", "01past10PrefItem_2", "01past10PrefItem_3", "01past10PrefItem_4", "01past10PrefItem_5", - "01past10PersItem_1", "01past10PersItem_2", "01past10PersItem_3", "01past10PersItem_4", "01past10PersItem_5", - "01past10ValItem_1", "01past10ValItem_2", "01past10ValItem_3", "01past10ValItem_4", "01past10ValItem_5", - "01fut5PrefItem_1", "01fut5PrefItem_2", "01fut5PrefItem_3", "01fut5PrefItem_4", "01fut5PrefItem_5", - "01fut5PersItem_1", "01fut5PersItem_2", "01fut5PersItem_3", "01fut5PersItem_4", "01fut5PersItem_5", - "01fut5ValItem_1", "01fut5ValItem_2", "01fut5ValItem_3", "01fut5ValItem_4", "01fut5ValItem_5", - "01fut10PrefItem_1", "01fut10PrefItem_2", "01fut10PrefItem_3", "01fut10PrefItem_4", "01fut10PrefItem_5", - "01fut10PersItem_1", 
"01fut10PersItem_2", "01fut10PersItem_3", "01fut10PersItem_4", "01fut10PersItem_5", - "01fut10ValItem_1", "01fut10ValItem_2", "01fut10ValItem_3", "01fut10ValItem_4", "01fut10ValItem_5" -) - -source_cols_B <- c( - "02past5PrefItem_1", "02past5PrefItem_2", "02past5PrefItem_3", "02past5PrefItem_4", "02past5PrefItem_5", - "02past5PersItem_1", "02past5PersItem_2", "02past5PersItem_3", "02past5PersItem_4", "02past5PersItem_5", - "02past5ValItem_1", "02past5ValItem_2", "02past5ValItem_3", "02past5ValItem_4", "02past5ValItem_5", - "02past10PrefItem_1", "02past10PrefItem_2", "02past10PrefItem_3", "02past10PrefItem_4", "02past10PrefItem_5", - "02past10PersItem_1", "02past10PersItem_2", "02past10PersItem_3", "02past10PersItem_4", "02past10PersItem_5", - "02past10ValItem_1", "02past10ValItem_2", "02past10ValItem_3", "02past10ValItem_4", "02past10ValItem_5", - "02fut5PrefItem_1", "02fut5PrefItem_2", "02fut5PrefItem_3", "02fut5PrefItem_4", "02fut5PrefItem_5", - "02fut5PersItem_1", "02fut5PersItem_2", "02fut5PersItem_3", "02fut5PersItem_4", "02fut5PersItem_5", - "02fut5ValItem_1", "02fut5ValItem_2", "02fut5ValItem_3", "02fut5ValItem_4", "02fut5ValItem_5", - "02fut10PrefItem_1", "02fut10PrefItem_2", "02fut10PrefItem_3", "02fut10PrefItem_4", "02fut10PrefItem_5", - "02fut10PersItem_1", "02fut10PersItem_2", "02fut10PersItem_3", "02fut10PersItem_4", "02fut10PersItem_5", - "02fut10ValItem_1", "02fut10ValItem_2", "02fut10ValItem_3", "02fut10ValItem_4", "02fut10ValItem_5" -) - -# Define target column names -target_cols <- c( - "past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel", - "past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex", - "past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice", - "past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel", - "past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex", - "past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice", - "fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel", - "fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex", - "fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice", - "fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel", - "fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex", - "fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source A columns -missing_A <- source_cols_A[!source_cols_A %in% df_cols] -existing_A <- source_cols_A[source_cols_A %in% df_cols] - -cat("Source Set A:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_A), "columns\n") -cat(" Missing:", length(missing_A), "columns\n") - -if (length(missing_A) > 0) { - cat("\n Missing columns from Set A:\n") - for (col in missing_A) { - cat(" -", col, "\n") - } -} - -# Check Source B columns -missing_B <- 
source_cols_B[!source_cols_B %in% df_cols] -existing_B <- source_cols_B[source_cols_B %in% df_cols] - -cat("\nSource Set B:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_B), "columns\n") -cat(" Missing:", length(missing_B), "columns\n") - -if (length(missing_B) > 0) { - cat("\n Missing columns from Set B:\n") - for (col in missing_B) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_A) > 0 || length(missing_B) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - all_missing <- c(missing_A, missing_B) - for (miss_col in all_missing) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_A) > 30 || length(missing_B) > 30) { - stop("ERROR: Too many columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Process each pair of columns -for (i in 1:60) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values from columns, handling missing columns - vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df)) - vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df)) - - # Coalesce: take value from vals_A if present, otherwise from vals_B - combined <- ifelse(!is.na(vals_A) & vals_A != "", - vals_A, - vals_B) - - # Recode to numeric - df[[target_col]] <- recode_likert(combined) - - # Print progress - cat("Processed:", target_col, "\n") -} - -# Save the modified dataframe back to CSV -write.csv(df, "eohi3.csv", row.names = FALSE) - -cat("\nProcessing complete! 
60 new columns added to eohi3.csv\n") - diff --git a/.history/eohi2/recode_likert_items_20251001091002.r b/.history/eohi2/recode_likert_items_20251001091002.r deleted file mode 100644 index 1818b13..0000000 --- a/.history/eohi2/recode_likert_items_20251001091002.r +++ /dev/null @@ -1,191 +0,0 @@ -# Script to combine and recode Likert scale items in eohi2.csv -# Combines 01 and 02 versions of items, then recodes text to numeric values - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE) - -# Define the mapping function -recode_likert <- function(x) { - case_when( - tolower(x) == "strongly disagree" ~ -3, - tolower(x) == "disagree" ~ -2, - tolower(x) == "somewhat disagree" ~ -1, - tolower(x) == "neither agree nor disagree" ~ 0, - tolower(x) == "somewhat agree" ~ 1, - tolower(x) == "agree" ~ 2, - tolower(x) == "strongly agree" ~ 3, - TRUE ~ NA_real_ - ) -} - -# Define source column pairs (Set A and Set B) -source_cols_A <- c( - "01past5PrefItem_1", "01past5PrefItem_2", "01past5PrefItem_3", "01past5PrefItem_4", "01past5PrefItem_5", - "01past5PersItem_1", "01past5PersItem_2", "01past5PersItem_3", "01past5PersItem_4", "01past5PersItem_5", - "01past5ValItem_1", "01past5ValItem_2", "01past5ValItem_3", "01past5ValItem_4", "01past5ValItem_5", - "01past10PrefItem_1", "01past10PrefItem_2", "01past10PrefItem_3", "01past10PrefItem_4", "01past10PrefItem_5", - "01past10PersItem_1", "01past10PersItem_2", "01past10PersItem_3", "01past10PersItem_4", "01past10PersItem_5", - "01past10ValItem_1", "01past10ValItem_2", "01past10ValItem_3", "01past10ValItem_4", "01past10ValItem_5", - "01fut5PrefItem_1", "01fut5PrefItem_2", "01fut5PrefItem_3", "01fut5PrefItem_4", "01fut5PrefItem_5", - "01fut5PersItem_1", "01fut5PersItem_2", "01fut5PersItem_3", "01fut5PersItem_4", "01fut5PersItem_5", - "01fut5ValItem_1", "01fut5ValItem_2", "01fut5ValItem_3", "01fut5ValItem_4", "01fut5ValItem_5", - "01fut10PrefItem_1", "01fut10PrefItem_2", "01fut10PrefItem_3", "01fut10PrefItem_4", "01fut10PrefItem_5", - "01fut10PersItem_1", "01fut10PersItem_2", "01fut10PersItem_3", "01fut10PersItem_4", "01fut10PersItem_5", - "01fut10ValItem_1", "01fut10ValItem_2", "01fut10ValItem_3", "01fut10ValItem_4", "01fut10ValItem_5" -) - -source_cols_B <- c( - "02past5PrefItem_1", "02past5PrefItem_2", "02past5PrefItem_3", "02past5PrefItem_4", "02past5PrefItem_5", - "02past5PersItem_1", "02past5PersItem_2", "02past5PersItem_3", "02past5PersItem_4", "02past5PersItem_5", - "02past5ValItem_1", "02past5ValItem_2", "02past5ValItem_3", "02past5ValItem_4", "02past5ValItem_5", - "02past10PrefItem_1", "02past10PrefItem_2", "02past10PrefItem_3", "02past10PrefItem_4", "02past10PrefItem_5", - "02past10PersItem_1", "02past10PersItem_2", "02past10PersItem_3", "02past10PersItem_4", "02past10PersItem_5", - "02past10ValItem_1", "02past10ValItem_2", "02past10ValItem_3", "02past10ValItem_4", "02past10ValItem_5", - "02fut5PrefItem_1", "02fut5PrefItem_2", "02fut5PrefItem_3", "02fut5PrefItem_4", "02fut5PrefItem_5", - "02fut5PersItem_1", "02fut5PersItem_2", "02fut5PersItem_3", "02fut5PersItem_4", "02fut5PersItem_5", - "02fut5ValItem_1", "02fut5ValItem_2", "02fut5ValItem_3", "02fut5ValItem_4", "02fut5ValItem_5", - "02fut10PrefItem_1", "02fut10PrefItem_2", "02fut10PrefItem_3", "02fut10PrefItem_4", "02fut10PrefItem_5", - "02fut10PersItem_1", "02fut10PersItem_2", "02fut10PersItem_3", "02fut10PersItem_4", "02fut10PersItem_5", - "02fut10ValItem_1", "02fut10ValItem_2", 
"02fut10ValItem_3", "02fut10ValItem_4", "02fut10ValItem_5" -) - -# Define target column names -target_cols <- c( - "past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel", - "past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex", - "past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice", - "past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel", - "past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex", - "past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice", - "fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel", - "fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex", - "fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice", - "fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel", - "fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex", - "fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source A columns -missing_A <- source_cols_A[!source_cols_A %in% df_cols] -existing_A <- source_cols_A[source_cols_A %in% df_cols] - -cat("Source Set A:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_A), "columns\n") -cat(" Missing:", length(missing_A), "columns\n") - -if (length(missing_A) > 0) { - cat("\n Missing columns from Set A:\n") - for (col in missing_A) { - cat(" -", col, "\n") - } -} - -# Check Source B columns -missing_B <- source_cols_B[!source_cols_B %in% df_cols] -existing_B <- source_cols_B[source_cols_B %in% df_cols] - -cat("\nSource Set B:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_B), "columns\n") -cat(" Missing:", length(missing_B), "columns\n") - -if (length(missing_B) > 0) { - cat("\n Missing columns from Set B:\n") - for (col in missing_B) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_A) > 0 || length(missing_B) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - all_missing <- c(missing_A, missing_B) - for (miss_col in all_missing) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_A) > 30 || length(missing_B) > 30) { - stop("ERROR: Too many columns missing! 
Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Process each pair of columns -for (i in 1:60) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values from columns, handling missing columns - vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df)) - vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df)) - - # Coalesce: take value from vals_A if present, otherwise from vals_B - combined <- ifelse(!is.na(vals_A) & vals_A != "", - vals_A, - vals_B) - - # Recode to numeric - df[[target_col]] <- recode_likert(combined) - - # Print progress - cat("Processed:", target_col, "\n") -} - -# ============= VERIFY TARGET COLUMNS WERE CREATED ============= -cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n") - -# Get updated column names -df_cols_after <- trimws(names(df)) - -# Check which target columns exist -existing_targets <- target_cols[target_cols %in% df_cols_after] -missing_targets <- target_cols[!target_cols %in% df_cols_after] - -cat("Target Columns:\n") -cat(" Expected: 60 columns\n") -cat(" Created:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n WARNING: The following target columns were NOT created:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } - stop("\nERROR: Not all target columns were created successfully!") -} else { - cat("\n SUCCESS: All 60 target columns created successfully!\n") -} - -cat("\n=== END VERIFICATION ===\n\n") - -# Save the modified dataframe back to CSV -write.csv(df, "eohi3.csv", row.names = FALSE) - -cat("\nProcessing complete! 60 new columns added to eohi3.csv\n") - diff --git a/.history/eohi2/recode_likert_items_20251001091011.r b/.history/eohi2/recode_likert_items_20251001091011.r deleted file mode 100644 index 1818b13..0000000 --- a/.history/eohi2/recode_likert_items_20251001091011.r +++ /dev/null @@ -1,191 +0,0 @@ -# Script to combine and recode Likert scale items in eohi2.csv -# Combines 01 and 02 versions of items, then recodes text to numeric values - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE) - -# Define the mapping function -recode_likert <- function(x) { - case_when( - tolower(x) == "strongly disagree" ~ -3, - tolower(x) == "disagree" ~ -2, - tolower(x) == "somewhat disagree" ~ -1, - tolower(x) == "neither agree nor disagree" ~ 0, - tolower(x) == "somewhat agree" ~ 1, - tolower(x) == "agree" ~ 2, - tolower(x) == "strongly agree" ~ 3, - TRUE ~ NA_real_ - ) -} - -# Define source column pairs (Set A and Set B) -source_cols_A <- c( - "01past5PrefItem_1", "01past5PrefItem_2", "01past5PrefItem_3", "01past5PrefItem_4", "01past5PrefItem_5", - "01past5PersItem_1", "01past5PersItem_2", "01past5PersItem_3", "01past5PersItem_4", "01past5PersItem_5", - "01past5ValItem_1", "01past5ValItem_2", "01past5ValItem_3", "01past5ValItem_4", "01past5ValItem_5", - "01past10PrefItem_1", "01past10PrefItem_2", "01past10PrefItem_3", "01past10PrefItem_4", "01past10PrefItem_5", - "01past10PersItem_1", "01past10PersItem_2", "01past10PersItem_3", "01past10PersItem_4", "01past10PersItem_5", - "01past10ValItem_1", "01past10ValItem_2", "01past10ValItem_3", "01past10ValItem_4", "01past10ValItem_5", - "01fut5PrefItem_1", "01fut5PrefItem_2", "01fut5PrefItem_3", "01fut5PrefItem_4", "01fut5PrefItem_5", - "01fut5PersItem_1", 
"01fut5PersItem_2", "01fut5PersItem_3", "01fut5PersItem_4", "01fut5PersItem_5", - "01fut5ValItem_1", "01fut5ValItem_2", "01fut5ValItem_3", "01fut5ValItem_4", "01fut5ValItem_5", - "01fut10PrefItem_1", "01fut10PrefItem_2", "01fut10PrefItem_3", "01fut10PrefItem_4", "01fut10PrefItem_5", - "01fut10PersItem_1", "01fut10PersItem_2", "01fut10PersItem_3", "01fut10PersItem_4", "01fut10PersItem_5", - "01fut10ValItem_1", "01fut10ValItem_2", "01fut10ValItem_3", "01fut10ValItem_4", "01fut10ValItem_5" -) - -source_cols_B <- c( - "02past5PrefItem_1", "02past5PrefItem_2", "02past5PrefItem_3", "02past5PrefItem_4", "02past5PrefItem_5", - "02past5PersItem_1", "02past5PersItem_2", "02past5PersItem_3", "02past5PersItem_4", "02past5PersItem_5", - "02past5ValItem_1", "02past5ValItem_2", "02past5ValItem_3", "02past5ValItem_4", "02past5ValItem_5", - "02past10PrefItem_1", "02past10PrefItem_2", "02past10PrefItem_3", "02past10PrefItem_4", "02past10PrefItem_5", - "02past10PersItem_1", "02past10PersItem_2", "02past10PersItem_3", "02past10PersItem_4", "02past10PersItem_5", - "02past10ValItem_1", "02past10ValItem_2", "02past10ValItem_3", "02past10ValItem_4", "02past10ValItem_5", - "02fut5PrefItem_1", "02fut5PrefItem_2", "02fut5PrefItem_3", "02fut5PrefItem_4", "02fut5PrefItem_5", - "02fut5PersItem_1", "02fut5PersItem_2", "02fut5PersItem_3", "02fut5PersItem_4", "02fut5PersItem_5", - "02fut5ValItem_1", "02fut5ValItem_2", "02fut5ValItem_3", "02fut5ValItem_4", "02fut5ValItem_5", - "02fut10PrefItem_1", "02fut10PrefItem_2", "02fut10PrefItem_3", "02fut10PrefItem_4", "02fut10PrefItem_5", - "02fut10PersItem_1", "02fut10PersItem_2", "02fut10PersItem_3", "02fut10PersItem_4", "02fut10PersItem_5", - "02fut10ValItem_1", "02fut10ValItem_2", "02fut10ValItem_3", "02fut10ValItem_4", "02fut10ValItem_5" -) - -# Define target column names -target_cols <- c( - "past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel", - "past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex", - "past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice", - "past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel", - "past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex", - "past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice", - "fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel", - "fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex", - "fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice", - "fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel", - "fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex", - "fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source A columns -missing_A <- source_cols_A[!source_cols_A %in% df_cols] -existing_A <- source_cols_A[source_cols_A %in% df_cols] - -cat("Source Set A:\n") -cat(" 
Expected: 60 columns\n") -cat(" Found:", length(existing_A), "columns\n") -cat(" Missing:", length(missing_A), "columns\n") - -if (length(missing_A) > 0) { - cat("\n Missing columns from Set A:\n") - for (col in missing_A) { - cat(" -", col, "\n") - } -} - -# Check Source B columns -missing_B <- source_cols_B[!source_cols_B %in% df_cols] -existing_B <- source_cols_B[source_cols_B %in% df_cols] - -cat("\nSource Set B:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_B), "columns\n") -cat(" Missing:", length(missing_B), "columns\n") - -if (length(missing_B) > 0) { - cat("\n Missing columns from Set B:\n") - for (col in missing_B) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_A) > 0 || length(missing_B) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - all_missing <- c(missing_A, missing_B) - for (miss_col in all_missing) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_A) > 30 || length(missing_B) > 30) { - stop("ERROR: Too many columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Process each pair of columns -for (i in 1:60) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values from columns, handling missing columns - vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df)) - vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df)) - - # Coalesce: take value from vals_A if present, otherwise from vals_B - combined <- ifelse(!is.na(vals_A) & vals_A != "", - vals_A, - vals_B) - - # Recode to numeric - df[[target_col]] <- recode_likert(combined) - - # Print progress - cat("Processed:", target_col, "\n") -} - -# ============= VERIFY TARGET COLUMNS WERE CREATED ============= -cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n") - -# Get updated column names -df_cols_after <- trimws(names(df)) - -# Check which target columns exist -existing_targets <- target_cols[target_cols %in% df_cols_after] -missing_targets <- target_cols[!target_cols %in% df_cols_after] - -cat("Target Columns:\n") -cat(" Expected: 60 columns\n") -cat(" Created:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n WARNING: The following target columns were NOT created:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } - stop("\nERROR: Not all target columns were created successfully!") -} else { - cat("\n SUCCESS: All 60 target columns created successfully!\n") -} - -cat("\n=== END VERIFICATION ===\n\n") - -# Save the modified dataframe back to CSV -write.csv(df, "eohi3.csv", row.names = FALSE) - -cat("\nProcessing complete! 
60 new columns added to eohi3.csv\n") - diff --git a/.history/eohi2/recode_likert_items_20251001091016.r b/.history/eohi2/recode_likert_items_20251001091016.r deleted file mode 100644 index 1818b13..0000000 --- a/.history/eohi2/recode_likert_items_20251001091016.r +++ /dev/null @@ -1,191 +0,0 @@ -# Script to combine and recode Likert scale items in eohi2.csv -# Combines 01 and 02 versions of items, then recodes text to numeric values - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE) - -# Define the mapping function -recode_likert <- function(x) { - case_when( - tolower(x) == "strongly disagree" ~ -3, - tolower(x) == "disagree" ~ -2, - tolower(x) == "somewhat disagree" ~ -1, - tolower(x) == "neither agree nor disagree" ~ 0, - tolower(x) == "somewhat agree" ~ 1, - tolower(x) == "agree" ~ 2, - tolower(x) == "strongly agree" ~ 3, - TRUE ~ NA_real_ - ) -} - -# Define source column pairs (Set A and Set B) -source_cols_A <- c( - "01past5PrefItem_1", "01past5PrefItem_2", "01past5PrefItem_3", "01past5PrefItem_4", "01past5PrefItem_5", - "01past5PersItem_1", "01past5PersItem_2", "01past5PersItem_3", "01past5PersItem_4", "01past5PersItem_5", - "01past5ValItem_1", "01past5ValItem_2", "01past5ValItem_3", "01past5ValItem_4", "01past5ValItem_5", - "01past10PrefItem_1", "01past10PrefItem_2", "01past10PrefItem_3", "01past10PrefItem_4", "01past10PrefItem_5", - "01past10PersItem_1", "01past10PersItem_2", "01past10PersItem_3", "01past10PersItem_4", "01past10PersItem_5", - "01past10ValItem_1", "01past10ValItem_2", "01past10ValItem_3", "01past10ValItem_4", "01past10ValItem_5", - "01fut5PrefItem_1", "01fut5PrefItem_2", "01fut5PrefItem_3", "01fut5PrefItem_4", "01fut5PrefItem_5", - "01fut5PersItem_1", "01fut5PersItem_2", "01fut5PersItem_3", "01fut5PersItem_4", "01fut5PersItem_5", - "01fut5ValItem_1", "01fut5ValItem_2", "01fut5ValItem_3", "01fut5ValItem_4", "01fut5ValItem_5", - "01fut10PrefItem_1", "01fut10PrefItem_2", "01fut10PrefItem_3", "01fut10PrefItem_4", "01fut10PrefItem_5", - "01fut10PersItem_1", "01fut10PersItem_2", "01fut10PersItem_3", "01fut10PersItem_4", "01fut10PersItem_5", - "01fut10ValItem_1", "01fut10ValItem_2", "01fut10ValItem_3", "01fut10ValItem_4", "01fut10ValItem_5" -) - -source_cols_B <- c( - "02past5PrefItem_1", "02past5PrefItem_2", "02past5PrefItem_3", "02past5PrefItem_4", "02past5PrefItem_5", - "02past5PersItem_1", "02past5PersItem_2", "02past5PersItem_3", "02past5PersItem_4", "02past5PersItem_5", - "02past5ValItem_1", "02past5ValItem_2", "02past5ValItem_3", "02past5ValItem_4", "02past5ValItem_5", - "02past10PrefItem_1", "02past10PrefItem_2", "02past10PrefItem_3", "02past10PrefItem_4", "02past10PrefItem_5", - "02past10PersItem_1", "02past10PersItem_2", "02past10PersItem_3", "02past10PersItem_4", "02past10PersItem_5", - "02past10ValItem_1", "02past10ValItem_2", "02past10ValItem_3", "02past10ValItem_4", "02past10ValItem_5", - "02fut5PrefItem_1", "02fut5PrefItem_2", "02fut5PrefItem_3", "02fut5PrefItem_4", "02fut5PrefItem_5", - "02fut5PersItem_1", "02fut5PersItem_2", "02fut5PersItem_3", "02fut5PersItem_4", "02fut5PersItem_5", - "02fut5ValItem_1", "02fut5ValItem_2", "02fut5ValItem_3", "02fut5ValItem_4", "02fut5ValItem_5", - "02fut10PrefItem_1", "02fut10PrefItem_2", "02fut10PrefItem_3", "02fut10PrefItem_4", "02fut10PrefItem_5", - "02fut10PersItem_1", "02fut10PersItem_2", "02fut10PersItem_3", "02fut10PersItem_4", "02fut10PersItem_5", - "02fut10ValItem_1", "02fut10ValItem_2", 
"02fut10ValItem_3", "02fut10ValItem_4", "02fut10ValItem_5" -) - -# Define target column names -target_cols <- c( - "past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel", - "past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex", - "past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice", - "past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel", - "past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex", - "past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice", - "fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel", - "fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex", - "fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice", - "fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel", - "fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex", - "fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source A columns -missing_A <- source_cols_A[!source_cols_A %in% df_cols] -existing_A <- source_cols_A[source_cols_A %in% df_cols] - -cat("Source Set A:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_A), "columns\n") -cat(" Missing:", length(missing_A), "columns\n") - -if (length(missing_A) > 0) { - cat("\n Missing columns from Set A:\n") - for (col in missing_A) { - cat(" -", col, "\n") - } -} - -# Check Source B columns -missing_B <- source_cols_B[!source_cols_B %in% df_cols] -existing_B <- source_cols_B[source_cols_B %in% df_cols] - -cat("\nSource Set B:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_B), "columns\n") -cat(" Missing:", length(missing_B), "columns\n") - -if (length(missing_B) > 0) { - cat("\n Missing columns from Set B:\n") - for (col in missing_B) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_A) > 0 || length(missing_B) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - all_missing <- c(missing_A, missing_B) - for (miss_col in all_missing) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_A) > 30 || length(missing_B) > 30) { - stop("ERROR: Too many columns missing! 
Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Process each pair of columns -for (i in 1:60) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values from columns, handling missing columns - vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df)) - vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df)) - - # Coalesce: take value from vals_A if present, otherwise from vals_B - combined <- ifelse(!is.na(vals_A) & vals_A != "", - vals_A, - vals_B) - - # Recode to numeric - df[[target_col]] <- recode_likert(combined) - - # Print progress - cat("Processed:", target_col, "\n") -} - -# ============= VERIFY TARGET COLUMNS WERE CREATED ============= -cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n") - -# Get updated column names -df_cols_after <- trimws(names(df)) - -# Check which target columns exist -existing_targets <- target_cols[target_cols %in% df_cols_after] -missing_targets <- target_cols[!target_cols %in% df_cols_after] - -cat("Target Columns:\n") -cat(" Expected: 60 columns\n") -cat(" Created:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n WARNING: The following target columns were NOT created:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } - stop("\nERROR: Not all target columns were created successfully!") -} else { - cat("\n SUCCESS: All 60 target columns created successfully!\n") -} - -cat("\n=== END VERIFICATION ===\n\n") - -# Save the modified dataframe back to CSV -write.csv(df, "eohi3.csv", row.names = FALSE) - -cat("\nProcessing complete! 60 new columns added to eohi3.csv\n") - diff --git a/.history/eohi2/recode_likert_items_20251001091544.r b/.history/eohi2/recode_likert_items_20251001091544.r deleted file mode 100644 index bae773a..0000000 --- a/.history/eohi2/recode_likert_items_20251001091544.r +++ /dev/null @@ -1,191 +0,0 @@ -# Script to combine and recode Likert scale items in eohi2.csv -# Combines 01 and 02 versions of items, then recodes text to numeric values - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE) - -# Define the mapping function -recode_likert <- function(x) { - case_when( - tolower(x) == "strongly disagree" ~ -3, - tolower(x) == "disagree" ~ -2, - tolower(x) == "somewhat disagree" ~ -1, - tolower(x) == "neither agree nor disagree" ~ 0, - tolower(x) == "somewhat agree" ~ 1, - tolower(x) == "agree" ~ 2, - tolower(x) == "strongly agree" ~ 3, - TRUE ~ NA_real_ - ) -} - -# Define source column pairs (Set A and Set B) -source_cols_A <- c( - "01past5PrefItem_1", "01past5PrefItem_2", "01past5PrefItem_3", "01past5PrefItem_4", "01past5PrefItem_5", - "01past5PersItem_1", "01past5PersItem_2", "01past5PersItem_3", "01past5PersItem_4", "01past5PersItem_5", - "01past5ValItem_1", "01past5ValItem_2", "01past5ValItem_3", "01past5ValItem_4", "01past5ValItem_5", - "01past10PrefItem_1", "01past10PrefItem_2", "01past10PrefItem_3", "01past10PrefItem_4", "01past10PrefItem_5", - "01past10PersItem_1", "01past10PersItem_2", "01past10PersItem_3", "01past10PersItem_4", "01past10PersItem_5", - "01past10ValItem_1", "01past10ValItem_2", "01past10ValItem_3", "01past10ValItem_4", "01past10ValItem_5", - "01fut5PrefItem_1", "01fut5PrefItem_2", "01fut5PrefItem_3", 
"01fut5PrefItem_4", "01fut5PrefItem_5", - "01fut5PersItem_1", "01fut5PersItem_2", "01fut5PersItem_3", "01fut5PersItem_4", "01fut5PersItem_5", - "01fut5ValItem_1", "01fut5ValItem_2", "01fut5ValItem_3", "01fut5ValItem_4", "01fut5ValItem_5", - "01fut10PrefItem_1", "01fut10PrefItem_2", "01fut10PrefItem_3", "01fut10PrefItem_4", "01fut10PrefItem_5", - "01fut10PersItem_1", "01fut10PersItem_2", "01fut10PersItem_3", "01fut10PersItem_4", "01fut10PersItem_5", - "01fut10ValItem_1", "01fut10ValItem_2", "01fut10ValItem_3", "01fut10ValItem_4", "01fut10ValItem_5" -) - -source_cols_B <- c( - "02past5PrefItem_1", "02past5PrefItem_2", "02past5PrefItem_3", "02past5PrefItem_4", "02past5PrefItem_5", - "02past5PersItem_1", "02past5PersItem_2", "02past5PersItem_3", "02past5PersItem_4", "02past5PersItem_5", - "02past5ValItem_1", "02past5ValItem_2", "02past5ValItem_3", "02past5ValItem_4", "02past5ValItem_5", - "02past10PrefItem_1", "02past10PrefItem_2", "02past10PrefItem_3", "02past10PrefItem_4", "02past10PrefItem_5", - "02past10PersItem_1", "02past10PersItem_2", "02past10PersItem_3", "02past10PersItem_4", "02past10PersItem_5", - "02past10ValItem_1", "02past10ValItem_2", "02past10ValItem_3", "02past10ValItem_4", "02past10ValItem_5", - "02fut5PrefItem_1", "02fut5PrefItem_2", "02fut5PrefItem_3", "02fut5PrefItem_4", "02fut5PrefItem_5", - "02fut5PersItem_1", "02fut5PersItem_2", "02fut5PersItem_3", "02fut5PersItem_4", "02fut5PersItem_5", - "02fut5ValItem_1", "02fut5ValItem_2", "02fut5ValItem_3", "02fut5ValItem_4", "02fut5ValItem_5", - "02fut10PrefItem_1", "02fut10PrefItem_2", "02fut10PrefItem_3", "02fut10PrefItem_4", "02fut10PrefItem_5", - "02fut10PersItem_1", "02fut10PersItem_2", "02fut10PersItem_3", "02fut10PersItem_4", "02fut10PersItem_5", - "02fut10ValItem_1", "02fut10ValItem_2", "02fut10ValItem_3", "02fut10ValItem_4", "02fut10ValItem_5" -) - -# Define target column names -target_cols <- c( - "past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel", - "past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex", - "past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice", - "past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel", - "past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex", - "past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice", - "fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel", - "fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex", - "fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice", - "fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel", - "fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex", - "fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Check Source A columns -missing_A <- source_cols_A[!source_cols_A %in% df_cols] -existing_A <- 
source_cols_A[source_cols_A %in% df_cols] - -cat("Source Set A:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_A), "columns\n") -cat(" Missing:", length(missing_A), "columns\n") - -if (length(missing_A) > 0) { - cat("\n Missing columns from Set A:\n") - for (col in missing_A) { - cat(" -", col, "\n") - } -} - -# Check Source B columns -missing_B <- source_cols_B[!source_cols_B %in% df_cols] -existing_B <- source_cols_B[source_cols_B %in% df_cols] - -cat("\nSource Set B:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_B), "columns\n") -cat(" Missing:", length(missing_B), "columns\n") - -if (length(missing_B) > 0) { - cat("\n Missing columns from Set B:\n") - for (col in missing_B) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_A) > 0 || length(missing_B) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - all_missing <- c(missing_A, missing_B) - for (miss_col in all_missing) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_A) > 30 || length(missing_B) > 30) { - stop("ERROR: Too many columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Process each pair of columns -for (i in 1:60) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values from columns, handling missing columns - vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df)) - vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df)) - - # Coalesce: take value from vals_A if present, otherwise from vals_B - combined <- ifelse(!is.na(vals_A) & vals_A != "", - vals_A, - vals_B) - - # Recode to numeric - df[[target_col]] <- recode_likert(combined) - - # Print progress - cat("Processed:", target_col, "\n") -} - -# ============= VERIFY TARGET COLUMNS WERE CREATED ============= -cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n") - -# Get updated column names -df_cols_after <- trimws(names(df)) - -# Check which target columns exist -existing_targets <- target_cols[target_cols %in% df_cols_after] -missing_targets <- target_cols[!target_cols %in% df_cols_after] - -cat("Target Columns:\n") -cat(" Expected: 60 columns\n") -cat(" Created:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n WARNING: The following target columns were NOT created:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } - stop("\nERROR: Not all target columns were created successfully!") -} else { - cat("\n SUCCESS: All 60 target columns created successfully!\n") -} - -cat("\n=== END VERIFICATION ===\n\n") - -# Save the modified dataframe back to CSV -write.csv(df, "eohi3.csv", row.names = FALSE) - -cat("\nProcessing complete! 
60 new columns added to eohi3.csv\n") - diff --git a/.history/eohi2/recode_likert_items_20251001091551.r b/.history/eohi2/recode_likert_items_20251001091551.r deleted file mode 100644 index 015fe05..0000000 --- a/.history/eohi2/recode_likert_items_20251001091551.r +++ /dev/null @@ -1,198 +0,0 @@ -# Script to combine and recode Likert scale items in eohi2.csv -# Combines 01 and 02 versions of items, then recodes text to numeric values - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE) - -# Define the mapping function -recode_likert <- function(x) { - case_when( - tolower(x) == "strongly disagree" ~ -3, - tolower(x) == "disagree" ~ -2, - tolower(x) == "somewhat disagree" ~ -1, - tolower(x) == "neither agree nor disagree" ~ 0, - tolower(x) == "somewhat agree" ~ 1, - tolower(x) == "agree" ~ 2, - tolower(x) == "strongly agree" ~ 3, - TRUE ~ NA_real_ - ) -} - -# Define source column pairs (Set A and Set B) -source_cols_A <- c( - "01past5PrefItem_1", "01past5PrefItem_2", "01past5PrefItem_3", "01past5PrefItem_4", "01past5PrefItem_5", - "01past5PersItem_1", "01past5PersItem_2", "01past5PersItem_3", "01past5PersItem_4", "01past5PersItem_5", - "01past5ValItem_1", "01past5ValItem_2", "01past5ValItem_3", "01past5ValItem_4", "01past5ValItem_5", - "01past10PrefItem_1", "01past10PrefItem_2", "01past10PrefItem_3", "01past10PrefItem_4", "01past10PrefItem_5", - "01past10PersItem_1", "01past10PersItem_2", "01past10PersItem_3", "01past10PersItem_4", "01past10PersItem_5", - "01past10ValItem_1", "01past10ValItem_2", "01past10ValItem_3", "01past10ValItem_4", "01past10ValItem_5", - "01fut5PrefItem_1", "01fut5PrefItem_2", "01fut5PrefItem_3", "01fut5PrefItem_4", "01fut5PrefItem_5", - "01fut5PersItem_1", "01fut5PersItem_2", "01fut5PersItem_3", "01fut5PersItem_4", "01fut5PersItem_5", - "01fut5ValItem_1", "01fut5ValItem_2", "01fut5ValItem_3", "01fut5ValItem_4", "01fut5ValItem_5", - "01fut10PrefItem_1", "01fut10PrefItem_2", "01fut10PrefItem_3", "01fut10PrefItem_4", "01fut10PrefItem_5", - "01fut10PersItem_1", "01fut10PersItem_2", "01fut10PersItem_3", "01fut10PersItem_4", "01fut10PersItem_5", - "01fut10ValItem_1", "01fut10ValItem_2", "01fut10ValItem_3", "01fut10ValItem_4", "01fut10ValItem_5" -) - -source_cols_B <- c( - "02past5PrefItem_1", "02past5PrefItem_2", "02past5PrefItem_3", "02past5PrefItem_4", "02past5PrefItem_5", - "02past5PersItem_1", "02past5PersItem_2", "02past5PersItem_3", "02past5PersItem_4", "02past5PersItem_5", - "02past5ValItem_1", "02past5ValItem_2", "02past5ValItem_3", "02past5ValItem_4", "02past5ValItem_5", - "02past10PrefItem_1", "02past10PrefItem_2", "02past10PrefItem_3", "02past10PrefItem_4", "02past10PrefItem_5", - "02past10PersItem_1", "02past10PersItem_2", "02past10PersItem_3", "02past10PersItem_4", "02past10PersItem_5", - "02past10ValItem_1", "02past10ValItem_2", "02past10ValItem_3", "02past10ValItem_4", "02past10ValItem_5", - "02fut5PrefItem_1", "02fut5PrefItem_2", "02fut5PrefItem_3", "02fut5PrefItem_4", "02fut5PrefItem_5", - "02fut5PersItem_1", "02fut5PersItem_2", "02fut5PersItem_3", "02fut5PersItem_4", "02fut5PersItem_5", - "02fut5ValItem_1", "02fut5ValItem_2", "02fut5ValItem_3", "02fut5ValItem_4", "02fut5ValItem_5", - "02fut10PrefItem_1", "02fut10PrefItem_2", "02fut10PrefItem_3", "02fut10PrefItem_4", "02fut10PrefItem_5", - "02fut10PersItem_1", "02fut10PersItem_2", "02fut10PersItem_3", 
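# --- editorial aside; not part of the original snapshot ---
# The processing loop below coalesces each A column over its B partner
# with ifelse(!is.na(vals_A) & vals_A != "", vals_A, vals_B). Since dplyr
# is already attached, an equivalent sketch is:
#   combined <- dplyr::coalesce(dplyr::na_if(vals_A, ""), vals_B)
# na_if() turns "" into NA, so coalesce() falls through to vals_B in
# exactly the two cases the ifelse() handles (NA or empty string in A).
# -----------------------------------------------------------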
"02fut10PersItem_4", "02fut10PersItem_5", - "02fut10ValItem_1", "02fut10ValItem_2", "02fut10ValItem_3", "02fut10ValItem_4", "02fut10ValItem_5" -) - -# Define target column names -target_cols <- c( - "past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel", - "past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex", - "past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice", - "past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel", - "past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex", - "past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice", - "fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel", - "fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex", - "fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice", - "fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel", - "fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex", - "fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Print first 30 actual column names for debugging -cat("First 30 actual column names in CSV:\n") -for (i in 1:min(30, length(df_cols))) { - cat(sprintf(" %2d. 
'%s' (length: %d)\n", i, df_cols[i], nchar(df_cols[i]))) -} -cat("\n") - -# Check Source A columns -missing_A <- source_cols_A[!source_cols_A %in% df_cols] -existing_A <- source_cols_A[source_cols_A %in% df_cols] - -cat("Source Set A:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_A), "columns\n") -cat(" Missing:", length(missing_A), "columns\n") - -if (length(missing_A) > 0) { - cat("\n Missing columns from Set A:\n") - for (col in missing_A) { - cat(" -", col, "\n") - } -} - -# Check Source B columns -missing_B <- source_cols_B[!source_cols_B %in% df_cols] -existing_B <- source_cols_B[source_cols_B %in% df_cols] - -cat("\nSource Set B:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_B), "columns\n") -cat(" Missing:", length(missing_B), "columns\n") - -if (length(missing_B) > 0) { - cat("\n Missing columns from Set B:\n") - for (col in missing_B) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_A) > 0 || length(missing_B) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - all_missing <- c(missing_A, missing_B) - for (miss_col in all_missing) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_A) > 30 || length(missing_B) > 30) { - stop("ERROR: Too many columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Process each pair of columns -for (i in 1:60) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values from columns, handling missing columns - vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df)) - vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df)) - - # Coalesce: take value from vals_A if present, otherwise from vals_B - combined <- ifelse(!is.na(vals_A) & vals_A != "", - vals_A, - vals_B) - - # Recode to numeric - df[[target_col]] <- recode_likert(combined) - - # Print progress - cat("Processed:", target_col, "\n") -} - -# ============= VERIFY TARGET COLUMNS WERE CREATED ============= -cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n") - -# Get updated column names -df_cols_after <- trimws(names(df)) - -# Check which target columns exist -existing_targets <- target_cols[target_cols %in% df_cols_after] -missing_targets <- target_cols[!target_cols %in% df_cols_after] - -cat("Target Columns:\n") -cat(" Expected: 60 columns\n") -cat(" Created:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n WARNING: The following target columns were NOT created:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } - stop("\nERROR: Not all target columns were created successfully!") -} else { - cat("\n SUCCESS: All 60 target columns created successfully!\n") -} - -cat("\n=== END VERIFICATION ===\n\n") - -# Save the modified dataframe back to CSV -write.csv(df, "eohi3.csv", row.names = FALSE) - -cat("\nProcessing complete! 
60 new columns added to eohi3.csv\n") - diff --git a/.history/eohi2/recode_likert_items_20251001091602.r b/.history/eohi2/recode_likert_items_20251001091602.r deleted file mode 100644 index 015fe05..0000000 --- a/.history/eohi2/recode_likert_items_20251001091602.r +++ /dev/null @@ -1,198 +0,0 @@ -# Script to combine and recode Likert scale items in eohi2.csv -# Combines 01 and 02 versions of items, then recodes text to numeric values - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE) - -# Define the mapping function -recode_likert <- function(x) { - case_when( - tolower(x) == "strongly disagree" ~ -3, - tolower(x) == "disagree" ~ -2, - tolower(x) == "somewhat disagree" ~ -1, - tolower(x) == "neither agree nor disagree" ~ 0, - tolower(x) == "somewhat agree" ~ 1, - tolower(x) == "agree" ~ 2, - tolower(x) == "strongly agree" ~ 3, - TRUE ~ NA_real_ - ) -} - -# Define source column pairs (Set A and Set B) -source_cols_A <- c( - "01past5PrefItem_1", "01past5PrefItem_2", "01past5PrefItem_3", "01past5PrefItem_4", "01past5PrefItem_5", - "01past5PersItem_1", "01past5PersItem_2", "01past5PersItem_3", "01past5PersItem_4", "01past5PersItem_5", - "01past5ValItem_1", "01past5ValItem_2", "01past5ValItem_3", "01past5ValItem_4", "01past5ValItem_5", - "01past10PrefItem_1", "01past10PrefItem_2", "01past10PrefItem_3", "01past10PrefItem_4", "01past10PrefItem_5", - "01past10PersItem_1", "01past10PersItem_2", "01past10PersItem_3", "01past10PersItem_4", "01past10PersItem_5", - "01past10ValItem_1", "01past10ValItem_2", "01past10ValItem_3", "01past10ValItem_4", "01past10ValItem_5", - "01fut5PrefItem_1", "01fut5PrefItem_2", "01fut5PrefItem_3", "01fut5PrefItem_4", "01fut5PrefItem_5", - "01fut5PersItem_1", "01fut5PersItem_2", "01fut5PersItem_3", "01fut5PersItem_4", "01fut5PersItem_5", - "01fut5ValItem_1", "01fut5ValItem_2", "01fut5ValItem_3", "01fut5ValItem_4", "01fut5ValItem_5", - "01fut10PrefItem_1", "01fut10PrefItem_2", "01fut10PrefItem_3", "01fut10PrefItem_4", "01fut10PrefItem_5", - "01fut10PersItem_1", "01fut10PersItem_2", "01fut10PersItem_3", "01fut10PersItem_4", "01fut10PersItem_5", - "01fut10ValItem_1", "01fut10ValItem_2", "01fut10ValItem_3", "01fut10ValItem_4", "01fut10ValItem_5" -) - -source_cols_B <- c( - "02past5PrefItem_1", "02past5PrefItem_2", "02past5PrefItem_3", "02past5PrefItem_4", "02past5PrefItem_5", - "02past5PersItem_1", "02past5PersItem_2", "02past5PersItem_3", "02past5PersItem_4", "02past5PersItem_5", - "02past5ValItem_1", "02past5ValItem_2", "02past5ValItem_3", "02past5ValItem_4", "02past5ValItem_5", - "02past10PrefItem_1", "02past10PrefItem_2", "02past10PrefItem_3", "02past10PrefItem_4", "02past10PrefItem_5", - "02past10PersItem_1", "02past10PersItem_2", "02past10PersItem_3", "02past10PersItem_4", "02past10PersItem_5", - "02past10ValItem_1", "02past10ValItem_2", "02past10ValItem_3", "02past10ValItem_4", "02past10ValItem_5", - "02fut5PrefItem_1", "02fut5PrefItem_2", "02fut5PrefItem_3", "02fut5PrefItem_4", "02fut5PrefItem_5", - "02fut5PersItem_1", "02fut5PersItem_2", "02fut5PersItem_3", "02fut5PersItem_4", "02fut5PersItem_5", - "02fut5ValItem_1", "02fut5ValItem_2", "02fut5ValItem_3", "02fut5ValItem_4", "02fut5ValItem_5", - "02fut10PrefItem_1", "02fut10PrefItem_2", "02fut10PrefItem_3", "02fut10PrefItem_4", "02fut10PrefItem_5", - "02fut10PersItem_1", "02fut10PersItem_2", "02fut10PersItem_3", 
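# --- editorial aside; not part of the original snapshot ---
# The case_when() ladder in recode_likert() is a fixed string-to-number
# map, so a named-vector lookup is an equivalent, shorter sketch (the
# trimws() call is an addition the original does not perform):
#   likert_map <- c("strongly disagree" = -3, "disagree" = -2,
#                   "somewhat disagree" = -1,
#                   "neither agree nor disagree" = 0,
#                   "somewhat agree" = 1, "agree" = 2,
#                   "strongly agree" = 3)
#   recode_likert <- function(x) unname(likert_map[tolower(trimws(x))])
# Unmatched strings (and NA inputs) index to NA, matching the
# TRUE ~ NA_real_ fallback.
# -----------------------------------------------------------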
"02fut10PersItem_4", "02fut10PersItem_5", - "02fut10ValItem_1", "02fut10ValItem_2", "02fut10ValItem_3", "02fut10ValItem_4", "02fut10ValItem_5" -) - -# Define target column names -target_cols <- c( - "past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel", - "past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex", - "past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice", - "past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel", - "past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex", - "past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice", - "fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel", - "fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex", - "fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice", - "fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel", - "fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex", - "fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Print first 30 actual column names for debugging -cat("First 30 actual column names in CSV:\n") -for (i in 1:min(30, length(df_cols))) { - cat(sprintf(" %2d. 
'%s' (length: %d)\n", i, df_cols[i], nchar(df_cols[i]))) -} -cat("\n") - -# Check Source A columns -missing_A <- source_cols_A[!source_cols_A %in% df_cols] -existing_A <- source_cols_A[source_cols_A %in% df_cols] - -cat("Source Set A:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_A), "columns\n") -cat(" Missing:", length(missing_A), "columns\n") - -if (length(missing_A) > 0) { - cat("\n Missing columns from Set A:\n") - for (col in missing_A) { - cat(" -", col, "\n") - } -} - -# Check Source B columns -missing_B <- source_cols_B[!source_cols_B %in% df_cols] -existing_B <- source_cols_B[source_cols_B %in% df_cols] - -cat("\nSource Set B:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_B), "columns\n") -cat(" Missing:", length(missing_B), "columns\n") - -if (length(missing_B) > 0) { - cat("\n Missing columns from Set B:\n") - for (col in missing_B) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_A) > 0 || length(missing_B) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - all_missing <- c(missing_A, missing_B) - for (miss_col in all_missing) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_A) > 30 || length(missing_B) > 30) { - stop("ERROR: Too many columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Process each pair of columns -for (i in 1:60) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values from columns, handling missing columns - vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df)) - vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df)) - - # Coalesce: take value from vals_A if present, otherwise from vals_B - combined <- ifelse(!is.na(vals_A) & vals_A != "", - vals_A, - vals_B) - - # Recode to numeric - df[[target_col]] <- recode_likert(combined) - - # Print progress - cat("Processed:", target_col, "\n") -} - -# ============= VERIFY TARGET COLUMNS WERE CREATED ============= -cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n") - -# Get updated column names -df_cols_after <- trimws(names(df)) - -# Check which target columns exist -existing_targets <- target_cols[target_cols %in% df_cols_after] -missing_targets <- target_cols[!target_cols %in% df_cols_after] - -cat("Target Columns:\n") -cat(" Expected: 60 columns\n") -cat(" Created:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n WARNING: The following target columns were NOT created:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } - stop("\nERROR: Not all target columns were created successfully!") -} else { - cat("\n SUCCESS: All 60 target columns created successfully!\n") -} - -cat("\n=== END VERIFICATION ===\n\n") - -# Save the modified dataframe back to CSV -write.csv(df, "eohi3.csv", row.names = FALSE) - -cat("\nProcessing complete! 
60 new columns added to eohi3.csv\n") - diff --git a/.history/eohi2/recode_likert_items_20251001091838.r b/.history/eohi2/recode_likert_items_20251001091838.r deleted file mode 100644 index f08c3c8..0000000 --- a/.history/eohi2/recode_likert_items_20251001091838.r +++ /dev/null @@ -1,199 +0,0 @@ -# Script to combine and recode Likert scale items in eohi2.csv -# Combines 01 and 02 versions of items, then recodes text to numeric values - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings="" keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the mapping function -recode_likert <- function(x) { - case_when( - tolower(x) == "strongly disagree" ~ -3, - tolower(x) == "disagree" ~ -2, - tolower(x) == "somewhat disagree" ~ -1, - tolower(x) == "neither agree nor disagree" ~ 0, - tolower(x) == "somewhat agree" ~ 1, - tolower(x) == "agree" ~ 2, - tolower(x) == "strongly agree" ~ 3, - TRUE ~ NA_real_ - ) -} - -# Define source column pairs (Set A and Set B) -source_cols_A <- c( - "01past5PrefItem_1", "01past5PrefItem_2", "01past5PrefItem_3", "01past5PrefItem_4", "01past5PrefItem_5", - "01past5PersItem_1", "01past5PersItem_2", "01past5PersItem_3", "01past5PersItem_4", "01past5PersItem_5", - "01past5ValItem_1", "01past5ValItem_2", "01past5ValItem_3", "01past5ValItem_4", "01past5ValItem_5", - "01past10PrefItem_1", "01past10PrefItem_2", "01past10PrefItem_3", "01past10PrefItem_4", "01past10PrefItem_5", - "01past10PersItem_1", "01past10PersItem_2", "01past10PersItem_3", "01past10PersItem_4", "01past10PersItem_5", - "01past10ValItem_1", "01past10ValItem_2", "01past10ValItem_3", "01past10ValItem_4", "01past10ValItem_5", - "01fut5PrefItem_1", "01fut5PrefItem_2", "01fut5PrefItem_3", "01fut5PrefItem_4", "01fut5PrefItem_5", - "01fut5PersItem_1", "01fut5PersItem_2", "01fut5PersItem_3", "01fut5PersItem_4", "01fut5PersItem_5", - "01fut5ValItem_1", "01fut5ValItem_2", "01fut5ValItem_3", "01fut5ValItem_4", "01fut5ValItem_5", - "01fut10PrefItem_1", "01fut10PrefItem_2", "01fut10PrefItem_3", "01fut10PrefItem_4", "01fut10PrefItem_5", - "01fut10PersItem_1", "01fut10PersItem_2", "01fut10PersItem_3", "01fut10PersItem_4", "01fut10PersItem_5", - "01fut10ValItem_1", "01fut10ValItem_2", "01fut10ValItem_3", "01fut10ValItem_4", "01fut10ValItem_5" -) - -source_cols_B <- c( - "02past5PrefItem_1", "02past5PrefItem_2", "02past5PrefItem_3", "02past5PrefItem_4", "02past5PrefItem_5", - "02past5PersItem_1", "02past5PersItem_2", "02past5PersItem_3", "02past5PersItem_4", "02past5PersItem_5", - "02past5ValItem_1", "02past5ValItem_2", "02past5ValItem_3", "02past5ValItem_4", "02past5ValItem_5", - "02past10PrefItem_1", "02past10PrefItem_2", "02past10PrefItem_3", "02past10PrefItem_4", "02past10PrefItem_5", - "02past10PersItem_1", "02past10PersItem_2", "02past10PersItem_3", "02past10PersItem_4", "02past10PersItem_5", - "02past10ValItem_1", "02past10ValItem_2", "02past10ValItem_3", "02past10ValItem_4", "02past10ValItem_5", - "02fut5PrefItem_1", "02fut5PrefItem_2", "02fut5PrefItem_3", "02fut5PrefItem_4", "02fut5PrefItem_5", - "02fut5PersItem_1", "02fut5PersItem_2", "02fut5PersItem_3", "02fut5PersItem_4", "02fut5PersItem_5", - "02fut5ValItem_1", "02fut5ValItem_2", "02fut5ValItem_3", "02fut5ValItem_4", "02fut5ValItem_5", - "02fut10PrefItem_1", "02fut10PrefItem_2", "02fut10PrefItem_3", 
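# --- editorial aside; not part of the original snapshot ---
# Note on the read.csv() call in this snapshot: the code passes
# na.strings = NULL while its own comment says na.strings="" (which would
# do the opposite, adding "" to the NA markers). Assuming NULL is accepted
# as character(0) here, no string is treated as NA, so blanks arrive as ""
# and it is the vals_A != "" test in the loop below that routes empty
# cells to vals_B; is.na(vals_A) will rarely fire. The unambiguous
# spelling of that intent is:
#   df <- read.csv("eohi2.csv", stringsAsFactors = FALSE,
#                  check.names = FALSE, na.strings = character(0))
# -----------------------------------------------------------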
"02fut10PrefItem_4", "02fut10PrefItem_5", - "02fut10PersItem_1", "02fut10PersItem_2", "02fut10PersItem_3", "02fut10PersItem_4", "02fut10PersItem_5", - "02fut10ValItem_1", "02fut10ValItem_2", "02fut10ValItem_3", "02fut10ValItem_4", "02fut10ValItem_5" -) - -# Define target column names -target_cols <- c( - "past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel", - "past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex", - "past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice", - "past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel", - "past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex", - "past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice", - "fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel", - "fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex", - "fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice", - "fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel", - "fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex", - "fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Print first 30 actual column names for debugging -cat("First 30 actual column names in CSV:\n") -for (i in 1:min(30, length(df_cols))) { - cat(sprintf(" %2d. 
'%s' (length: %d)\n", i, df_cols[i], nchar(df_cols[i]))) -} -cat("\n") - -# Check Source A columns -missing_A <- source_cols_A[!source_cols_A %in% df_cols] -existing_A <- source_cols_A[source_cols_A %in% df_cols] - -cat("Source Set A:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_A), "columns\n") -cat(" Missing:", length(missing_A), "columns\n") - -if (length(missing_A) > 0) { - cat("\n Missing columns from Set A:\n") - for (col in missing_A) { - cat(" -", col, "\n") - } -} - -# Check Source B columns -missing_B <- source_cols_B[!source_cols_B %in% df_cols] -existing_B <- source_cols_B[source_cols_B %in% df_cols] - -cat("\nSource Set B:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_B), "columns\n") -cat(" Missing:", length(missing_B), "columns\n") - -if (length(missing_B) > 0) { - cat("\n Missing columns from Set B:\n") - for (col in missing_B) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_A) > 0 || length(missing_B) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - all_missing <- c(missing_A, missing_B) - for (miss_col in all_missing) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_A) > 30 || length(missing_B) > 30) { - stop("ERROR: Too many columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Process each pair of columns -for (i in 1:60) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values from columns, handling missing columns - vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df)) - vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df)) - - # Coalesce: take value from vals_A if present, otherwise from vals_B - combined <- ifelse(!is.na(vals_A) & vals_A != "", - vals_A, - vals_B) - - # Recode to numeric - df[[target_col]] <- recode_likert(combined) - - # Print progress - cat("Processed:", target_col, "\n") -} - -# ============= VERIFY TARGET COLUMNS WERE CREATED ============= -cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n") - -# Get updated column names -df_cols_after <- trimws(names(df)) - -# Check which target columns exist -existing_targets <- target_cols[target_cols %in% df_cols_after] -missing_targets <- target_cols[!target_cols %in% df_cols_after] - -cat("Target Columns:\n") -cat(" Expected: 60 columns\n") -cat(" Created:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n WARNING: The following target columns were NOT created:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } - stop("\nERROR: Not all target columns were created successfully!") -} else { - cat("\n SUCCESS: All 60 target columns created successfully!\n") -} - -cat("\n=== END VERIFICATION ===\n\n") - -# Save the modified dataframe back to CSV -write.csv(df, "eohi3.csv", row.names = FALSE) - -cat("\nProcessing complete! 
60 new columns added to eohi3.csv\n") - diff --git a/.history/eohi2/recode_likert_items_20251001091844.r b/.history/eohi2/recode_likert_items_20251001091844.r deleted file mode 100644 index f9c5975..0000000 --- a/.history/eohi2/recode_likert_items_20251001091844.r +++ /dev/null @@ -1,200 +0,0 @@ -# Script to combine and recode Likert scale items in eohi2.csv -# Combines 01 and 02 versions of items, then recodes text to numeric values - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings="" keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the mapping function -recode_likert <- function(x) { - case_when( - tolower(x) == "strongly disagree" ~ -3, - tolower(x) == "disagree" ~ -2, - tolower(x) == "somewhat disagree" ~ -1, - tolower(x) == "neither agree nor disagree" ~ 0, - tolower(x) == "somewhat agree" ~ 1, - tolower(x) == "agree" ~ 2, - tolower(x) == "strongly agree" ~ 3, - TRUE ~ NA_real_ - ) -} - -# Define source column pairs (Set A and Set B) -source_cols_A <- c( - "01past5PrefItem_1", "01past5PrefItem_2", "01past5PrefItem_3", "01past5PrefItem_4", "01past5PrefItem_5", - "01past5PersItem_1", "01past5PersItem_2", "01past5PersItem_3", "01past5PersItem_4", "01past5PersItem_5", - "01past5ValItem_1", "01past5ValItem_2", "01past5ValItem_3", "01past5ValItem_4", "01past5ValItem_5", - "01past10PrefItem_1", "01past10PrefItem_2", "01past10PrefItem_3", "01past10PrefItem_4", "01past10PrefItem_5", - "01past10PersItem_1", "01past10PersItem_2", "01past10PersItem_3", "01past10PersItem_4", "01past10PersItem_5", - "01past10ValItem_1", "01past10ValItem_2", "01past10ValItem_3", "01past10ValItem_4", "01past10ValItem_5", - "01fut5PrefItem_1", "01fut5PrefItem_2", "01fut5PrefItem_3", "01fut5PrefItem_4", "01fut5PrefItem_5", - "01fut5PersItem_1", "01fut5PersItem_2", "01fut5PersItem_3", "01fut5PersItem_4", "01fut5PersItem_5", - "01fut5ValItem_1", "01fut5ValItem_2", "01fut5ValItem_3", "01fut5ValItem_4", "01fut5ValItem_5", - "01fut10PrefItem_1", "01fut10PrefItem_2", "01fut10PrefItem_3", "01fut10PrefItem_4", "01fut10PrefItem_5", - "01fut10PersItem_1", "01fut10PersItem_2", "01fut10PersItem_3", "01fut10PersItem_4", "01fut10PersItem_5", - "01fut10ValItem_1", "01fut10ValItem_2", "01fut10ValItem_3", "01fut10ValItem_4", "01fut10ValItem_5" -) - -source_cols_B <- c( - "02past5PrefItem_1", "02past5PrefItem_2", "02past5PrefItem_3", "02past5PrefItem_4", "02past5PrefItem_5", - "02past5PersItem_1", "02past5PersItem_2", "02past5PersItem_3", "02past5PersItem_4", "02past5PersItem_5", - "02past5ValItem_1", "02past5ValItem_2", "02past5ValItem_3", "02past5ValItem_4", "02past5ValItem_5", - "02past10PrefItem_1", "02past10PrefItem_2", "02past10PrefItem_3", "02past10PrefItem_4", "02past10PrefItem_5", - "02past10PersItem_1", "02past10PersItem_2", "02past10PersItem_3", "02past10PersItem_4", "02past10PersItem_5", - "02past10ValItem_1", "02past10ValItem_2", "02past10ValItem_3", "02past10ValItem_4", "02past10ValItem_5", - "02fut5PrefItem_1", "02fut5PrefItem_2", "02fut5PrefItem_3", "02fut5PrefItem_4", "02fut5PrefItem_5", - "02fut5PersItem_1", "02fut5PersItem_2", "02fut5PersItem_3", "02fut5PersItem_4", "02fut5PersItem_5", - "02fut5ValItem_1", "02fut5ValItem_2", "02fut5ValItem_3", "02fut5ValItem_4", "02fut5ValItem_5", - "02fut10PrefItem_1", "02fut10PrefItem_2", "02fut10PrefItem_3", 
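# --- editorial aside; not part of the original snapshot ---
# From this snapshot on, the final write.csv() gains na = "", so the
# NA_real_ values from recode_likert() land as empty cells rather than
# literal "NA" text. That mirrors the na.strings choice on the read side:
# blanks in, blanks out. A sketch of reading the output back with the NAs
# restored (file name taken from the script):
#   out <- read.csv("eohi3.csv", check.names = FALSE, na.strings = "")
# -----------------------------------------------------------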
"02fut10PrefItem_4", "02fut10PrefItem_5", - "02fut10PersItem_1", "02fut10PersItem_2", "02fut10PersItem_3", "02fut10PersItem_4", "02fut10PersItem_5", - "02fut10ValItem_1", "02fut10ValItem_2", "02fut10ValItem_3", "02fut10ValItem_4", "02fut10ValItem_5" -) - -# Define target column names -target_cols <- c( - "past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel", - "past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex", - "past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice", - "past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel", - "past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex", - "past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice", - "fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel", - "fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex", - "fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice", - "fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel", - "fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex", - "fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Print first 30 actual column names for debugging -cat("First 30 actual column names in CSV:\n") -for (i in 1:min(30, length(df_cols))) { - cat(sprintf(" %2d. 
'%s' (length: %d)\n", i, df_cols[i], nchar(df_cols[i]))) -} -cat("\n") - -# Check Source A columns -missing_A <- source_cols_A[!source_cols_A %in% df_cols] -existing_A <- source_cols_A[source_cols_A %in% df_cols] - -cat("Source Set A:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_A), "columns\n") -cat(" Missing:", length(missing_A), "columns\n") - -if (length(missing_A) > 0) { - cat("\n Missing columns from Set A:\n") - for (col in missing_A) { - cat(" -", col, "\n") - } -} - -# Check Source B columns -missing_B <- source_cols_B[!source_cols_B %in% df_cols] -existing_B <- source_cols_B[source_cols_B %in% df_cols] - -cat("\nSource Set B:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_B), "columns\n") -cat(" Missing:", length(missing_B), "columns\n") - -if (length(missing_B) > 0) { - cat("\n Missing columns from Set B:\n") - for (col in missing_B) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_A) > 0 || length(missing_B) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - all_missing <- c(missing_A, missing_B) - for (miss_col in all_missing) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_A) > 30 || length(missing_B) > 30) { - stop("ERROR: Too many columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Process each pair of columns -for (i in 1:60) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values from columns, handling missing columns - vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df)) - vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df)) - - # Coalesce: take value from vals_A if present, otherwise from vals_B - combined <- ifelse(!is.na(vals_A) & vals_A != "", - vals_A, - vals_B) - - # Recode to numeric - df[[target_col]] <- recode_likert(combined) - - # Print progress - cat("Processed:", target_col, "\n") -} - -# ============= VERIFY TARGET COLUMNS WERE CREATED ============= -cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n") - -# Get updated column names -df_cols_after <- trimws(names(df)) - -# Check which target columns exist -existing_targets <- target_cols[target_cols %in% df_cols_after] -missing_targets <- target_cols[!target_cols %in% df_cols_after] - -cat("Target Columns:\n") -cat(" Expected: 60 columns\n") -cat(" Created:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n WARNING: The following target columns were NOT created:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } - stop("\nERROR: Not all target columns were created successfully!") -} else { - cat("\n SUCCESS: All 60 target columns created successfully!\n") -} - -cat("\n=== END VERIFICATION ===\n\n") - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -write.csv(df, "eohi3.csv", row.names = FALSE, na = "") - -cat("\nProcessing complete! 
60 new columns added to eohi3.csv\n") - diff --git a/.history/eohi2/recode_likert_items_20251001091846.r b/.history/eohi2/recode_likert_items_20251001091846.r deleted file mode 100644 index f9c5975..0000000 --- a/.history/eohi2/recode_likert_items_20251001091846.r +++ /dev/null @@ -1,200 +0,0 @@ -# Script to combine and recode Likert scale items in eohi2.csv -# Combines 01 and 02 versions of items, then recodes text to numeric values - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings="" keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the mapping function -recode_likert <- function(x) { - case_when( - tolower(x) == "strongly disagree" ~ -3, - tolower(x) == "disagree" ~ -2, - tolower(x) == "somewhat disagree" ~ -1, - tolower(x) == "neither agree nor disagree" ~ 0, - tolower(x) == "somewhat agree" ~ 1, - tolower(x) == "agree" ~ 2, - tolower(x) == "strongly agree" ~ 3, - TRUE ~ NA_real_ - ) -} - -# Define source column pairs (Set A and Set B) -source_cols_A <- c( - "01past5PrefItem_1", "01past5PrefItem_2", "01past5PrefItem_3", "01past5PrefItem_4", "01past5PrefItem_5", - "01past5PersItem_1", "01past5PersItem_2", "01past5PersItem_3", "01past5PersItem_4", "01past5PersItem_5", - "01past5ValItem_1", "01past5ValItem_2", "01past5ValItem_3", "01past5ValItem_4", "01past5ValItem_5", - "01past10PrefItem_1", "01past10PrefItem_2", "01past10PrefItem_3", "01past10PrefItem_4", "01past10PrefItem_5", - "01past10PersItem_1", "01past10PersItem_2", "01past10PersItem_3", "01past10PersItem_4", "01past10PersItem_5", - "01past10ValItem_1", "01past10ValItem_2", "01past10ValItem_3", "01past10ValItem_4", "01past10ValItem_5", - "01fut5PrefItem_1", "01fut5PrefItem_2", "01fut5PrefItem_3", "01fut5PrefItem_4", "01fut5PrefItem_5", - "01fut5PersItem_1", "01fut5PersItem_2", "01fut5PersItem_3", "01fut5PersItem_4", "01fut5PersItem_5", - "01fut5ValItem_1", "01fut5ValItem_2", "01fut5ValItem_3", "01fut5ValItem_4", "01fut5ValItem_5", - "01fut10PrefItem_1", "01fut10PrefItem_2", "01fut10PrefItem_3", "01fut10PrefItem_4", "01fut10PrefItem_5", - "01fut10PersItem_1", "01fut10PersItem_2", "01fut10PersItem_3", "01fut10PersItem_4", "01fut10PersItem_5", - "01fut10ValItem_1", "01fut10ValItem_2", "01fut10ValItem_3", "01fut10ValItem_4", "01fut10ValItem_5" -) - -source_cols_B <- c( - "02past5PrefItem_1", "02past5PrefItem_2", "02past5PrefItem_3", "02past5PrefItem_4", "02past5PrefItem_5", - "02past5PersItem_1", "02past5PersItem_2", "02past5PersItem_3", "02past5PersItem_4", "02past5PersItem_5", - "02past5ValItem_1", "02past5ValItem_2", "02past5ValItem_3", "02past5ValItem_4", "02past5ValItem_5", - "02past10PrefItem_1", "02past10PrefItem_2", "02past10PrefItem_3", "02past10PrefItem_4", "02past10PrefItem_5", - "02past10PersItem_1", "02past10PersItem_2", "02past10PersItem_3", "02past10PersItem_4", "02past10PersItem_5", - "02past10ValItem_1", "02past10ValItem_2", "02past10ValItem_3", "02past10ValItem_4", "02past10ValItem_5", - "02fut5PrefItem_1", "02fut5PrefItem_2", "02fut5PrefItem_3", "02fut5PrefItem_4", "02fut5PrefItem_5", - "02fut5PersItem_1", "02fut5PersItem_2", "02fut5PersItem_3", "02fut5PersItem_4", "02fut5PersItem_5", - "02fut5ValItem_1", "02fut5ValItem_2", "02fut5ValItem_3", "02fut5ValItem_4", "02fut5ValItem_5", - "02fut10PrefItem_1", "02fut10PrefItem_2", "02fut10PrefItem_3", 
"02fut10PrefItem_4", "02fut10PrefItem_5", - "02fut10PersItem_1", "02fut10PersItem_2", "02fut10PersItem_3", "02fut10PersItem_4", "02fut10PersItem_5", - "02fut10ValItem_1", "02fut10ValItem_2", "02fut10ValItem_3", "02fut10ValItem_4", "02fut10ValItem_5" -) - -# Define target column names -target_cols <- c( - "past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel", - "past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex", - "past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice", - "past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel", - "past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex", - "past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice", - "fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel", - "fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex", - "fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice", - "fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel", - "fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex", - "fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Print first 30 actual column names for debugging -cat("First 30 actual column names in CSV:\n") -for (i in 1:min(30, length(df_cols))) { - cat(sprintf(" %2d. 
'%s' (length: %d)\n", i, df_cols[i], nchar(df_cols[i]))) -} -cat("\n") - -# Check Source A columns -missing_A <- source_cols_A[!source_cols_A %in% df_cols] -existing_A <- source_cols_A[source_cols_A %in% df_cols] - -cat("Source Set A:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_A), "columns\n") -cat(" Missing:", length(missing_A), "columns\n") - -if (length(missing_A) > 0) { - cat("\n Missing columns from Set A:\n") - for (col in missing_A) { - cat(" -", col, "\n") - } -} - -# Check Source B columns -missing_B <- source_cols_B[!source_cols_B %in% df_cols] -existing_B <- source_cols_B[source_cols_B %in% df_cols] - -cat("\nSource Set B:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_B), "columns\n") -cat(" Missing:", length(missing_B), "columns\n") - -if (length(missing_B) > 0) { - cat("\n Missing columns from Set B:\n") - for (col in missing_B) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_A) > 0 || length(missing_B) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - all_missing <- c(missing_A, missing_B) - for (miss_col in all_missing) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_A) > 30 || length(missing_B) > 30) { - stop("ERROR: Too many columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Process each pair of columns -for (i in 1:60) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values from columns, handling missing columns - vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df)) - vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df)) - - # Coalesce: take value from vals_A if present, otherwise from vals_B - combined <- ifelse(!is.na(vals_A) & vals_A != "", - vals_A, - vals_B) - - # Recode to numeric - df[[target_col]] <- recode_likert(combined) - - # Print progress - cat("Processed:", target_col, "\n") -} - -# ============= VERIFY TARGET COLUMNS WERE CREATED ============= -cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n") - -# Get updated column names -df_cols_after <- trimws(names(df)) - -# Check which target columns exist -existing_targets <- target_cols[target_cols %in% df_cols_after] -missing_targets <- target_cols[!target_cols %in% df_cols_after] - -cat("Target Columns:\n") -cat(" Expected: 60 columns\n") -cat(" Created:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n WARNING: The following target columns were NOT created:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } - stop("\nERROR: Not all target columns were created successfully!") -} else { - cat("\n SUCCESS: All 60 target columns created successfully!\n") -} - -cat("\n=== END VERIFICATION ===\n\n") - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -write.csv(df, "eohi3.csv", row.names = FALSE, na = "") - -cat("\nProcessing complete! 
60 new columns added to eohi3.csv\n") - diff --git a/.history/eohi2/recode_likert_items_20251001091852.r b/.history/eohi2/recode_likert_items_20251001091852.r deleted file mode 100644 index f9c5975..0000000 --- a/.history/eohi2/recode_likert_items_20251001091852.r +++ /dev/null @@ -1,200 +0,0 @@ -# Script to combine and recode Likert scale items in eohi2.csv -# Combines 01 and 02 versions of items, then recodes text to numeric values - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings="" keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the mapping function -recode_likert <- function(x) { - case_when( - tolower(x) == "strongly disagree" ~ -3, - tolower(x) == "disagree" ~ -2, - tolower(x) == "somewhat disagree" ~ -1, - tolower(x) == "neither agree nor disagree" ~ 0, - tolower(x) == "somewhat agree" ~ 1, - tolower(x) == "agree" ~ 2, - tolower(x) == "strongly agree" ~ 3, - TRUE ~ NA_real_ - ) -} - -# Define source column pairs (Set A and Set B) -source_cols_A <- c( - "01past5PrefItem_1", "01past5PrefItem_2", "01past5PrefItem_3", "01past5PrefItem_4", "01past5PrefItem_5", - "01past5PersItem_1", "01past5PersItem_2", "01past5PersItem_3", "01past5PersItem_4", "01past5PersItem_5", - "01past5ValItem_1", "01past5ValItem_2", "01past5ValItem_3", "01past5ValItem_4", "01past5ValItem_5", - "01past10PrefItem_1", "01past10PrefItem_2", "01past10PrefItem_3", "01past10PrefItem_4", "01past10PrefItem_5", - "01past10PersItem_1", "01past10PersItem_2", "01past10PersItem_3", "01past10PersItem_4", "01past10PersItem_5", - "01past10ValItem_1", "01past10ValItem_2", "01past10ValItem_3", "01past10ValItem_4", "01past10ValItem_5", - "01fut5PrefItem_1", "01fut5PrefItem_2", "01fut5PrefItem_3", "01fut5PrefItem_4", "01fut5PrefItem_5", - "01fut5PersItem_1", "01fut5PersItem_2", "01fut5PersItem_3", "01fut5PersItem_4", "01fut5PersItem_5", - "01fut5ValItem_1", "01fut5ValItem_2", "01fut5ValItem_3", "01fut5ValItem_4", "01fut5ValItem_5", - "01fut10PrefItem_1", "01fut10PrefItem_2", "01fut10PrefItem_3", "01fut10PrefItem_4", "01fut10PrefItem_5", - "01fut10PersItem_1", "01fut10PersItem_2", "01fut10PersItem_3", "01fut10PersItem_4", "01fut10PersItem_5", - "01fut10ValItem_1", "01fut10ValItem_2", "01fut10ValItem_3", "01fut10ValItem_4", "01fut10ValItem_5" -) - -source_cols_B <- c( - "02past5PrefItem_1", "02past5PrefItem_2", "02past5PrefItem_3", "02past5PrefItem_4", "02past5PrefItem_5", - "02past5PersItem_1", "02past5PersItem_2", "02past5PersItem_3", "02past5PersItem_4", "02past5PersItem_5", - "02past5ValItem_1", "02past5ValItem_2", "02past5ValItem_3", "02past5ValItem_4", "02past5ValItem_5", - "02past10PrefItem_1", "02past10PrefItem_2", "02past10PrefItem_3", "02past10PrefItem_4", "02past10PrefItem_5", - "02past10PersItem_1", "02past10PersItem_2", "02past10PersItem_3", "02past10PersItem_4", "02past10PersItem_5", - "02past10ValItem_1", "02past10ValItem_2", "02past10ValItem_3", "02past10ValItem_4", "02past10ValItem_5", - "02fut5PrefItem_1", "02fut5PrefItem_2", "02fut5PrefItem_3", "02fut5PrefItem_4", "02fut5PrefItem_5", - "02fut5PersItem_1", "02fut5PersItem_2", "02fut5PersItem_3", "02fut5PersItem_4", "02fut5PersItem_5", - "02fut5ValItem_1", "02fut5ValItem_2", "02fut5ValItem_3", "02fut5ValItem_4", "02fut5ValItem_5", - "02fut10PrefItem_1", "02fut10PrefItem_2", "02fut10PrefItem_3", 
"02fut10PrefItem_4", "02fut10PrefItem_5", - "02fut10PersItem_1", "02fut10PersItem_2", "02fut10PersItem_3", "02fut10PersItem_4", "02fut10PersItem_5", - "02fut10ValItem_1", "02fut10ValItem_2", "02fut10ValItem_3", "02fut10ValItem_4", "02fut10ValItem_5" -) - -# Define target column names -target_cols <- c( - "past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel", - "past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex", - "past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice", - "past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel", - "past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex", - "past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice", - "fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel", - "fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex", - "fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice", - "fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel", - "fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex", - "fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Print first 30 actual column names for debugging -cat("First 30 actual column names in CSV:\n") -for (i in 1:min(30, length(df_cols))) { - cat(sprintf(" %2d. 
'%s' (length: %d)\n", i, df_cols[i], nchar(df_cols[i]))) -} -cat("\n") - -# Check Source A columns -missing_A <- source_cols_A[!source_cols_A %in% df_cols] -existing_A <- source_cols_A[source_cols_A %in% df_cols] - -cat("Source Set A:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_A), "columns\n") -cat(" Missing:", length(missing_A), "columns\n") - -if (length(missing_A) > 0) { - cat("\n Missing columns from Set A:\n") - for (col in missing_A) { - cat(" -", col, "\n") - } -} - -# Check Source B columns -missing_B <- source_cols_B[!source_cols_B %in% df_cols] -existing_B <- source_cols_B[source_cols_B %in% df_cols] - -cat("\nSource Set B:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_B), "columns\n") -cat(" Missing:", length(missing_B), "columns\n") - -if (length(missing_B) > 0) { - cat("\n Missing columns from Set B:\n") - for (col in missing_B) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_A) > 0 || length(missing_B) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - all_missing <- c(missing_A, missing_B) - for (miss_col in all_missing) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_A) > 30 || length(missing_B) > 30) { - stop("ERROR: Too many columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Process each pair of columns -for (i in 1:60) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values from columns, handling missing columns - vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df)) - vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df)) - - # Coalesce: take value from vals_A if present, otherwise from vals_B - combined <- ifelse(!is.na(vals_A) & vals_A != "", - vals_A, - vals_B) - - # Recode to numeric - df[[target_col]] <- recode_likert(combined) - - # Print progress - cat("Processed:", target_col, "\n") -} - -# ============= VERIFY TARGET COLUMNS WERE CREATED ============= -cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n") - -# Get updated column names -df_cols_after <- trimws(names(df)) - -# Check which target columns exist -existing_targets <- target_cols[target_cols %in% df_cols_after] -missing_targets <- target_cols[!target_cols %in% df_cols_after] - -cat("Target Columns:\n") -cat(" Expected: 60 columns\n") -cat(" Created:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n WARNING: The following target columns were NOT created:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } - stop("\nERROR: Not all target columns were created successfully!") -} else { - cat("\n SUCCESS: All 60 target columns created successfully!\n") -} - -cat("\n=== END VERIFICATION ===\n\n") - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -write.csv(df, "eohi3.csv", row.names = FALSE, na = "") - -cat("\nProcessing complete! 
60 new columns added to eohi3.csv\n") - diff --git a/.history/eohi2/recode_likert_items_20251001092353.r b/.history/eohi2/recode_likert_items_20251001092353.r deleted file mode 100644 index 1e2ccca..0000000 --- a/.history/eohi2/recode_likert_items_20251001092353.r +++ /dev/null @@ -1,200 +0,0 @@ -# Script to combine and recode Likert scale items in eohi2.csv -# Combines 01 and 02 versions of items, then recodes text to numeric values - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings="" keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the mapping function -recode_likert <- function(x) { - case_when( - tolower(x) == "strongly disagree" ~ -3, - tolower(x) == "disagree" ~ -2, - tolower(x) == "somewhat disagree" ~ -1, - tolower(x) == "neither agree nor disagree" ~ 0, - tolower(x) == "somewhat agree" ~ 1, - tolower(x) == "agree" ~ 2, - tolower(x) == "strongly agree" ~ 3, - TRUE ~ NA_real_ - ) -} - -# Define source column pairs (Set A and Set B) -source_cols_A <- c( - "01past5PrefItem_1", "01past5PrefItem_2", "01past5PrefItem_3", "01past5PrefItem_4", "01past5PrefItem_5", - "01past5PersItem_1", "01past5PersItem_2", "01past5PersItem_3", "01past5PersItem_4", "01past5PersItem_5", - "01past5ValItem_1", "01past5ValItem_2", "01past5ValItem_3", "01past5ValItem_4", "01past5ValItem_5", - "01past10PrefItem_1", "01past10PrefItem_2", "01past10PrefItem_3", "01past10PrefItem_4", "01past10PrefItem_5", - "01past10PersItem_1", "01past10PersItem_2", "01past10PersItem_3", "01past10PersItem_4", "01past10PersItem_5", - "01past10ValItem_1", "01past10ValItem_2", "01past10ValItem_3", "01past10ValItem_4", "01past10ValItem_5", - "01fut5PrefItem_1", "01fut5PrefItem_2", "01fut5PrefItem_3", "01fut5PrefItem_4", "01fut5PrefItem_5", - "01fut5PersItem_1", "01fut5PersItem_2", "01fut5PersItem_3", "01fut5PersItem_4", "01fut5PersItem_5", - "01fut5ValItem_1", "01fut5ValItem_2", "01fut5ValItem_3", "01fut5ValItem_4", "01fut5ValItem_5", - "01fut10PrefItem_1", "01fut10PrefItem_2", "01fut10PrefItem_3", "01fut10PrefItem_4", "01fut10PrefItem_5", - "01fut10PersItem_1", "01fut10PersItem_2", "01fut10PersItem_3", "01fut10PersItem_4", "01fut10PersItem_5", - "01fut10ValItem_1", "01fut10ValItem_2", "01fut10ValItem_3", "01fut10ValItem_4", "01fut10ValItem_5" -) - -source_cols_B <- c( - "02past5PrefItem_1", "02past5PrefItem_2", "02past5PrefItem_3", "02past5PrefItem_4", "02past5PrefItem_5", - "02past5PersItem_1", "02past5PersItem_2", "02past5PersItem_3", "02past5PersItem_4", "02past5PersItem_5", - "02past5ValItem_1", "02past5ValItem_2", "02past5ValItem_3", "02past5ValItem_4", "02past5ValItem_5", - "02past10PrefItem_1", "02past10PrefItem_2", "02past10PrefItem_3", "02past10PrefItem_4", "02past10PrefItem_5", - "02past10PersItem_1", "02past10PersItem_2", "02past10PersItem_3", "02past10PersItem_4", "02past10PersItem_5", - "02past10ValItem_1", "02past10ValItem_2", "02past10ValItem_3", "02past10ValItem_4", "02past10ValItem_5", - "02fut5PrefItem_1", "02fut5PrefItem_2", "02fut5PrefItem_3", "02fut5PrefItem_4", "02fut5PrefItem_5", - "02fut5PersItem_1", "02fut5PersItem_2", "02fut5PersItem_3", "02fut5PersItem_4", "02fut5PersItem_5", - "02fut5ValItem_1", "02fut5ValItem_2", "02fut5ValItem_3", "02fut5ValItem_4", "02fut5ValItem_5", - "02fut10PrefItem_1", "02fut10PrefItem_2", "02fut10PrefItem_3", 
"02fut10PrefItem_4", "02fut10PrefItem_5", - "02fut10PersItem_1", "02fut10PersItem_2", "02fut10PersItem_3", "02fut10PersItem_4", "02fut10PersItem_5", - "02fut10ValItem_1", "02fut10ValItem_2", "02fut10ValItem_3", "02fut10ValItem_4", "02fut10ValItem_5" -) - -# Define target column names -target_cols <- c( - "past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel", - "past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex", - "past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice", - "past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel", - "past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex", - "past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice", - "fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel", - "fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex", - "fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice", - "fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel", - "fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex", - "fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Print first 30 actual column names for debugging -cat("First 30 actual column names in CSV:\n") -for (i in 1:min(30, length(df_cols))) { - cat(sprintf(" %2d. 
'%s' (length: %d)\n", i, df_cols[i], nchar(df_cols[i]))) -} -cat("\n") - -# Check Source A columns -missing_A <- source_cols_A[!source_cols_A %in% df_cols] -existing_A <- source_cols_A[source_cols_A %in% df_cols] - -cat("Source Set A:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_A), "columns\n") -cat(" Missing:", length(missing_A), "columns\n") - -if (length(missing_A) > 0) { - cat("\n Missing columns from Set A:\n") - for (col in missing_A) { - cat(" -", col, "\n") - } -} - -# Check Source B columns -missing_B <- source_cols_B[!source_cols_B %in% df_cols] -existing_B <- source_cols_B[source_cols_B %in% df_cols] - -cat("\nSource Set B:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_B), "columns\n") -cat(" Missing:", length(missing_B), "columns\n") - -if (length(missing_B) > 0) { - cat("\n Missing columns from Set B:\n") - for (col in missing_B) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_A) > 0 || length(missing_B) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - all_missing <- c(missing_A, missing_B) - for (miss_col in all_missing) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_A) > 30 || length(missing_B) > 30) { - stop("ERROR: Too many columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Process each pair of columns -for (i in 1:60) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values from columns, handling missing columns - vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df)) - vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df)) - - # Coalesce: take value from vals_A if present, otherwise from vals_B - combined <- ifelse(!is.na(vals_A) & vals_A != "", - vals_A, - vals_B) - - # Recode to numeric - df[[target_col]] <- recode_likert(combined) - - # Print progress - cat("Processed:", target_col, "\n") -} - -# ============= VERIFY TARGET COLUMNS WERE CREATED ============= -cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n") - -# Get updated column names -df_cols_after <- trimws(names(df)) - -# Check which target columns exist -existing_targets <- target_cols[target_cols %in% df_cols_after] -missing_targets <- target_cols[!target_cols %in% df_cols_after] - -cat("Target Columns:\n") -cat(" Expected: 60 columns\n") -cat(" Created:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n WARNING: The following target columns were NOT created:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } - stop("\nERROR: Not all target columns were created successfully!") -} else { - cat("\n SUCCESS: All 60 target columns created successfully!\n") -} - -cat("\n=== END VERIFICATION ===\n\n") - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\nProcessing complete! 
60 new columns added to eohi2.csv\n") - diff --git a/.history/eohi2/recode_likert_items_20251001092409.r b/.history/eohi2/recode_likert_items_20251001092409.r deleted file mode 100644 index de1332d..0000000 --- a/.history/eohi2/recode_likert_items_20251001092409.r +++ /dev/null @@ -1,266 +0,0 @@ -# Script to combine and recode Likert scale items in eohi2.csv -# Combines 01 and 02 versions of items, then recodes text to numeric values - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings="" keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the mapping function -recode_likert <- function(x) { - case_when( - tolower(x) == "strongly disagree" ~ -3, - tolower(x) == "disagree" ~ -2, - tolower(x) == "somewhat disagree" ~ -1, - tolower(x) == "neither agree nor disagree" ~ 0, - tolower(x) == "somewhat agree" ~ 1, - tolower(x) == "agree" ~ 2, - tolower(x) == "strongly agree" ~ 3, - TRUE ~ NA_real_ - ) -} - -# Define source column pairs (Set A and Set B) -source_cols_A <- c( - "01past5PrefItem_1", "01past5PrefItem_2", "01past5PrefItem_3", "01past5PrefItem_4", "01past5PrefItem_5", - "01past5PersItem_1", "01past5PersItem_2", "01past5PersItem_3", "01past5PersItem_4", "01past5PersItem_5", - "01past5ValItem_1", "01past5ValItem_2", "01past5ValItem_3", "01past5ValItem_4", "01past5ValItem_5", - "01past10PrefItem_1", "01past10PrefItem_2", "01past10PrefItem_3", "01past10PrefItem_4", "01past10PrefItem_5", - "01past10PersItem_1", "01past10PersItem_2", "01past10PersItem_3", "01past10PersItem_4", "01past10PersItem_5", - "01past10ValItem_1", "01past10ValItem_2", "01past10ValItem_3", "01past10ValItem_4", "01past10ValItem_5", - "01fut5PrefItem_1", "01fut5PrefItem_2", "01fut5PrefItem_3", "01fut5PrefItem_4", "01fut5PrefItem_5", - "01fut5PersItem_1", "01fut5PersItem_2", "01fut5PersItem_3", "01fut5PersItem_4", "01fut5PersItem_5", - "01fut5ValItem_1", "01fut5ValItem_2", "01fut5ValItem_3", "01fut5ValItem_4", "01fut5ValItem_5", - "01fut10PrefItem_1", "01fut10PrefItem_2", "01fut10PrefItem_3", "01fut10PrefItem_4", "01fut10PrefItem_5", - "01fut10PersItem_1", "01fut10PersItem_2", "01fut10PersItem_3", "01fut10PersItem_4", "01fut10PersItem_5", - "01fut10ValItem_1", "01fut10ValItem_2", "01fut10ValItem_3", "01fut10ValItem_4", "01fut10ValItem_5" -) - -source_cols_B <- c( - "02past5PrefItem_1", "02past5PrefItem_2", "02past5PrefItem_3", "02past5PrefItem_4", "02past5PrefItem_5", - "02past5PersItem_1", "02past5PersItem_2", "02past5PersItem_3", "02past5PersItem_4", "02past5PersItem_5", - "02past5ValItem_1", "02past5ValItem_2", "02past5ValItem_3", "02past5ValItem_4", "02past5ValItem_5", - "02past10PrefItem_1", "02past10PrefItem_2", "02past10PrefItem_3", "02past10PrefItem_4", "02past10PrefItem_5", - "02past10PersItem_1", "02past10PersItem_2", "02past10PersItem_3", "02past10PersItem_4", "02past10PersItem_5", - "02past10ValItem_1", "02past10ValItem_2", "02past10ValItem_3", "02past10ValItem_4", "02past10ValItem_5", - "02fut5PrefItem_1", "02fut5PrefItem_2", "02fut5PrefItem_3", "02fut5PrefItem_4", "02fut5PrefItem_5", - "02fut5PersItem_1", "02fut5PersItem_2", "02fut5PersItem_3", "02fut5PersItem_4", "02fut5PersItem_5", - "02fut5ValItem_1", "02fut5ValItem_2", "02fut5ValItem_3", "02fut5ValItem_4", "02fut5ValItem_5", - "02fut10PrefItem_1", "02fut10PrefItem_2", "02fut10PrefItem_3", 
"02fut10PrefItem_4", "02fut10PrefItem_5", - "02fut10PersItem_1", "02fut10PersItem_2", "02fut10PersItem_3", "02fut10PersItem_4", "02fut10PersItem_5", - "02fut10ValItem_1", "02fut10ValItem_2", "02fut10ValItem_3", "02fut10ValItem_4", "02fut10ValItem_5" -) - -# Define target column names -target_cols <- c( - "past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel", - "past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex", - "past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice", - "past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel", - "past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex", - "past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice", - "fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel", - "fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex", - "fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice", - "fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel", - "fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex", - "fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Print first 30 actual column names for debugging -cat("First 30 actual column names in CSV:\n") -for (i in 1:min(30, length(df_cols))) { - cat(sprintf(" %2d. 
'%s' (length: %d)\n", i, df_cols[i], nchar(df_cols[i]))) -} -cat("\n") - -# Check Source A columns -missing_A <- source_cols_A[!source_cols_A %in% df_cols] -existing_A <- source_cols_A[source_cols_A %in% df_cols] - -cat("Source Set A:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_A), "columns\n") -cat(" Missing:", length(missing_A), "columns\n") - -if (length(missing_A) > 0) { - cat("\n Missing columns from Set A:\n") - for (col in missing_A) { - cat(" -", col, "\n") - } -} - -# Check Source B columns -missing_B <- source_cols_B[!source_cols_B %in% df_cols] -existing_B <- source_cols_B[source_cols_B %in% df_cols] - -cat("\nSource Set B:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_B), "columns\n") -cat(" Missing:", length(missing_B), "columns\n") - -if (length(missing_B) > 0) { - cat("\n Missing columns from Set B:\n") - for (col in missing_B) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_A) > 0 || length(missing_B) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - all_missing <- c(missing_A, missing_B) - for (miss_col in all_missing) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_A) > 30 || length(missing_B) > 30) { - stop("ERROR: Too many columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Process each pair of columns -for (i in 1:60) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values from columns, handling missing columns - vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df)) - vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df)) - - # Coalesce: take value from vals_A if present, otherwise from vals_B - combined <- ifelse(!is.na(vals_A) & vals_A != "", - vals_A, - vals_B) - - # Recode to numeric - df[[target_col]] <- recode_likert(combined) - - # Print progress - cat("Processed:", target_col, "\n") -} - -# ============= VERIFY TARGET COLUMNS WERE CREATED ============= -cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n") - -# Get updated column names -df_cols_after <- trimws(names(df)) - -# Check which target columns exist -existing_targets <- target_cols[target_cols %in% df_cols_after] -missing_targets <- target_cols[!target_cols %in% df_cols_after] - -cat("Target Columns:\n") -cat(" Expected: 60 columns\n") -cat(" Created:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n WARNING: The following target columns were NOT created:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } - stop("\nERROR: Not all target columns were created successfully!") -} else { - cat("\n SUCCESS: All 60 target columns created successfully!\n") -} - -cat("\n=== END VERIFICATION ===\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - 
- cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # Check each of the 60 pairs - for (i in 1:60) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values - val_A <- if (col_A %in% names(df)) df[random_row, col_A] else "" - val_B <- if (col_B %in% names(df)) df[random_row, col_B] else "" - target_val <- df[random_row, target_col] - - # Determine which source had the value - has_val_A <- !is.na(val_A) && val_A != "" - has_val_B <- !is.na(val_B) && val_B != "" - - if (has_val_A) { - source_used <- "A" - original_text <- val_A - } else if (has_val_B) { - source_used <- "B" - original_text <- val_B - } else { - source_used <- "NONE" - original_text <- "(empty)" - } - - # Print the info - cat(sprintf("Pair %2d:\n", i)) - cat(sprintf(" Source A: %-30s\n", col_A)) - cat(sprintf(" Source B: %-30s\n", col_B)) - cat(sprintf(" Target: %-30s\n", target_col)) - cat(sprintf(" Value found in: Source %s\n", source_used)) - cat(sprintf(" Original text: '%s'\n", original_text)) - cat(sprintf(" Numeric value: %s\n", ifelse(is.na(target_val), "NA", as.character(target_val)))) - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\nProcessing complete! 
60 new columns added to eohi2.csv\n") - diff --git a/.history/eohi2/recode_likert_items_20251001092420.r b/.history/eohi2/recode_likert_items_20251001092420.r deleted file mode 100644 index de1332d..0000000 --- a/.history/eohi2/recode_likert_items_20251001092420.r +++ /dev/null @@ -1,266 +0,0 @@ -# Script to combine and recode Likert scale items in eohi2.csv -# Combines 01 and 02 versions of items, then recodes text to numeric values - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings="" keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the mapping function -recode_likert <- function(x) { - case_when( - tolower(x) == "strongly disagree" ~ -3, - tolower(x) == "disagree" ~ -2, - tolower(x) == "somewhat disagree" ~ -1, - tolower(x) == "neither agree nor disagree" ~ 0, - tolower(x) == "somewhat agree" ~ 1, - tolower(x) == "agree" ~ 2, - tolower(x) == "strongly agree" ~ 3, - TRUE ~ NA_real_ - ) -} - -# Define source column pairs (Set A and Set B) -source_cols_A <- c( - "01past5PrefItem_1", "01past5PrefItem_2", "01past5PrefItem_3", "01past5PrefItem_4", "01past5PrefItem_5", - "01past5PersItem_1", "01past5PersItem_2", "01past5PersItem_3", "01past5PersItem_4", "01past5PersItem_5", - "01past5ValItem_1", "01past5ValItem_2", "01past5ValItem_3", "01past5ValItem_4", "01past5ValItem_5", - "01past10PrefItem_1", "01past10PrefItem_2", "01past10PrefItem_3", "01past10PrefItem_4", "01past10PrefItem_5", - "01past10PersItem_1", "01past10PersItem_2", "01past10PersItem_3", "01past10PersItem_4", "01past10PersItem_5", - "01past10ValItem_1", "01past10ValItem_2", "01past10ValItem_3", "01past10ValItem_4", "01past10ValItem_5", - "01fut5PrefItem_1", "01fut5PrefItem_2", "01fut5PrefItem_3", "01fut5PrefItem_4", "01fut5PrefItem_5", - "01fut5PersItem_1", "01fut5PersItem_2", "01fut5PersItem_3", "01fut5PersItem_4", "01fut5PersItem_5", - "01fut5ValItem_1", "01fut5ValItem_2", "01fut5ValItem_3", "01fut5ValItem_4", "01fut5ValItem_5", - "01fut10PrefItem_1", "01fut10PrefItem_2", "01fut10PrefItem_3", "01fut10PrefItem_4", "01fut10PrefItem_5", - "01fut10PersItem_1", "01fut10PersItem_2", "01fut10PersItem_3", "01fut10PersItem_4", "01fut10PersItem_5", - "01fut10ValItem_1", "01fut10ValItem_2", "01fut10ValItem_3", "01fut10ValItem_4", "01fut10ValItem_5" -) - -source_cols_B <- c( - "02past5PrefItem_1", "02past5PrefItem_2", "02past5PrefItem_3", "02past5PrefItem_4", "02past5PrefItem_5", - "02past5PersItem_1", "02past5PersItem_2", "02past5PersItem_3", "02past5PersItem_4", "02past5PersItem_5", - "02past5ValItem_1", "02past5ValItem_2", "02past5ValItem_3", "02past5ValItem_4", "02past5ValItem_5", - "02past10PrefItem_1", "02past10PrefItem_2", "02past10PrefItem_3", "02past10PrefItem_4", "02past10PrefItem_5", - "02past10PersItem_1", "02past10PersItem_2", "02past10PersItem_3", "02past10PersItem_4", "02past10PersItem_5", - "02past10ValItem_1", "02past10ValItem_2", "02past10ValItem_3", "02past10ValItem_4", "02past10ValItem_5", - "02fut5PrefItem_1", "02fut5PrefItem_2", "02fut5PrefItem_3", "02fut5PrefItem_4", "02fut5PrefItem_5", - "02fut5PersItem_1", "02fut5PersItem_2", "02fut5PersItem_3", "02fut5PersItem_4", "02fut5PersItem_5", - "02fut5ValItem_1", "02fut5ValItem_2", "02fut5ValItem_3", "02fut5ValItem_4", "02fut5ValItem_5", - "02fut10PrefItem_1", "02fut10PrefItem_2", "02fut10PrefItem_3", 
"02fut10PrefItem_4", "02fut10PrefItem_5", - "02fut10PersItem_1", "02fut10PersItem_2", "02fut10PersItem_3", "02fut10PersItem_4", "02fut10PersItem_5", - "02fut10ValItem_1", "02fut10ValItem_2", "02fut10ValItem_3", "02fut10ValItem_4", "02fut10ValItem_5" -) - -# Define target column names -target_cols <- c( - "past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel", - "past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex", - "past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice", - "past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel", - "past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex", - "past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice", - "fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel", - "fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex", - "fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice", - "fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel", - "fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex", - "fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Print first 30 actual column names for debugging -cat("First 30 actual column names in CSV:\n") -for (i in 1:min(30, length(df_cols))) { - cat(sprintf(" %2d. 
'%s' (length: %d)\n", i, df_cols[i], nchar(df_cols[i]))) -} -cat("\n") - -# Check Source A columns -missing_A <- source_cols_A[!source_cols_A %in% df_cols] -existing_A <- source_cols_A[source_cols_A %in% df_cols] - -cat("Source Set A:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_A), "columns\n") -cat(" Missing:", length(missing_A), "columns\n") - -if (length(missing_A) > 0) { - cat("\n Missing columns from Set A:\n") - for (col in missing_A) { - cat(" -", col, "\n") - } -} - -# Check Source B columns -missing_B <- source_cols_B[!source_cols_B %in% df_cols] -existing_B <- source_cols_B[source_cols_B %in% df_cols] - -cat("\nSource Set B:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_B), "columns\n") -cat(" Missing:", length(missing_B), "columns\n") - -if (length(missing_B) > 0) { - cat("\n Missing columns from Set B:\n") - for (col in missing_B) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_A) > 0 || length(missing_B) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - all_missing <- c(missing_A, missing_B) - for (miss_col in all_missing) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_A) > 30 || length(missing_B) > 30) { - stop("ERROR: Too many columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Process each pair of columns -for (i in 1:60) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values from columns, handling missing columns - vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df)) - vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df)) - - # Coalesce: take value from vals_A if present, otherwise from vals_B - combined <- ifelse(!is.na(vals_A) & vals_A != "", - vals_A, - vals_B) - - # Recode to numeric - df[[target_col]] <- recode_likert(combined) - - # Print progress - cat("Processed:", target_col, "\n") -} - -# ============= VERIFY TARGET COLUMNS WERE CREATED ============= -cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n") - -# Get updated column names -df_cols_after <- trimws(names(df)) - -# Check which target columns exist -existing_targets <- target_cols[target_cols %in% df_cols_after] -missing_targets <- target_cols[!target_cols %in% df_cols_after] - -cat("Target Columns:\n") -cat(" Expected: 60 columns\n") -cat(" Created:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n WARNING: The following target columns were NOT created:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } - stop("\nERROR: Not all target columns were created successfully!") -} else { - cat("\n SUCCESS: All 60 target columns created successfully!\n") -} - -cat("\n=== END VERIFICATION ===\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - 
- cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # Check each of the 60 pairs - for (i in 1:60) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values - val_A <- if (col_A %in% names(df)) df[random_row, col_A] else "" - val_B <- if (col_B %in% names(df)) df[random_row, col_B] else "" - target_val <- df[random_row, target_col] - - # Determine which source had the value - has_val_A <- !is.na(val_A) && val_A != "" - has_val_B <- !is.na(val_B) && val_B != "" - - if (has_val_A) { - source_used <- "A" - original_text <- val_A - } else if (has_val_B) { - source_used <- "B" - original_text <- val_B - } else { - source_used <- "NONE" - original_text <- "(empty)" - } - - # Print the info - cat(sprintf("Pair %2d:\n", i)) - cat(sprintf(" Source A: %-30s\n", col_A)) - cat(sprintf(" Source B: %-30s\n", col_B)) - cat(sprintf(" Target: %-30s\n", target_col)) - cat(sprintf(" Value found in: Source %s\n", source_used)) - cat(sprintf(" Original text: '%s'\n", original_text)) - cat(sprintf(" Numeric value: %s\n", ifelse(is.na(target_val), "NA", as.character(target_val)))) - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\nProcessing complete! 
60 new columns added to eohi2.csv\n") - diff --git a/.history/eohi2/recode_likert_items_20251001092430.r b/.history/eohi2/recode_likert_items_20251001092430.r deleted file mode 100644 index de1332d..0000000 --- a/.history/eohi2/recode_likert_items_20251001092430.r +++ /dev/null @@ -1,266 +0,0 @@ -# Script to combine and recode Likert scale items in eohi2.csv -# Combines 01 and 02 versions of items, then recodes text to numeric values - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings="" keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the mapping function -recode_likert <- function(x) { - case_when( - tolower(x) == "strongly disagree" ~ -3, - tolower(x) == "disagree" ~ -2, - tolower(x) == "somewhat disagree" ~ -1, - tolower(x) == "neither agree nor disagree" ~ 0, - tolower(x) == "somewhat agree" ~ 1, - tolower(x) == "agree" ~ 2, - tolower(x) == "strongly agree" ~ 3, - TRUE ~ NA_real_ - ) -} - -# Define source column pairs (Set A and Set B) -source_cols_A <- c( - "01past5PrefItem_1", "01past5PrefItem_2", "01past5PrefItem_3", "01past5PrefItem_4", "01past5PrefItem_5", - "01past5PersItem_1", "01past5PersItem_2", "01past5PersItem_3", "01past5PersItem_4", "01past5PersItem_5", - "01past5ValItem_1", "01past5ValItem_2", "01past5ValItem_3", "01past5ValItem_4", "01past5ValItem_5", - "01past10PrefItem_1", "01past10PrefItem_2", "01past10PrefItem_3", "01past10PrefItem_4", "01past10PrefItem_5", - "01past10PersItem_1", "01past10PersItem_2", "01past10PersItem_3", "01past10PersItem_4", "01past10PersItem_5", - "01past10ValItem_1", "01past10ValItem_2", "01past10ValItem_3", "01past10ValItem_4", "01past10ValItem_5", - "01fut5PrefItem_1", "01fut5PrefItem_2", "01fut5PrefItem_3", "01fut5PrefItem_4", "01fut5PrefItem_5", - "01fut5PersItem_1", "01fut5PersItem_2", "01fut5PersItem_3", "01fut5PersItem_4", "01fut5PersItem_5", - "01fut5ValItem_1", "01fut5ValItem_2", "01fut5ValItem_3", "01fut5ValItem_4", "01fut5ValItem_5", - "01fut10PrefItem_1", "01fut10PrefItem_2", "01fut10PrefItem_3", "01fut10PrefItem_4", "01fut10PrefItem_5", - "01fut10PersItem_1", "01fut10PersItem_2", "01fut10PersItem_3", "01fut10PersItem_4", "01fut10PersItem_5", - "01fut10ValItem_1", "01fut10ValItem_2", "01fut10ValItem_3", "01fut10ValItem_4", "01fut10ValItem_5" -) - -source_cols_B <- c( - "02past5PrefItem_1", "02past5PrefItem_2", "02past5PrefItem_3", "02past5PrefItem_4", "02past5PrefItem_5", - "02past5PersItem_1", "02past5PersItem_2", "02past5PersItem_3", "02past5PersItem_4", "02past5PersItem_5", - "02past5ValItem_1", "02past5ValItem_2", "02past5ValItem_3", "02past5ValItem_4", "02past5ValItem_5", - "02past10PrefItem_1", "02past10PrefItem_2", "02past10PrefItem_3", "02past10PrefItem_4", "02past10PrefItem_5", - "02past10PersItem_1", "02past10PersItem_2", "02past10PersItem_3", "02past10PersItem_4", "02past10PersItem_5", - "02past10ValItem_1", "02past10ValItem_2", "02past10ValItem_3", "02past10ValItem_4", "02past10ValItem_5", - "02fut5PrefItem_1", "02fut5PrefItem_2", "02fut5PrefItem_3", "02fut5PrefItem_4", "02fut5PrefItem_5", - "02fut5PersItem_1", "02fut5PersItem_2", "02fut5PersItem_3", "02fut5PersItem_4", "02fut5PersItem_5", - "02fut5ValItem_1", "02fut5ValItem_2", "02fut5ValItem_3", "02fut5ValItem_4", "02fut5ValItem_5", - "02fut10PrefItem_1", "02fut10PrefItem_2", "02fut10PrefItem_3", 
"02fut10PrefItem_4", "02fut10PrefItem_5", - "02fut10PersItem_1", "02fut10PersItem_2", "02fut10PersItem_3", "02fut10PersItem_4", "02fut10PersItem_5", - "02fut10ValItem_1", "02fut10ValItem_2", "02fut10ValItem_3", "02fut10ValItem_4", "02fut10ValItem_5" -) - -# Define target column names -target_cols <- c( - "past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel", - "past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex", - "past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice", - "past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel", - "past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex", - "past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice", - "fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel", - "fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex", - "fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice", - "fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel", - "fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex", - "fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Print first 30 actual column names for debugging -cat("First 30 actual column names in CSV:\n") -for (i in 1:min(30, length(df_cols))) { - cat(sprintf(" %2d. 
'%s' (length: %d)\n", i, df_cols[i], nchar(df_cols[i]))) -} -cat("\n") - -# Check Source A columns -missing_A <- source_cols_A[!source_cols_A %in% df_cols] -existing_A <- source_cols_A[source_cols_A %in% df_cols] - -cat("Source Set A:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_A), "columns\n") -cat(" Missing:", length(missing_A), "columns\n") - -if (length(missing_A) > 0) { - cat("\n Missing columns from Set A:\n") - for (col in missing_A) { - cat(" -", col, "\n") - } -} - -# Check Source B columns -missing_B <- source_cols_B[!source_cols_B %in% df_cols] -existing_B <- source_cols_B[source_cols_B %in% df_cols] - -cat("\nSource Set B:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_B), "columns\n") -cat(" Missing:", length(missing_B), "columns\n") - -if (length(missing_B) > 0) { - cat("\n Missing columns from Set B:\n") - for (col in missing_B) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_A) > 0 || length(missing_B) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - all_missing <- c(missing_A, missing_B) - for (miss_col in all_missing) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_A) > 30 || length(missing_B) > 30) { - stop("ERROR: Too many columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Process each pair of columns -for (i in 1:60) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values from columns, handling missing columns - vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df)) - vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df)) - - # Coalesce: take value from vals_A if present, otherwise from vals_B - combined <- ifelse(!is.na(vals_A) & vals_A != "", - vals_A, - vals_B) - - # Recode to numeric - df[[target_col]] <- recode_likert(combined) - - # Print progress - cat("Processed:", target_col, "\n") -} - -# ============= VERIFY TARGET COLUMNS WERE CREATED ============= -cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n") - -# Get updated column names -df_cols_after <- trimws(names(df)) - -# Check which target columns exist -existing_targets <- target_cols[target_cols %in% df_cols_after] -missing_targets <- target_cols[!target_cols %in% df_cols_after] - -cat("Target Columns:\n") -cat(" Expected: 60 columns\n") -cat(" Created:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n WARNING: The following target columns were NOT created:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } - stop("\nERROR: Not all target columns were created successfully!") -} else { - cat("\n SUCCESS: All 60 target columns created successfully!\n") -} - -cat("\n=== END VERIFICATION ===\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - 
- cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # Check each of the 60 pairs - for (i in 1:60) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values - val_A <- if (col_A %in% names(df)) df[random_row, col_A] else "" - val_B <- if (col_B %in% names(df)) df[random_row, col_B] else "" - target_val <- df[random_row, target_col] - - # Determine which source had the value - has_val_A <- !is.na(val_A) && val_A != "" - has_val_B <- !is.na(val_B) && val_B != "" - - if (has_val_A) { - source_used <- "A" - original_text <- val_A - } else if (has_val_B) { - source_used <- "B" - original_text <- val_B - } else { - source_used <- "NONE" - original_text <- "(empty)" - } - - # Print the info - cat(sprintf("Pair %2d:\n", i)) - cat(sprintf(" Source A: %-30s\n", col_A)) - cat(sprintf(" Source B: %-30s\n", col_B)) - cat(sprintf(" Target: %-30s\n", target_col)) - cat(sprintf(" Value found in: Source %s\n", source_used)) - cat(sprintf(" Original text: '%s'\n", original_text)) - cat(sprintf(" Numeric value: %s\n", ifelse(is.na(target_val), "NA", as.character(target_val)))) - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\nProcessing complete! 
60 new columns added to eohi2.csv\n") - diff --git a/.history/eohi2/recode_likert_items_20251001094502.r b/.history/eohi2/recode_likert_items_20251001094502.r deleted file mode 100644 index de1332d..0000000 --- a/.history/eohi2/recode_likert_items_20251001094502.r +++ /dev/null @@ -1,266 +0,0 @@ -# Script to combine and recode Likert scale items in eohi2.csv -# Combines 01 and 02 versions of items, then recodes text to numeric values - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings="" keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the mapping function -recode_likert <- function(x) { - case_when( - tolower(x) == "strongly disagree" ~ -3, - tolower(x) == "disagree" ~ -2, - tolower(x) == "somewhat disagree" ~ -1, - tolower(x) == "neither agree nor disagree" ~ 0, - tolower(x) == "somewhat agree" ~ 1, - tolower(x) == "agree" ~ 2, - tolower(x) == "strongly agree" ~ 3, - TRUE ~ NA_real_ - ) -} - -# Define source column pairs (Set A and Set B) -source_cols_A <- c( - "01past5PrefItem_1", "01past5PrefItem_2", "01past5PrefItem_3", "01past5PrefItem_4", "01past5PrefItem_5", - "01past5PersItem_1", "01past5PersItem_2", "01past5PersItem_3", "01past5PersItem_4", "01past5PersItem_5", - "01past5ValItem_1", "01past5ValItem_2", "01past5ValItem_3", "01past5ValItem_4", "01past5ValItem_5", - "01past10PrefItem_1", "01past10PrefItem_2", "01past10PrefItem_3", "01past10PrefItem_4", "01past10PrefItem_5", - "01past10PersItem_1", "01past10PersItem_2", "01past10PersItem_3", "01past10PersItem_4", "01past10PersItem_5", - "01past10ValItem_1", "01past10ValItem_2", "01past10ValItem_3", "01past10ValItem_4", "01past10ValItem_5", - "01fut5PrefItem_1", "01fut5PrefItem_2", "01fut5PrefItem_3", "01fut5PrefItem_4", "01fut5PrefItem_5", - "01fut5PersItem_1", "01fut5PersItem_2", "01fut5PersItem_3", "01fut5PersItem_4", "01fut5PersItem_5", - "01fut5ValItem_1", "01fut5ValItem_2", "01fut5ValItem_3", "01fut5ValItem_4", "01fut5ValItem_5", - "01fut10PrefItem_1", "01fut10PrefItem_2", "01fut10PrefItem_3", "01fut10PrefItem_4", "01fut10PrefItem_5", - "01fut10PersItem_1", "01fut10PersItem_2", "01fut10PersItem_3", "01fut10PersItem_4", "01fut10PersItem_5", - "01fut10ValItem_1", "01fut10ValItem_2", "01fut10ValItem_3", "01fut10ValItem_4", "01fut10ValItem_5" -) - -source_cols_B <- c( - "02past5PrefItem_1", "02past5PrefItem_2", "02past5PrefItem_3", "02past5PrefItem_4", "02past5PrefItem_5", - "02past5PersItem_1", "02past5PersItem_2", "02past5PersItem_3", "02past5PersItem_4", "02past5PersItem_5", - "02past5ValItem_1", "02past5ValItem_2", "02past5ValItem_3", "02past5ValItem_4", "02past5ValItem_5", - "02past10PrefItem_1", "02past10PrefItem_2", "02past10PrefItem_3", "02past10PrefItem_4", "02past10PrefItem_5", - "02past10PersItem_1", "02past10PersItem_2", "02past10PersItem_3", "02past10PersItem_4", "02past10PersItem_5", - "02past10ValItem_1", "02past10ValItem_2", "02past10ValItem_3", "02past10ValItem_4", "02past10ValItem_5", - "02fut5PrefItem_1", "02fut5PrefItem_2", "02fut5PrefItem_3", "02fut5PrefItem_4", "02fut5PrefItem_5", - "02fut5PersItem_1", "02fut5PersItem_2", "02fut5PersItem_3", "02fut5PersItem_4", "02fut5PersItem_5", - "02fut5ValItem_1", "02fut5ValItem_2", "02fut5ValItem_3", "02fut5ValItem_4", "02fut5ValItem_5", - "02fut10PrefItem_1", "02fut10PrefItem_2", "02fut10PrefItem_3", 
"02fut10PrefItem_4", "02fut10PrefItem_5", - "02fut10PersItem_1", "02fut10PersItem_2", "02fut10PersItem_3", "02fut10PersItem_4", "02fut10PersItem_5", - "02fut10ValItem_1", "02fut10ValItem_2", "02fut10ValItem_3", "02fut10ValItem_4", "02fut10ValItem_5" -) - -# Define target column names -target_cols <- c( - "past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel", - "past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex", - "past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice", - "past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel", - "past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex", - "past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice", - "fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel", - "fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex", - "fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice", - "fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel", - "fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex", - "fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Print first 30 actual column names for debugging -cat("First 30 actual column names in CSV:\n") -for (i in 1:min(30, length(df_cols))) { - cat(sprintf(" %2d. 
'%s' (length: %d)\n", i, df_cols[i], nchar(df_cols[i]))) -} -cat("\n") - -# Check Source A columns -missing_A <- source_cols_A[!source_cols_A %in% df_cols] -existing_A <- source_cols_A[source_cols_A %in% df_cols] - -cat("Source Set A:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_A), "columns\n") -cat(" Missing:", length(missing_A), "columns\n") - -if (length(missing_A) > 0) { - cat("\n Missing columns from Set A:\n") - for (col in missing_A) { - cat(" -", col, "\n") - } -} - -# Check Source B columns -missing_B <- source_cols_B[!source_cols_B %in% df_cols] -existing_B <- source_cols_B[source_cols_B %in% df_cols] - -cat("\nSource Set B:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_B), "columns\n") -cat(" Missing:", length(missing_B), "columns\n") - -if (length(missing_B) > 0) { - cat("\n Missing columns from Set B:\n") - for (col in missing_B) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_A) > 0 || length(missing_B) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - all_missing <- c(missing_A, missing_B) - for (miss_col in all_missing) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_A) > 30 || length(missing_B) > 30) { - stop("ERROR: Too many columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Process each pair of columns -for (i in 1:60) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values from columns, handling missing columns - vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df)) - vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df)) - - # Coalesce: take value from vals_A if present, otherwise from vals_B - combined <- ifelse(!is.na(vals_A) & vals_A != "", - vals_A, - vals_B) - - # Recode to numeric - df[[target_col]] <- recode_likert(combined) - - # Print progress - cat("Processed:", target_col, "\n") -} - -# ============= VERIFY TARGET COLUMNS WERE CREATED ============= -cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n") - -# Get updated column names -df_cols_after <- trimws(names(df)) - -# Check which target columns exist -existing_targets <- target_cols[target_cols %in% df_cols_after] -missing_targets <- target_cols[!target_cols %in% df_cols_after] - -cat("Target Columns:\n") -cat(" Expected: 60 columns\n") -cat(" Created:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n WARNING: The following target columns were NOT created:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } - stop("\nERROR: Not all target columns were created successfully!") -} else { - cat("\n SUCCESS: All 60 target columns created successfully!\n") -} - -cat("\n=== END VERIFICATION ===\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - 
- cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # Check each of the 60 pairs - for (i in 1:60) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values - val_A <- if (col_A %in% names(df)) df[random_row, col_A] else "" - val_B <- if (col_B %in% names(df)) df[random_row, col_B] else "" - target_val <- df[random_row, target_col] - - # Determine which source had the value - has_val_A <- !is.na(val_A) && val_A != "" - has_val_B <- !is.na(val_B) && val_B != "" - - if (has_val_A) { - source_used <- "A" - original_text <- val_A - } else if (has_val_B) { - source_used <- "B" - original_text <- val_B - } else { - source_used <- "NONE" - original_text <- "(empty)" - } - - # Print the info - cat(sprintf("Pair %2d:\n", i)) - cat(sprintf(" Source A: %-30s\n", col_A)) - cat(sprintf(" Source B: %-30s\n", col_B)) - cat(sprintf(" Target: %-30s\n", target_col)) - cat(sprintf(" Value found in: Source %s\n", source_used)) - cat(sprintf(" Original text: '%s'\n", original_text)) - cat(sprintf(" Numeric value: %s\n", ifelse(is.na(target_val), "NA", as.character(target_val)))) - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\nProcessing complete! 
60 new columns added to eohi2.csv\n") - diff --git a/.history/eohi2/recode_likert_items_20251001105906.r b/.history/eohi2/recode_likert_items_20251001105906.r deleted file mode 100644 index de1332d..0000000 --- a/.history/eohi2/recode_likert_items_20251001105906.r +++ /dev/null @@ -1,266 +0,0 @@ -# Script to combine and recode Likert scale items in eohi2.csv -# Combines 01 and 02 versions of items, then recodes text to numeric values - -# Load necessary library -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -# Read the data (with check.names=FALSE to preserve original column names) -# na.strings="" keeps empty cells as empty strings instead of converting to NA -df <- read.csv("eohi2.csv", stringsAsFactors = FALSE, check.names = FALSE, na.strings = NULL) - -# Define the mapping function -recode_likert <- function(x) { - case_when( - tolower(x) == "strongly disagree" ~ -3, - tolower(x) == "disagree" ~ -2, - tolower(x) == "somewhat disagree" ~ -1, - tolower(x) == "neither agree nor disagree" ~ 0, - tolower(x) == "somewhat agree" ~ 1, - tolower(x) == "agree" ~ 2, - tolower(x) == "strongly agree" ~ 3, - TRUE ~ NA_real_ - ) -} - -# Define source column pairs (Set A and Set B) -source_cols_A <- c( - "01past5PrefItem_1", "01past5PrefItem_2", "01past5PrefItem_3", "01past5PrefItem_4", "01past5PrefItem_5", - "01past5PersItem_1", "01past5PersItem_2", "01past5PersItem_3", "01past5PersItem_4", "01past5PersItem_5", - "01past5ValItem_1", "01past5ValItem_2", "01past5ValItem_3", "01past5ValItem_4", "01past5ValItem_5", - "01past10PrefItem_1", "01past10PrefItem_2", "01past10PrefItem_3", "01past10PrefItem_4", "01past10PrefItem_5", - "01past10PersItem_1", "01past10PersItem_2", "01past10PersItem_3", "01past10PersItem_4", "01past10PersItem_5", - "01past10ValItem_1", "01past10ValItem_2", "01past10ValItem_3", "01past10ValItem_4", "01past10ValItem_5", - "01fut5PrefItem_1", "01fut5PrefItem_2", "01fut5PrefItem_3", "01fut5PrefItem_4", "01fut5PrefItem_5", - "01fut5PersItem_1", "01fut5PersItem_2", "01fut5PersItem_3", "01fut5PersItem_4", "01fut5PersItem_5", - "01fut5ValItem_1", "01fut5ValItem_2", "01fut5ValItem_3", "01fut5ValItem_4", "01fut5ValItem_5", - "01fut10PrefItem_1", "01fut10PrefItem_2", "01fut10PrefItem_3", "01fut10PrefItem_4", "01fut10PrefItem_5", - "01fut10PersItem_1", "01fut10PersItem_2", "01fut10PersItem_3", "01fut10PersItem_4", "01fut10PersItem_5", - "01fut10ValItem_1", "01fut10ValItem_2", "01fut10ValItem_3", "01fut10ValItem_4", "01fut10ValItem_5" -) - -source_cols_B <- c( - "02past5PrefItem_1", "02past5PrefItem_2", "02past5PrefItem_3", "02past5PrefItem_4", "02past5PrefItem_5", - "02past5PersItem_1", "02past5PersItem_2", "02past5PersItem_3", "02past5PersItem_4", "02past5PersItem_5", - "02past5ValItem_1", "02past5ValItem_2", "02past5ValItem_3", "02past5ValItem_4", "02past5ValItem_5", - "02past10PrefItem_1", "02past10PrefItem_2", "02past10PrefItem_3", "02past10PrefItem_4", "02past10PrefItem_5", - "02past10PersItem_1", "02past10PersItem_2", "02past10PersItem_3", "02past10PersItem_4", "02past10PersItem_5", - "02past10ValItem_1", "02past10ValItem_2", "02past10ValItem_3", "02past10ValItem_4", "02past10ValItem_5", - "02fut5PrefItem_1", "02fut5PrefItem_2", "02fut5PrefItem_3", "02fut5PrefItem_4", "02fut5PrefItem_5", - "02fut5PersItem_1", "02fut5PersItem_2", "02fut5PersItem_3", "02fut5PersItem_4", "02fut5PersItem_5", - "02fut5ValItem_1", "02fut5ValItem_2", "02fut5ValItem_3", "02fut5ValItem_4", "02fut5ValItem_5", - "02fut10PrefItem_1", "02fut10PrefItem_2", "02fut10PrefItem_3", 
"02fut10PrefItem_4", "02fut10PrefItem_5", - "02fut10PersItem_1", "02fut10PersItem_2", "02fut10PersItem_3", "02fut10PersItem_4", "02fut10PersItem_5", - "02fut10ValItem_1", "02fut10ValItem_2", "02fut10ValItem_3", "02fut10ValItem_4", "02fut10ValItem_5" -) - -# Define target column names -target_cols <- c( - "past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel", - "past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex", - "past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice", - "past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel", - "past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex", - "past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice", - "fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel", - "fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex", - "fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice", - "fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel", - "fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex", - "fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice" -) - -# ============= TROUBLESHOOTING: CHECK COLUMN EXISTENCE ============= -cat("\n=== COLUMN EXISTENCE CHECK ===\n\n") - -# Get actual column names from dataframe (trimmed) -df_cols <- trimws(names(df)) - -# Print first 30 actual column names for debugging -cat("First 30 actual column names in CSV:\n") -for (i in 1:min(30, length(df_cols))) { - cat(sprintf(" %2d. 
'%s' (length: %d)\n", i, df_cols[i], nchar(df_cols[i]))) -} -cat("\n") - -# Check Source A columns -missing_A <- source_cols_A[!source_cols_A %in% df_cols] -existing_A <- source_cols_A[source_cols_A %in% df_cols] - -cat("Source Set A:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_A), "columns\n") -cat(" Missing:", length(missing_A), "columns\n") - -if (length(missing_A) > 0) { - cat("\n Missing columns from Set A:\n") - for (col in missing_A) { - cat(" -", col, "\n") - } -} - -# Check Source B columns -missing_B <- source_cols_B[!source_cols_B %in% df_cols] -existing_B <- source_cols_B[source_cols_B %in% df_cols] - -cat("\nSource Set B:\n") -cat(" Expected: 60 columns\n") -cat(" Found:", length(existing_B), "columns\n") -cat(" Missing:", length(missing_B), "columns\n") - -if (length(missing_B) > 0) { - cat("\n Missing columns from Set B:\n") - for (col in missing_B) { - cat(" -", col, "\n") - } -} - -# Check for columns with similar names (potential typos/spaces) -if (length(missing_A) > 0 || length(missing_B) > 0) { - cat("\n\n=== CHECKING FOR SIMILAR COLUMN NAMES ===\n") - all_missing <- c(missing_A, missing_B) - for (miss_col in all_missing) { - # Find columns that start with similar pattern - pattern <- substr(miss_col, 1, 10) - similar <- grep(pattern, df_cols, value = TRUE, ignore.case = TRUE) - if (length(similar) > 0) { - cat("\nLooking for:", miss_col) - cat("\n Similar columns found:\n") - for (sim in similar) { - cat(" - '", sim, "' (length:", nchar(sim), ")\n", sep = "") - } - } - } -} - -cat("\n=== END CHECK ===\n\n") - -# Stop if critical columns are missing -if (length(missing_A) > 30 || length(missing_B) > 30) { - stop("ERROR: Too many columns missing! Please check column names in CSV file.") -} - -cat("Proceeding with processing...\n\n") - -# Process each pair of columns -for (i in 1:60) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values from columns, handling missing columns - vals_A <- if (col_A %in% names(df)) df[[col_A]] else rep(NA, nrow(df)) - vals_B <- if (col_B %in% names(df)) df[[col_B]] else rep(NA, nrow(df)) - - # Coalesce: take value from vals_A if present, otherwise from vals_B - combined <- ifelse(!is.na(vals_A) & vals_A != "", - vals_A, - vals_B) - - # Recode to numeric - df[[target_col]] <- recode_likert(combined) - - # Print progress - cat("Processed:", target_col, "\n") -} - -# ============= VERIFY TARGET COLUMNS WERE CREATED ============= -cat("\n\n=== VERIFYING TARGET COLUMNS ===\n\n") - -# Get updated column names -df_cols_after <- trimws(names(df)) - -# Check which target columns exist -existing_targets <- target_cols[target_cols %in% df_cols_after] -missing_targets <- target_cols[!target_cols %in% df_cols_after] - -cat("Target Columns:\n") -cat(" Expected: 60 columns\n") -cat(" Created:", length(existing_targets), "columns\n") -cat(" Missing:", length(missing_targets), "columns\n") - -if (length(missing_targets) > 0) { - cat("\n WARNING: The following target columns were NOT created:\n") - for (col in missing_targets) { - cat(" -", col, "\n") - } - stop("\nERROR: Not all target columns were created successfully!") -} else { - cat("\n SUCCESS: All 60 target columns created successfully!\n") -} - -cat("\n=== END VERIFICATION ===\n\n") - - -# ============= QUALITY ASSURANCE: RANDOM ROW CHECK ============= -# This function can be run multiple times to check different random rows - -qa_check_random_row <- function() { - # Pick a random row - random_row <- sample(1:nrow(df), 1) - 
- cat("\n========================================\n") - cat("QA CHECK: Random Row #", random_row, "\n") - cat("========================================\n\n") - - # Check each of the 60 pairs - for (i in 1:60) { - col_A <- source_cols_A[i] - col_B <- source_cols_B[i] - target_col <- target_cols[i] - - # Get values - val_A <- if (col_A %in% names(df)) df[random_row, col_A] else "" - val_B <- if (col_B %in% names(df)) df[random_row, col_B] else "" - target_val <- df[random_row, target_col] - - # Determine which source had the value - has_val_A <- !is.na(val_A) && val_A != "" - has_val_B <- !is.na(val_B) && val_B != "" - - if (has_val_A) { - source_used <- "A" - original_text <- val_A - } else if (has_val_B) { - source_used <- "B" - original_text <- val_B - } else { - source_used <- "NONE" - original_text <- "(empty)" - } - - # Print the info - cat(sprintf("Pair %2d:\n", i)) - cat(sprintf(" Source A: %-30s\n", col_A)) - cat(sprintf(" Source B: %-30s\n", col_B)) - cat(sprintf(" Target: %-30s\n", target_col)) - cat(sprintf(" Value found in: Source %s\n", source_used)) - cat(sprintf(" Original text: '%s'\n", original_text)) - cat(sprintf(" Numeric value: %s\n", ifelse(is.na(target_val), "NA", as.character(target_val)))) - cat("\n") - } - - cat("========================================\n") - cat("END QA CHECK\n") - cat("========================================\n\n") -} - -# Run QA check on first random row -cat("\n\n") -qa_check_random_row() - -# Instructions for running additional checks -cat("\n") -cat("*** TO CHECK ANOTHER RANDOM ROW ***\n") -cat("Run this command in R console:\n") -cat(" qa_check_random_row()\n") -cat("\n") - - -# Save the modified dataframe back to CSV -# na="" writes NA values as empty cells instead of "NA" text -write.csv(df, "eohi2.csv", row.names = FALSE, na = "") - -cat("\nProcessing complete! 
60 new columns added to eohi2.csv\n") - diff --git a/.history/eohi2/reliability - ehi_20251028173023.r b/.history/eohi2/reliability - ehi_20251028173023.r deleted file mode 100644 index e69de29..0000000 diff --git a/.history/eohi2/reliability - ehi_20251028173024.r b/.history/eohi2/reliability - ehi_20251028173024.r deleted file mode 100644 index dbb1cb8..0000000 --- a/.history/eohi2/reliability - ehi_20251028173024.r +++ /dev/null @@ -1,9 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028173102.r b/.history/eohi2/reliability - ehi_20251028173102.r deleted file mode 100644 index 7873ec3..0000000 --- a/.history/eohi2/reliability - ehi_20251028173102.r +++ /dev/null @@ -1,67 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -cat("Missing values per variable:\n") -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -cat("\nSample size for reliability analysis:", nrow(reliability_data), "\n") - -# Cronbach's Alpha -cat("\n=== CRONBACH'S ALPHA ===\n") -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Alternative reliability measures -cat("\n=== ADDITIONAL RELIABILITY MEASURES ===\n") - -# Split-half reliability -split_half <- splitHalf(reliability_data) -cat("Split-half reliability:\n") -print(split_half) - -# McDonald's Omega -omega_result <- omega(reliability_data) -cat("\nMcDonald's Omega:\n") -print(omega_result) - -# Inter-item correlations -cat("\n=== INTER-ITEM CORRELATIONS ===\n") -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -cat("\n=== DESCRIPTIVE STATISTICS ===\n") -desc_stats <- describe(reliability_data) -print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Skewness = round(apply(reliability_data, 2, skew, na.rm = TRUE), 5), - Kurtosis = round(apply(reliability_data, 2, kurtosi, na.rm = TRUE), 5) -) - -cat("\n=== SUMMARY TABLE ===\n") -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) -cat("\nResults saved to 'reliability_summary_ehi.csv'\n") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028173106.r b/.history/eohi2/reliability - ehi_20251028173106.r deleted file mode 100644 index 7873ec3..0000000 --- a/.history/eohi2/reliability - ehi_20251028173106.r +++ /dev/null @@ -1,67 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", 
"ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -cat("Missing values per variable:\n") -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -cat("\nSample size for reliability analysis:", nrow(reliability_data), "\n") - -# Cronbach's Alpha -cat("\n=== CRONBACH'S ALPHA ===\n") -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Alternative reliability measures -cat("\n=== ADDITIONAL RELIABILITY MEASURES ===\n") - -# Split-half reliability -split_half <- splitHalf(reliability_data) -cat("Split-half reliability:\n") -print(split_half) - -# McDonald's Omega -omega_result <- omega(reliability_data) -cat("\nMcDonald's Omega:\n") -print(omega_result) - -# Inter-item correlations -cat("\n=== INTER-ITEM CORRELATIONS ===\n") -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -cat("\n=== DESCRIPTIVE STATISTICS ===\n") -desc_stats <- describe(reliability_data) -print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Skewness = round(apply(reliability_data, 2, skew, na.rm = TRUE), 5), - Kurtosis = round(apply(reliability_data, 2, kurtosi, na.rm = TRUE), 5) -) - -cat("\n=== SUMMARY TABLE ===\n") -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) -cat("\nResults saved to 'reliability_summary_ehi.csv'\n") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028173223.r b/.history/eohi2/reliability - ehi_20251028173223.r deleted file mode 100644 index 5960a75..0000000 --- a/.history/eohi2/reliability - ehi_20251028173223.r +++ /dev/null @@ -1,67 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -cat("Missing values per variable:\n") -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -cat("\nSample size for reliability analysis:", nrow(reliability_data), "\n") - -# Cronbach's Alpha -cat("\n=== CRONBACH'S ALPHA ===\n") -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Alternative reliability measures -cat("\n=== ADDITIONAL RELIABILITY MEASURES ===\n") - -# Split-half reliability -split_half <- splitHalf(reliability_data) -cat("Split-half reliability:\n") -print(split_half) - -# Alpha if item dropped -cat("\nAlpha if item dropped:\n") -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) -print(alpha_dropped$alpha.drop) - -# Inter-item correlations -cat("\n=== INTER-ITEM CORRELATIONS ===\n") -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -cat("\n=== DESCRIPTIVE STATISTICS ===\n") -desc_stats <- describe(reliability_data) 
-print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Skewness = round(apply(reliability_data, 2, skew, na.rm = TRUE), 5), - Kurtosis = round(apply(reliability_data, 2, kurtosi, na.rm = TRUE), 5) -) - -cat("\n=== SUMMARY TABLE ===\n") -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) -cat("\nResults saved to 'reliability_summary_ehi.csv'\n") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028173226.r b/.history/eohi2/reliability - ehi_20251028173226.r deleted file mode 100644 index 5960a75..0000000 --- a/.history/eohi2/reliability - ehi_20251028173226.r +++ /dev/null @@ -1,67 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -cat("Missing values per variable:\n") -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -cat("\nSample size for reliability analysis:", nrow(reliability_data), "\n") - -# Cronbach's Alpha -cat("\n=== CRONBACH'S ALPHA ===\n") -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Alternative reliability measures -cat("\n=== ADDITIONAL RELIABILITY MEASURES ===\n") - -# Split-half reliability -split_half <- splitHalf(reliability_data) -cat("Split-half reliability:\n") -print(split_half) - -# Alpha if item dropped -cat("\nAlpha if item dropped:\n") -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) -print(alpha_dropped$alpha.drop) - -# Inter-item correlations -cat("\n=== INTER-ITEM CORRELATIONS ===\n") -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -cat("\n=== DESCRIPTIVE STATISTICS ===\n") -desc_stats <- describe(reliability_data) -print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Skewness = round(apply(reliability_data, 2, skew, na.rm = TRUE), 5), - Kurtosis = round(apply(reliability_data, 2, kurtosi, na.rm = TRUE), 5) -) - -cat("\n=== SUMMARY TABLE ===\n") -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) -cat("\nResults saved to 'reliability_summary_ehi.csv'\n") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028173245.r b/.history/eohi2/reliability - ehi_20251028173245.r deleted file mode 100644 index b2ec649..0000000 --- a/.history/eohi2/reliability - ehi_20251028173245.r +++ /dev/null @@ -1,56 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) 
-library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -print(nrow(reliability_data)) - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Split-half reliability -split_half <- splitHalf(reliability_data) -print(split_half) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) -print(alpha_dropped$alpha.drop) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -desc_stats <- describe(reliability_data) -print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Skewness = round(apply(reliability_data, 2, skew, na.rm = TRUE), 5), - Kurtosis = round(apply(reliability_data, 2, kurtosi, na.rm = TRUE), 5) -) - -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028173247.r b/.history/eohi2/reliability - ehi_20251028173247.r deleted file mode 100644 index b2ec649..0000000 --- a/.history/eohi2/reliability - ehi_20251028173247.r +++ /dev/null @@ -1,56 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -print(nrow(reliability_data)) - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Split-half reliability -split_half <- splitHalf(reliability_data) -print(split_half) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) -print(alpha_dropped$alpha.drop) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -desc_stats <- describe(reliability_data) -print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Skewness = round(apply(reliability_data, 2, skew, na.rm = TRUE), 5), - Kurtosis = round(apply(reliability_data, 2, kurtosi, na.rm = TRUE), 5) -) - -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) \ No newline at end of 
file diff --git a/.history/eohi2/reliability - ehi_20251028173249.r b/.history/eohi2/reliability - ehi_20251028173249.r deleted file mode 100644 index b2ec649..0000000 --- a/.history/eohi2/reliability - ehi_20251028173249.r +++ /dev/null @@ -1,56 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -print(nrow(reliability_data)) - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Split-half reliability -split_half <- splitHalf(reliability_data) -print(split_half) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) -print(alpha_dropped$alpha.drop) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -desc_stats <- describe(reliability_data) -print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Skewness = round(apply(reliability_data, 2, skew, na.rm = TRUE), 5), - Kurtosis = round(apply(reliability_data, 2, kurtosi, na.rm = TRUE), 5) -) - -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028174139.r b/.history/eohi2/reliability - ehi_20251028174139.r deleted file mode 100644 index 80cce25..0000000 --- a/.history/eohi2/reliability - ehi_20251028174139.r +++ /dev/null @@ -1,84 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -print(nrow(reliability_data)) - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Split-half reliability -split_half <- splitHalf(reliability_data) -print(split_half) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) -print(alpha_dropped$alpha.drop) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -desc_stats <- describe(reliability_data) -print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, 
min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Skewness = round(apply(reliability_data, 2, skew, na.rm = TRUE), 5), - Kurtosis = round(apply(reliability_data, 2, kurtosi, na.rm = TRUE), 5) -) - -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) - -# Create HTML output -html_output <- paste0( - "EHI Reliability Analysis", - "

<h1>EHI Reliability Analysis</h1>",
-
- "<h2>Cronbach's Alpha</h2>",
- "<pre>", capture.output(print(alpha_result)), "</pre>",
-
- "<h2>Split-Half Reliability</h2>",
- "<pre>", capture.output(print(split_half)), "</pre>",
-
- "<h2>Alpha if Item Dropped</h2>",
- "<pre>", capture.output(print(alpha_dropped$alpha.drop)), "</pre>",
-
- "<h2>Inter-Item Correlations</h2>",
- "<pre>", capture.output(print(round(cor_matrix, 5))), "</pre>",
-
- "<h2>Descriptive Statistics</h2>",
- "<pre>", capture.output(print(desc_stats)), "</pre>",
-
- "<h2>Summary Table</h2>",
- "<pre>", capture.output(print(summary_table)), "</pre>
", - - "" -) - -writeLines(html_output, "reliability_analysis_ehi.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028174146.r b/.history/eohi2/reliability - ehi_20251028174146.r deleted file mode 100644 index 80cce25..0000000 --- a/.history/eohi2/reliability - ehi_20251028174146.r +++ /dev/null @@ -1,84 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -print(nrow(reliability_data)) - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Split-half reliability -split_half <- splitHalf(reliability_data) -print(split_half) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) -print(alpha_dropped$alpha.drop) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -desc_stats <- describe(reliability_data) -print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Skewness = round(apply(reliability_data, 2, skew, na.rm = TRUE), 5), - Kurtosis = round(apply(reliability_data, 2, kurtosi, na.rm = TRUE), 5) -) - -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) - -# Create HTML output -html_output <- paste0( - "EHI Reliability Analysis", - "

<h1>EHI Reliability Analysis</h1>",
-
- "<h2>Cronbach's Alpha</h2>",
- "<pre>", capture.output(print(alpha_result)), "</pre>",
-
- "<h2>Split-Half Reliability</h2>",
- "<pre>", capture.output(print(split_half)), "</pre>",
-
- "<h2>Alpha if Item Dropped</h2>",
- "<pre>", capture.output(print(alpha_dropped$alpha.drop)), "</pre>",
-
- "<h2>Inter-Item Correlations</h2>",
- "<pre>", capture.output(print(round(cor_matrix, 5))), "</pre>",
-
- "<h2>Descriptive Statistics</h2>",
- "<pre>", capture.output(print(desc_stats)), "</pre>",
-
- "<h2>Summary Table</h2>",
- "<pre>", capture.output(print(summary_table)), "</pre>
", - - "" -) - -writeLines(html_output, "reliability_analysis_ehi.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028174151.r b/.history/eohi2/reliability - ehi_20251028174151.r deleted file mode 100644 index 41bcaf7..0000000 --- a/.history/eohi2/reliability - ehi_20251028174151.r +++ /dev/null @@ -1,84 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -print(nrow(reliability_data)) - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Split-half reliability -split_half <- splitHalf(reliability_data) -print(split_half) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) -print(alpha_dropped$alpha.drop) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -desc_stats <- describe(reliability_data) -print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Skewness = round(apply(reliability_data, 2, skew, na.rm = TRUE), 5), - Kurtosis = round(apply(reliability_data, 2, kurtosi, na.rm = TRUE), 5) -) - -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) - -# Create HTML output -html_output <- paste0( - "EHI Reliability Analysis", - "

<h1>EHI Reliability Analysis</h1>",
-
- "<h2>Cronbach's Alpha</h2>",
- "<pre>", capture.output(print(alpha_result)), "</pre>",
-
- "<h2>Split-Half Reliability</h2>",
- "<pre>", capture.output(print(split_half)), "</pre>",
-
- "<h2>Alpha if Item Dropped</h2>",
- "<pre>", capture.output(print(alpha_dropped$alpha.drop)), "</pre>",
-
- "<h2>Inter-Item Correlations</h2>",
- "<pre>", capture.output(print(round(cor_matrix, 5))), "</pre>",
-
- "<h2>Descriptive Statistics</h2>",
- "<pre>", capture.output(print(desc_stats)), "</pre>",
-
- "<h2>Summary Table</h2>",
- "<pre>", capture.output(print(summary_table)), "</pre>
", - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028174152.r b/.history/eohi2/reliability - ehi_20251028174152.r deleted file mode 100644 index 41bcaf7..0000000 --- a/.history/eohi2/reliability - ehi_20251028174152.r +++ /dev/null @@ -1,84 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -print(nrow(reliability_data)) - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Split-half reliability -split_half <- splitHalf(reliability_data) -print(split_half) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) -print(alpha_dropped$alpha.drop) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -desc_stats <- describe(reliability_data) -print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Skewness = round(apply(reliability_data, 2, skew, na.rm = TRUE), 5), - Kurtosis = round(apply(reliability_data, 2, kurtosi, na.rm = TRUE), 5) -) - -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) - -# Create HTML output -html_output <- paste0( - "EHI Reliability Analysis", - "

<h1>EHI Reliability Analysis</h1>",
-
- "<h2>Cronbach's Alpha</h2>",
- "<pre>", capture.output(print(alpha_result)), "</pre>",
-
- "<h2>Split-Half Reliability</h2>",
- "<pre>", capture.output(print(split_half)), "</pre>",
-
- "<h2>Alpha if Item Dropped</h2>",
- "<pre>", capture.output(print(alpha_dropped$alpha.drop)), "</pre>",
-
- "<h2>Inter-Item Correlations</h2>",
- "<pre>", capture.output(print(round(cor_matrix, 5))), "</pre>",
-
- "<h2>Descriptive Statistics</h2>",
- "<pre>", capture.output(print(desc_stats)), "</pre>",
-
- "<h2>Summary Table</h2>",
- "<pre>", capture.output(print(summary_table)), "</pre>
", - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028174209.r b/.history/eohi2/reliability - ehi_20251028174209.r deleted file mode 100644 index 41bcaf7..0000000 --- a/.history/eohi2/reliability - ehi_20251028174209.r +++ /dev/null @@ -1,84 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -print(nrow(reliability_data)) - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Split-half reliability -split_half <- splitHalf(reliability_data) -print(split_half) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) -print(alpha_dropped$alpha.drop) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -desc_stats <- describe(reliability_data) -print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Skewness = round(apply(reliability_data, 2, skew, na.rm = TRUE), 5), - Kurtosis = round(apply(reliability_data, 2, kurtosi, na.rm = TRUE), 5) -) - -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) - -# Create HTML output -html_output <- paste0( - "EHI Reliability Analysis", - "

<h1>EHI Reliability Analysis</h1>",
-
- "<h2>Cronbach's Alpha</h2>",
- "<pre>", capture.output(print(alpha_result)), "</pre>",
-
- "<h2>Split-Half Reliability</h2>",
- "<pre>", capture.output(print(split_half)), "</pre>",
-
- "<h2>Alpha if Item Dropped</h2>",
- "<pre>", capture.output(print(alpha_dropped$alpha.drop)), "</pre>",
-
- "<h2>Inter-Item Correlations</h2>",
- "<pre>", capture.output(print(round(cor_matrix, 5))), "</pre>",
-
- "<h2>Descriptive Statistics</h2>",
- "<pre>", capture.output(print(desc_stats)), "</pre>",
-
- "<h2>Summary Table</h2>",
- "<pre>", capture.output(print(summary_table)), "</pre>
", - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028174329.r b/.history/eohi2/reliability - ehi_20251028174329.r deleted file mode 100644 index 7906277..0000000 --- a/.history/eohi2/reliability - ehi_20251028174329.r +++ /dev/null @@ -1,90 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -print(nrow(reliability_data)) - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Split-half reliability -split_half <- splitHalf(reliability_data) -print(split_half) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) -print(alpha_dropped$alpha.drop) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -desc_stats <- describe(reliability_data) -print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Skewness = round(apply(reliability_data, 2, skew, na.rm = TRUE), 5), - Kurtosis = round(apply(reliability_data, 2, kurtosi, na.rm = TRUE), 5) -) - -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) - -# Create HTML output -html_output <- sprintf( - "EHI Reliability Analysis -

<h1>EHI Reliability Analysis</h1>
-
-<h2>Cronbach's Alpha</h2>
-<pre>%s</pre>
-
-<h2>Split-Half Reliability</h2>
-<pre>%s</pre>
-
-<h2>Alpha if Item Dropped</h2>
-<pre>%s</pre>
-
-<h2>Inter-Item Correlations</h2>
-<pre>%s</pre>
-
-<h2>Descriptive Statistics</h2>
-<pre>%s</pre>
-
-<h2>Summary Table</h2>
-<pre>%s</pre>
- - ", - paste(capture.output(print(alpha_result)), collapse = "\n"), - paste(capture.output(print(split_half)), collapse = "\n"), - paste(capture.output(print(alpha_dropped$alpha.drop)), collapse = "\n"), - paste(capture.output(print(round(cor_matrix, 5))), collapse = "\n"), - paste(capture.output(print(desc_stats)), collapse = "\n"), - paste(capture.output(print(summary_table)), collapse = "\n") -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028174333.r b/.history/eohi2/reliability - ehi_20251028174333.r deleted file mode 100644 index 7906277..0000000 --- a/.history/eohi2/reliability - ehi_20251028174333.r +++ /dev/null @@ -1,90 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -print(nrow(reliability_data)) - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Split-half reliability -split_half <- splitHalf(reliability_data) -print(split_half) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) -print(alpha_dropped$alpha.drop) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -desc_stats <- describe(reliability_data) -print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Skewness = round(apply(reliability_data, 2, skew, na.rm = TRUE), 5), - Kurtosis = round(apply(reliability_data, 2, kurtosi, na.rm = TRUE), 5) -) - -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) - -# Create HTML output -html_output <- sprintf( - "EHI Reliability Analysis -

<h1>EHI Reliability Analysis</h1>
-
-<h2>Cronbach's Alpha</h2>
-<pre>%s</pre>
-
-<h2>Split-Half Reliability</h2>
-<pre>%s</pre>
-
-<h2>Alpha if Item Dropped</h2>
-<pre>%s</pre>
-
-<h2>Inter-Item Correlations</h2>
-<pre>%s</pre>
-
-<h2>Descriptive Statistics</h2>
-<pre>%s</pre>
-
-<h2>Summary Table</h2>
-<pre>%s</pre>
- - ", - paste(capture.output(print(alpha_result)), collapse = "\n"), - paste(capture.output(print(split_half)), collapse = "\n"), - paste(capture.output(print(alpha_dropped$alpha.drop)), collapse = "\n"), - paste(capture.output(print(round(cor_matrix, 5))), collapse = "\n"), - paste(capture.output(print(desc_stats)), collapse = "\n"), - paste(capture.output(print(summary_table)), collapse = "\n") -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028174401.r b/.history/eohi2/reliability - ehi_20251028174401.r deleted file mode 100644 index b057e40..0000000 --- a/.history/eohi2/reliability - ehi_20251028174401.r +++ /dev/null @@ -1,80 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -print(nrow(reliability_data)) - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Split-half reliability -split_half <- splitHalf(reliability_data) -print(split_half) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) -print(alpha_dropped$alpha.drop) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -desc_stats <- describe(reliability_data) -print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Skewness = round(apply(reliability_data, 2, skew, na.rm = TRUE), 5), - Kurtosis = round(apply(reliability_data, 2, kurtosi, na.rm = TRUE), 5) -) - -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) - -# Create HTML output using sink() -sink("temp_output.txt") -print(alpha_result) -print(split_half) -print(alpha_dropped$alpha.drop) -print(round(cor_matrix, 5)) -print(desc_stats) -print(summary_table) -sink() - -# Read the captured output -captured_output <- readLines("temp_output.txt") -file.remove("temp_output.txt") - -html_output <- sprintf( - "EHI Reliability Analysis -

<h1>EHI Reliability Analysis</h1>
-<pre>%s</pre>
- ", - paste(captured_output, collapse = "\n") -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028174407.r b/.history/eohi2/reliability - ehi_20251028174407.r deleted file mode 100644 index b057e40..0000000 --- a/.history/eohi2/reliability - ehi_20251028174407.r +++ /dev/null @@ -1,80 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -print(nrow(reliability_data)) - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Split-half reliability -split_half <- splitHalf(reliability_data) -print(split_half) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) -print(alpha_dropped$alpha.drop) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -desc_stats <- describe(reliability_data) -print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Skewness = round(apply(reliability_data, 2, skew, na.rm = TRUE), 5), - Kurtosis = round(apply(reliability_data, 2, kurtosi, na.rm = TRUE), 5) -) - -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) - -# Create HTML output using sink() -sink("temp_output.txt") -print(alpha_result) -print(split_half) -print(alpha_dropped$alpha.drop) -print(round(cor_matrix, 5)) -print(desc_stats) -print(summary_table) -sink() - -# Read the captured output -captured_output <- readLines("temp_output.txt") -file.remove("temp_output.txt") - -html_output <- sprintf( - "EHI Reliability Analysis -

<h1>EHI Reliability Analysis</h1>
-<pre>%s</pre>
- ", - paste(captured_output, collapse = "\n") -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028174412.r b/.history/eohi2/reliability - ehi_20251028174412.r deleted file mode 100644 index b057e40..0000000 --- a/.history/eohi2/reliability - ehi_20251028174412.r +++ /dev/null @@ -1,80 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -print(nrow(reliability_data)) - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Split-half reliability -split_half <- splitHalf(reliability_data) -print(split_half) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) -print(alpha_dropped$alpha.drop) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -desc_stats <- describe(reliability_data) -print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Skewness = round(apply(reliability_data, 2, skew, na.rm = TRUE), 5), - Kurtosis = round(apply(reliability_data, 2, kurtosi, na.rm = TRUE), 5) -) - -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) - -# Create HTML output using sink() -sink("temp_output.txt") -print(alpha_result) -print(split_half) -print(alpha_dropped$alpha.drop) -print(round(cor_matrix, 5)) -print(desc_stats) -print(summary_table) -sink() - -# Read the captured output -captured_output <- readLines("temp_output.txt") -file.remove("temp_output.txt") - -html_output <- sprintf( - "EHI Reliability Analysis -

<h1>EHI Reliability Analysis</h1>
-<pre>%s</pre>
- ", - paste(captured_output, collapse = "\n") -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028174501.r b/.history/eohi2/reliability - ehi_20251028174501.r deleted file mode 100644 index abebca4..0000000 --- a/.history/eohi2/reliability - ehi_20251028174501.r +++ /dev/null @@ -1,87 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -print(nrow(reliability_data)) - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Split-half reliability -split_half <- splitHalf(reliability_data) -print(split_half) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) -print(alpha_dropped$alpha.drop) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -desc_stats <- describe(reliability_data) -print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Skewness = round(apply(reliability_data, 2, skew, na.rm = TRUE), 5), - Kurtosis = round(apply(reliability_data, 2, kurtosi, na.rm = TRUE), 5) -) - -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) - -# Create HTML output using knitr -library(knitr) - -html_output <- paste0( - "EHI Reliability Analysis", - "

<h1>EHI Reliability Analysis</h1>",
-
- "<h2>Cronbach's Alpha</h2>",
- kable(alpha_result$total, format = "html"),
-
- "<h2>Split-Half Reliability</h2>",
- "<p>Maximum split half reliability: ", split_half$maxrb, "</p>",
- "<p>Average split half reliability: ", split_half$avrb, "</p>",
-
- "<h2>Alpha if Item Dropped</h2>",
- kable(alpha_dropped$alpha.drop, format = "html"),
-
- "<h2>Inter-Item Correlations</h2>",
- kable(round(cor_matrix, 5), format = "html"),
-
- "<h2>Descriptive Statistics</h2>",
- kable(desc_stats, format = "html"),
-
- "<h2>Summary Table</h2>

", - kable(summary_table, format = "html"), - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028174503.r b/.history/eohi2/reliability - ehi_20251028174503.r deleted file mode 100644 index abebca4..0000000 --- a/.history/eohi2/reliability - ehi_20251028174503.r +++ /dev/null @@ -1,87 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -print(nrow(reliability_data)) - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Split-half reliability -split_half <- splitHalf(reliability_data) -print(split_half) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) -print(alpha_dropped$alpha.drop) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -desc_stats <- describe(reliability_data) -print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Skewness = round(apply(reliability_data, 2, skew, na.rm = TRUE), 5), - Kurtosis = round(apply(reliability_data, 2, kurtosi, na.rm = TRUE), 5) -) - -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) - -# Create HTML output using knitr -library(knitr) - -html_output <- paste0( - "EHI Reliability Analysis", - "

<h1>EHI Reliability Analysis</h1>",
-
- "<h2>Cronbach's Alpha</h2>",
- kable(alpha_result$total, format = "html"),
-
- "<h2>Split-Half Reliability</h2>",
- "<p>Maximum split half reliability: ", split_half$maxrb, "</p>",
- "<p>Average split half reliability: ", split_half$avrb, "</p>",
-
- "<h2>Alpha if Item Dropped</h2>",
- kable(alpha_dropped$alpha.drop, format = "html"),
-
- "<h2>Inter-Item Correlations</h2>",
- kable(round(cor_matrix, 5), format = "html"),
-
- "<h2>Descriptive Statistics</h2>",
- kable(desc_stats, format = "html"),
-
- "<h2>Summary Table</h2>

", - kable(summary_table, format = "html"), - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028174701.r b/.history/eohi2/reliability - ehi_20251028174701.r deleted file mode 100644 index abebca4..0000000 --- a/.history/eohi2/reliability - ehi_20251028174701.r +++ /dev/null @@ -1,87 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -print(nrow(reliability_data)) - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Split-half reliability -split_half <- splitHalf(reliability_data) -print(split_half) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) -print(alpha_dropped$alpha.drop) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -desc_stats <- describe(reliability_data) -print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Skewness = round(apply(reliability_data, 2, skew, na.rm = TRUE), 5), - Kurtosis = round(apply(reliability_data, 2, kurtosi, na.rm = TRUE), 5) -) - -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) - -# Create HTML output using knitr -library(knitr) - -html_output <- paste0( - "EHI Reliability Analysis", - "
<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<br>Maximum split half reliability: ", split_half$maxrb, "<br>", - "<br>Average split half reliability: ", split_half$avrb, "<br>", - - "<h2>Alpha if Item Dropped</h2>", - kable(alpha_dropped$alpha.drop, format = "html"), - - "<h2>Inter-Item Correlations</h2>", - kable(round(cor_matrix, 5), format = "html"), - - "<h2>Descriptive Statistics</h2>", - kable(desc_stats, format = "html"), - - "<h2>Summary Table</h2>
", - kable(summary_table, format = "html"), - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028174922.r b/.history/eohi2/reliability - ehi_20251028174922.r deleted file mode 100644 index a59e403..0000000 --- a/.history/eohi2/reliability - ehi_20251028174922.r +++ /dev/null @@ -1,88 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -print(nrow(reliability_data)) - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Split-half reliability -split_half <- splitHalf(reliability_data) -print(split_half) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) -print(alpha_dropped$alpha.drop) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -desc_stats <- describe(reliability_data) -print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Skewness = round(apply(reliability_data, 2, skew, na.rm = TRUE), 5), - Kurtosis = round(apply(reliability_data, 2, kurtosi, na.rm = TRUE), 5) -) - -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) - -# Create HTML output using knitr -library(knitr) - -html_output <- paste0( - "EHI Reliability Analysis", - "
<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<br>Maximum split half reliability: ", round(split_half$maxrb, 5), "<br>", - "<br>Average split half reliability: ", round(split_half$avrb, 5), "<br>", - "<br>Minimum split half reliability: ", round(split_half$minrb, 5), "<br>", - - "<h2>Alpha if Item Dropped</h2>", - kable(alpha_dropped$alpha.drop, format = "html"), - - "<h2>Inter-Item Correlations</h2>", - kable(round(cor_matrix, 5), format = "html"), - - "<h2>Descriptive Statistics</h2>", - kable(desc_stats, format = "html"), - - "<h2>Summary Table</h2>
", - kable(summary_table, format = "html"), - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028174926.r b/.history/eohi2/reliability - ehi_20251028174926.r deleted file mode 100644 index a59e403..0000000 --- a/.history/eohi2/reliability - ehi_20251028174926.r +++ /dev/null @@ -1,88 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -print(nrow(reliability_data)) - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Split-half reliability -split_half <- splitHalf(reliability_data) -print(split_half) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) -print(alpha_dropped$alpha.drop) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -desc_stats <- describe(reliability_data) -print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Skewness = round(apply(reliability_data, 2, skew, na.rm = TRUE), 5), - Kurtosis = round(apply(reliability_data, 2, kurtosi, na.rm = TRUE), 5) -) - -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) - -# Create HTML output using knitr -library(knitr) - -html_output <- paste0( - "EHI Reliability Analysis", - "
<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<br>Maximum split half reliability: ", round(split_half$maxrb, 5), "<br>", - "<br>Average split half reliability: ", round(split_half$avrb, 5), "<br>", - "<br>Minimum split half reliability: ", round(split_half$minrb, 5), "<br>", - - "<h2>Alpha if Item Dropped</h2>", - kable(alpha_dropped$alpha.drop, format = "html"), - - "<h2>Inter-Item Correlations</h2>", - kable(round(cor_matrix, 5), format = "html"), - - "<h2>Descriptive Statistics</h2>", - kable(desc_stats, format = "html"), - - "<h2>Summary Table</h2>
", - kable(summary_table, format = "html"), - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028174958.r b/.history/eohi2/reliability - ehi_20251028174958.r deleted file mode 100644 index e4001fa..0000000 --- a/.history/eohi2/reliability - ehi_20251028174958.r +++ /dev/null @@ -1,88 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -print(nrow(reliability_data)) - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Split-half reliability -split_half <- splitHalf(reliability_data) -print(split_half) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) -print(alpha_dropped$alpha.drop) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -desc_stats <- describe(reliability_data) -print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) - -# Create HTML output using knitr -library(knitr) - -html_output <- paste0( - "EHI Reliability Analysis", - "
<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<br>Maximum split half reliability: ", round(split_half$maxrb, 5), "<br>", - "<br>Average split half reliability: ", round(split_half$avrb, 5), "<br>", - "<br>Minimum split half reliability: ", round(split_half$minrb, 5), "<br>", - - "<h2>Alpha if Item Dropped</h2>", - kable(alpha_dropped$alpha.drop, format = "html"), - - "<h2>Inter-Item Correlations</h2>", - kable(round(cor_matrix, 5), format = "html"), - - "<h2>Descriptive Statistics</h2>", - kable(desc_stats, format = "html"), - - "<h2>Summary Table</h2>
", - kable(summary_table, format = "html"), - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028175001.r b/.history/eohi2/reliability - ehi_20251028175001.r deleted file mode 100644 index e4001fa..0000000 --- a/.history/eohi2/reliability - ehi_20251028175001.r +++ /dev/null @@ -1,88 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -print(nrow(reliability_data)) - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Split-half reliability -split_half <- splitHalf(reliability_data) -print(split_half) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) -print(alpha_dropped$alpha.drop) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -desc_stats <- describe(reliability_data) -print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) - -# Create HTML output using knitr -library(knitr) - -html_output <- paste0( - "EHI Reliability Analysis", - "
<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<br>Maximum split half reliability: ", round(split_half$maxrb, 5), "<br>", - "<br>Average split half reliability: ", round(split_half$avrb, 5), "<br>", - "<br>Minimum split half reliability: ", round(split_half$minrb, 5), "<br>", - - "<h2>Alpha if Item Dropped</h2>", - kable(alpha_dropped$alpha.drop, format = "html"), - - "<h2>Inter-Item Correlations</h2>", - kable(round(cor_matrix, 5), format = "html"), - - "<h2>Descriptive Statistics</h2>", - kable(desc_stats, format = "html"), - - "<h2>Summary Table</h2>
", - kable(summary_table, format = "html"), - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028175110.r b/.history/eohi2/reliability - ehi_20251028175110.r deleted file mode 100644 index e4001fa..0000000 --- a/.history/eohi2/reliability - ehi_20251028175110.r +++ /dev/null @@ -1,88 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -print(nrow(reliability_data)) - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Split-half reliability -split_half <- splitHalf(reliability_data) -print(split_half) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) -print(alpha_dropped$alpha.drop) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -desc_stats <- describe(reliability_data) -print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) - -# Create HTML output using knitr -library(knitr) - -html_output <- paste0( - "EHI Reliability Analysis", - "
<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<br>Maximum split half reliability: ", round(split_half$maxrb, 5), "<br>", - "<br>Average split half reliability: ", round(split_half$avrb, 5), "<br>", - "<br>Minimum split half reliability: ", round(split_half$minrb, 5), "<br>", - - "<h2>Alpha if Item Dropped</h2>", - kable(alpha_dropped$alpha.drop, format = "html"), - - "<h2>Inter-Item Correlations</h2>", - kable(round(cor_matrix, 5), format = "html"), - - "<h2>Descriptive Statistics</h2>", - kable(desc_stats, format = "html"), - - "<h2>Summary Table</h2>
", - kable(summary_table, format = "html"), - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028175138.r b/.history/eohi2/reliability - ehi_20251028175138.r deleted file mode 100644 index 091959a..0000000 --- a/.history/eohi2/reliability - ehi_20251028175138.r +++ /dev/null @@ -1,88 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -print(nrow(reliability_data)) - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Split-half reliability -split_half <- splitHalf(reliability_data) -print(split_half) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) -print(alpha_dropped$alpha.drop) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -desc_stats <- describe(reliability_data) -print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) - -# Create HTML output using knitr -library(knitr) - -html_output <- paste0( - "EHI Reliability Analysis", - "
<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<br>Maximum split half reliability: ", round(split_half$maxrb, 5), "<br>", - "<br>Average split half reliability: ", round(as.numeric(split_half$avrb), 5), "<br>", - "<br>Minimum split half reliability: ", round(split_half$minrb, 5), "<br>", - - "<h2>Alpha if Item Dropped</h2>", - kable(alpha_dropped$alpha.drop, format = "html"), - - "<h2>Inter-Item Correlations</h2>", - kable(round(cor_matrix, 5), format = "html"), - - "<h2>Descriptive Statistics</h2>", - kable(desc_stats, format = "html"), - - "<h2>Summary Table</h2>
", - kable(summary_table, format = "html"), - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028175140.r b/.history/eohi2/reliability - ehi_20251028175140.r deleted file mode 100644 index 091959a..0000000 --- a/.history/eohi2/reliability - ehi_20251028175140.r +++ /dev/null @@ -1,88 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -print(nrow(reliability_data)) - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Split-half reliability -split_half <- splitHalf(reliability_data) -print(split_half) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) -print(alpha_dropped$alpha.drop) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -desc_stats <- describe(reliability_data) -print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) - -# Create HTML output using knitr -library(knitr) - -html_output <- paste0( - "EHI Reliability Analysis", - "
<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<br>Maximum split half reliability: ", round(split_half$maxrb, 5), "<br>", - "<br>Average split half reliability: ", round(as.numeric(split_half$avrb), 5), "<br>", - "<br>Minimum split half reliability: ", round(split_half$minrb, 5), "<br>", - - "<h2>Alpha if Item Dropped</h2>", - kable(alpha_dropped$alpha.drop, format = "html"), - - "<h2>Inter-Item Correlations</h2>", - kable(round(cor_matrix, 5), format = "html"), - - "<h2>Descriptive Statistics</h2>", - kable(desc_stats, format = "html"), - - "<h2>Summary Table</h2>
", - kable(summary_table, format = "html"), - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028175150.r b/.history/eohi2/reliability - ehi_20251028175150.r deleted file mode 100644 index 2fa62e7..0000000 --- a/.history/eohi2/reliability - ehi_20251028175150.r +++ /dev/null @@ -1,87 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -print(nrow(reliability_data)) - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Split-half reliability -split_half <- splitHalf(reliability_data) -print(split_half) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) -print(alpha_dropped$alpha.drop) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -desc_stats <- describe(reliability_data) -print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) - -# Create HTML output using knitr -library(knitr) - -html_output <- paste0( - "EHI Reliability Analysis", - "
<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<br>Maximum split half reliability: ", round(split_half$maxrb, 5), "<br>", - "<br>Minimum split half reliability: ", round(split_half$minrb, 5), "<br>", - - "<h2>Alpha if Item Dropped</h2>", - kable(alpha_dropped$alpha.drop, format = "html"), - - "<h2>Inter-Item Correlations</h2>", - kable(round(cor_matrix, 5), format = "html"), - - "<h2>Descriptive Statistics</h2>", - kable(desc_stats, format = "html"), - - "<h2>Summary Table</h2>
", - kable(summary_table, format = "html"), - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028175152.r b/.history/eohi2/reliability - ehi_20251028175152.r deleted file mode 100644 index 2fa62e7..0000000 --- a/.history/eohi2/reliability - ehi_20251028175152.r +++ /dev/null @@ -1,87 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -print(nrow(reliability_data)) - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Split-half reliability -split_half <- splitHalf(reliability_data) -print(split_half) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) -print(alpha_dropped$alpha.drop) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -desc_stats <- describe(reliability_data) -print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) - -# Create HTML output using knitr -library(knitr) - -html_output <- paste0( - "EHI Reliability Analysis", - "
<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<br>Maximum split half reliability: ", round(split_half$maxrb, 5), "<br>", - "<br>Minimum split half reliability: ", round(split_half$minrb, 5), "<br>", - - "<h2>Alpha if Item Dropped</h2>", - kable(alpha_dropped$alpha.drop, format = "html"), - - "<h2>Inter-Item Correlations</h2>", - kable(round(cor_matrix, 5), format = "html"), - - "<h2>Descriptive Statistics</h2>", - kable(desc_stats, format = "html"), - - "<h2>Summary Table</h2>
", - kable(summary_table, format = "html"), - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028175211.r b/.history/eohi2/reliability - ehi_20251028175211.r deleted file mode 100644 index 9ba5348..0000000 --- a/.history/eohi2/reliability - ehi_20251028175211.r +++ /dev/null @@ -1,86 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -print(nrow(reliability_data)) - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Split-half reliability -split_half <- splitHalf(reliability_data) -print(split_half) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) -print(alpha_dropped$alpha.drop) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -desc_stats <- describe(reliability_data) -print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) - -# Create HTML output using knitr -library(knitr) - -html_output <- paste0( - "EHI Reliability Analysis", - "
<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<br>Maximum split half reliability: ", round(split_half$maxrb, 5), "<br>", - - "<h2>Alpha if Item Dropped</h2>", - kable(alpha_dropped$alpha.drop, format = "html"), - - "<h2>Inter-Item Correlations</h2>", - kable(round(cor_matrix, 5), format = "html"), - - "<h2>Descriptive Statistics</h2>", - kable(desc_stats, format = "html"), - - "<h2>Summary Table</h2>
", - kable(summary_table, format = "html"), - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028175214.r b/.history/eohi2/reliability - ehi_20251028175214.r deleted file mode 100644 index 9ba5348..0000000 --- a/.history/eohi2/reliability - ehi_20251028175214.r +++ /dev/null @@ -1,86 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Select the 4 variables for reliability analysis -reliability_vars <- df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Check for missing values -print(colSums(is.na(reliability_vars))) - -# Remove rows with any missing values for reliability analysis -reliability_data <- reliability_vars[complete.cases(reliability_vars), ] - -print(nrow(reliability_data)) - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) -print(alpha_result) - -# Split-half reliability -split_half <- splitHalf(reliability_data) -print(split_half) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) -print(alpha_dropped$alpha.drop) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") -print(round(cor_matrix, 5)) - -# Descriptive statistics -desc_stats <- describe(reliability_data) -print(desc_stats) - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -print(summary_table) - -# Save results -write.csv(summary_table, "reliability_summary_ehi.csv", row.names = FALSE) - -# Create HTML output using knitr -library(knitr) - -html_output <- paste0( - "EHI Reliability Analysis", - "
<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<br>Maximum split half reliability: ", round(split_half$maxrb, 5), "<br>", - - "<h2>Alpha if Item Dropped</h2>", - kable(alpha_dropped$alpha.drop, format = "html"), - - "<h2>Inter-Item Correlations</h2>", - kable(round(cor_matrix, 5), format = "html"), - - "<h2>Descriptive Statistics</h2>", - kable(desc_stats, format = "html"), - - "<h2>Summary Table</h2>
", - kable(summary_table, format = "html"), - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028175223.r b/.history/eohi2/reliability - ehi_20251028175223.r deleted file mode 100644 index 80cb31d..0000000 --- a/.history/eohi2/reliability - ehi_20251028175223.r +++ /dev/null @@ -1,61 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(knitr) - -# Select the 4 variables for reliability analysis -reliability_data <- df[complete.cases(df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")]), - c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) - -# Split-half reliability -split_half <- splitHalf(reliability_data) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -html_output <- paste0( - "EHI Reliability Analysis", - "
<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<br>Maximum split half reliability: ", round(split_half$maxrb, 5), "<br>", - - "<h2>Alpha if Item Dropped</h2>", - kable(alpha_dropped$alpha.drop, format = "html"), - - "<h2>Inter-Item Correlations</h2>", - kable(round(cor_matrix, 5), format = "html"), - - "<h2>Descriptive Statistics</h2>", - kable(desc_stats, format = "html"), - - "<h2>Summary Table</h2>
", - kable(summary_table, format = "html"), - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028175227.r b/.history/eohi2/reliability - ehi_20251028175227.r deleted file mode 100644 index 22e1ff0..0000000 --- a/.history/eohi2/reliability - ehi_20251028175227.r +++ /dev/null @@ -1,58 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(knitr) - -# Select the 4 variables for reliability analysis -reliability_data <- df[complete.cases(df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")]), - c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) - -# Split-half reliability -split_half <- splitHalf(reliability_data) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -html_output <- paste0( - "EHI Reliability Analysis", - "
<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<br>Maximum split half reliability: ", round(split_half$maxrb, 5), "<br>", - - "<h2>Alpha if Item Dropped</h2>", - kable(alpha_dropped$alpha.drop, format = "html"), - - "<h2>Inter-Item Correlations</h2>", - kable(round(cor_matrix, 5), format = "html"), - - "<h2>Summary Table</h2>
", - kable(summary_table, format = "html"), - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028175230.r b/.history/eohi2/reliability - ehi_20251028175230.r deleted file mode 100644 index 22e1ff0..0000000 --- a/.history/eohi2/reliability - ehi_20251028175230.r +++ /dev/null @@ -1,58 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(knitr) - -# Select the 4 variables for reliability analysis -reliability_data <- df[complete.cases(df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")]), - c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) - -# Split-half reliability -split_half <- splitHalf(reliability_data) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -html_output <- paste0( - "EHI Reliability Analysis", - "
<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<br>Maximum split half reliability: ", round(split_half$maxrb, 5), "<br>", - - "<h2>Alpha if Item Dropped</h2>", - kable(alpha_dropped$alpha.drop, format = "html"), - - "<h2>Inter-Item Correlations</h2>", - kable(round(cor_matrix, 5), format = "html"), - - "<h2>Summary Table</h2>
", - kable(summary_table, format = "html"), - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028175322.r b/.history/eohi2/reliability - ehi_20251028175322.r deleted file mode 100644 index 22e1ff0..0000000 --- a/.history/eohi2/reliability - ehi_20251028175322.r +++ /dev/null @@ -1,58 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(knitr) - -# Select the 4 variables for reliability analysis -reliability_data <- df[complete.cases(df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")]), - c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) - -# Split-half reliability -split_half <- splitHalf(reliability_data) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -html_output <- paste0( - "EHI Reliability Analysis", - "
<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<br>Maximum split half reliability: ", round(split_half$maxrb, 5), "<br>", - - "<h2>Alpha if Item Dropped</h2>", - kable(alpha_dropped$alpha.drop, format = "html"), - - "<h2>Inter-Item Correlations</h2>", - kable(round(cor_matrix, 5), format = "html"), - - "<h2>Summary Table</h2>
", - kable(summary_table, format = "html"), - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028175530.r b/.history/eohi2/reliability - ehi_20251028175530.r deleted file mode 100644 index 6e64796..0000000 --- a/.history/eohi2/reliability - ehi_20251028175530.r +++ /dev/null @@ -1,61 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(knitr) - -# Select the 4 variables for reliability analysis -reliability_data <- df[complete.cases(df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")]), - c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) - -# Split-half reliability -split_half <- splitHalf(reliability_data) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -html_output <- paste0( - "EHI Reliability Analysis", - "
<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<br>Maximum split half reliability: ", round(split_half$maxrb, 5), "<br>", - - "<h2>Item-Level Statistics</h2>", - kable(alpha_dropped$alpha.drop, format = "html"), - - "<h2>Corrected Item-Total Correlations</h2>", - kable(alpha_result$item.stats, format = "html"), - - "<h2>Inter-Item Correlations</h2>", - kable(round(cor_matrix, 5), format = "html"), - - "<h2>Summary Table</h2>
", - kable(summary_table, format = "html"), - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028175535.r b/.history/eohi2/reliability - ehi_20251028175535.r deleted file mode 100644 index 6e64796..0000000 --- a/.history/eohi2/reliability - ehi_20251028175535.r +++ /dev/null @@ -1,61 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(knitr) - -# Select the 4 variables for reliability analysis -reliability_data <- df[complete.cases(df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")]), - c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) - -# Split-half reliability -split_half <- splitHalf(reliability_data) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -html_output <- paste0( - "EHI Reliability Analysis", - "
<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<br>Maximum split half reliability: ", round(split_half$maxrb, 5), "<br>", - - "<h2>Item-Level Statistics</h2>", - kable(alpha_dropped$alpha.drop, format = "html"), - - "<h2>Corrected Item-Total Correlations</h2>", - kable(alpha_result$item.stats, format = "html"), - - "<h2>Inter-Item Correlations</h2>", - kable(round(cor_matrix, 5), format = "html"), - - "<h2>Summary Table</h2>
", - kable(summary_table, format = "html"), - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028175559.r b/.history/eohi2/reliability - ehi_20251028175559.r deleted file mode 100644 index 6e64796..0000000 --- a/.history/eohi2/reliability - ehi_20251028175559.r +++ /dev/null @@ -1,61 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(knitr) - -# Select the 4 variables for reliability analysis -reliability_data <- df[complete.cases(df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")]), - c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) - -# Split-half reliability -split_half <- splitHalf(reliability_data) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -html_output <- paste0( - "EHI Reliability Analysis", - "
<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<br>Maximum split half reliability: ", round(split_half$maxrb, 5), "<br>", - - "<h2>Item-Level Statistics</h2>", - kable(alpha_dropped$alpha.drop, format = "html"), - - "<h2>Corrected Item-Total Correlations</h2>", - kable(alpha_result$item.stats, format = "html"), - - "<h2>Inter-Item Correlations</h2>", - kable(round(cor_matrix, 5), format = "html"), - - "<h2>Summary Table</h2>
", - kable(summary_table, format = "html"), - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028175617.r b/.history/eohi2/reliability - ehi_20251028175617.r deleted file mode 100644 index c731daa..0000000 --- a/.history/eohi2/reliability - ehi_20251028175617.r +++ /dev/null @@ -1,58 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(knitr) - -# Select the 4 variables for reliability analysis -reliability_data <- df[complete.cases(df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")]), - c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) - -# Split-half reliability -split_half <- splitHalf(reliability_data) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -html_output <- paste0( - "EHI Reliability Analysis", - "
<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<br>Maximum split half reliability: ", round(split_half$maxrb, 5), "<br>", - - "<h2>Item-Level Statistics</h2>", - kable(alpha_dropped$alpha.drop, format = "html"), - - "<h2>Corrected Item-Total Correlations</h2>", - kable(alpha_result$item.stats, format = "html"), - - "<h2>Summary Table</h2>
", - kable(summary_table, format = "html"), - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028175618.r b/.history/eohi2/reliability - ehi_20251028175618.r deleted file mode 100644 index c731daa..0000000 --- a/.history/eohi2/reliability - ehi_20251028175618.r +++ /dev/null @@ -1,58 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(knitr) - -# Select the 4 variables for reliability analysis -reliability_data <- df[complete.cases(df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")]), - c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) - -# Split-half reliability -split_half <- splitHalf(reliability_data) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -html_output <- paste0( - "EHI Reliability Analysis", - "
<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<br>Maximum split half reliability: ", round(split_half$maxrb, 5), "<br>", - - "<h2>Item-Level Statistics</h2>", - kable(alpha_dropped$alpha.drop, format = "html"), - - "<h2>Corrected Item-Total Correlations</h2>", - kable(alpha_result$item.stats, format = "html"), - - "<h2>Summary Table</h2>
", - kable(summary_table, format = "html"), - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028175644.r b/.history/eohi2/reliability - ehi_20251028175644.r deleted file mode 100644 index 36d75a8..0000000 --- a/.history/eohi2/reliability - ehi_20251028175644.r +++ /dev/null @@ -1,58 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(knitr) - -# Select the 4 variables for reliability analysis -reliability_data <- df[complete.cases(df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")]), - c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) - -# Split-half reliability -split_half <- splitHalf(reliability_data) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -html_output <- paste0( - "EHI Reliability Analysis", - "
<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<br>Maximum split half reliability: ", round(split_half$maxrb, 5), "<br>", - - "<h2>Item-Level Statistics</h2>", - kable(alpha_dropped$alpha.drop, format = "html"), - - "<h2>Item-Level Statistics</h2>", - kable(alpha_result$item.stats, format = "html"), - - "<h2>Summary Table</h2>
", - kable(summary_table, format = "html"), - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028175648.r b/.history/eohi2/reliability - ehi_20251028175648.r deleted file mode 100644 index 36d75a8..0000000 --- a/.history/eohi2/reliability - ehi_20251028175648.r +++ /dev/null @@ -1,58 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(knitr) - -# Select the 4 variables for reliability analysis -reliability_data <- df[complete.cases(df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")]), - c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) - -# Split-half reliability -split_half <- splitHalf(reliability_data) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -html_output <- paste0( - "EHI Reliability Analysis", - "
<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<br>Maximum split half reliability: ", round(split_half$maxrb, 5), "<br>", - - "<h2>Item-Level Statistics</h2>", - kable(alpha_dropped$alpha.drop, format = "html"), - - "<h2>Item-Level Statistics</h2>", - kable(alpha_result$item.stats, format = "html"), - - "<h2>Summary Table</h2>
", - kable(summary_table, format = "html"), - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028175651.r b/.history/eohi2/reliability - ehi_20251028175651.r deleted file mode 100644 index 36d75a8..0000000 --- a/.history/eohi2/reliability - ehi_20251028175651.r +++ /dev/null @@ -1,58 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(knitr) - -# Select the 4 variables for reliability analysis -reliability_data <- df[complete.cases(df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")]), - c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) - -# Split-half reliability -split_half <- splitHalf(reliability_data) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -html_output <- paste0( - "EHI Reliability Analysis", - "
<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<br>Maximum split half reliability: ", round(split_half$maxrb, 5), "<br>", - - "<h2>Item-Level Statistics</h2>", - kable(alpha_dropped$alpha.drop, format = "html"), - - "<h2>Item-Level Statistics</h2>", - kable(alpha_result$item.stats, format = "html"), - - "<h2>Summary Table</h2>
", - kable(summary_table, format = "html"), - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028180009.r b/.history/eohi2/reliability - ehi_20251028180009.r deleted file mode 100644 index 84c0c15..0000000 --- a/.history/eohi2/reliability - ehi_20251028180009.r +++ /dev/null @@ -1,56 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(knitr) - -# Select the 4 variables for reliability analysis -reliability_data <- df[complete.cases(df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")]), - c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) - -# Split-half reliability -split_half <- splitHalf(reliability_data) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -html_output <- paste0( - "EHI Reliability Analysis", - "
<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<br>Maximum split half reliability: ", round(split_half$maxrb, 5), "<br>", - - "<h2>Item-Level Statistics</h2>", - kable(alpha_dropped$alpha.drop, format = "html"), - - "<h2>Item-Level Statistics</h2>
", - kable(alpha_result$item.stats, format = "html"), - - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028180012.r b/.history/eohi2/reliability - ehi_20251028180012.r deleted file mode 100644 index 6ee7cdf..0000000 --- a/.history/eohi2/reliability - ehi_20251028180012.r +++ /dev/null @@ -1,53 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(knitr) - -# Select the 4 variables for reliability analysis -reliability_data <- df[complete.cases(df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")]), - c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) - -# Split-half reliability -split_half <- splitHalf(reliability_data) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -html_output <- paste0( - "EHI Reliability Analysis", - "

<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<p>Maximum split half reliability: ", round(split_half$maxrb, 5), "</p>", - - "<h2>Item-Level Statistics</h2>
", - kable(alpha_result$item.stats, format = "html"), - - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028180015.r b/.history/eohi2/reliability - ehi_20251028180015.r deleted file mode 100644 index 6ee7cdf..0000000 --- a/.history/eohi2/reliability - ehi_20251028180015.r +++ /dev/null @@ -1,53 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(knitr) - -# Select the 4 variables for reliability analysis -reliability_data <- df[complete.cases(df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")]), - c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) - -# Split-half reliability -split_half <- splitHalf(reliability_data) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -html_output <- paste0( - "EHI Reliability Analysis", - "

<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<p>Maximum split half reliability: ", round(split_half$maxrb, 5), "</p>", - - "<h2>Item-Level Statistics</h2>
", - kable(alpha_result$item.stats, format = "html"), - - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028180017.r b/.history/eohi2/reliability - ehi_20251028180017.r deleted file mode 100644 index 6ee7cdf..0000000 --- a/.history/eohi2/reliability - ehi_20251028180017.r +++ /dev/null @@ -1,53 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(knitr) - -# Select the 4 variables for reliability analysis -reliability_data <- df[complete.cases(df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")]), - c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) - -# Split-half reliability -split_half <- splitHalf(reliability_data) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -html_output <- paste0( - "EHI Reliability Analysis", - "

<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<p>Maximum split half reliability: ", round(split_half$maxrb, 5), "</p>", - - "<h2>Item-Level Statistics</h2>
", - kable(alpha_result$item.stats, format = "html"), - - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028180125.r b/.history/eohi2/reliability - ehi_20251028180125.r deleted file mode 100644 index d059220..0000000 --- a/.history/eohi2/reliability - ehi_20251028180125.r +++ /dev/null @@ -1,56 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(knitr) - -# Select the 4 variables for reliability analysis -reliability_data <- df[complete.cases(df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")]), - c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) - -# Split-half reliability -split_half <- splitHalf(reliability_data) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -html_output <- paste0( - "EHI Reliability Analysis", - "

<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<p>Maximum split half reliability: ", round(split_half$maxrb, 5), "</p>", - - "<h2>Item-Level Statistics</h2>", - kable(alpha_result$item.stats, format = "html"), - - "<h2>Alpha if Item Dropped</h2>
", - kable(alpha_dropped$alpha.drop, format = "html"), - - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028180127.r b/.history/eohi2/reliability - ehi_20251028180127.r deleted file mode 100644 index d059220..0000000 --- a/.history/eohi2/reliability - ehi_20251028180127.r +++ /dev/null @@ -1,56 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(knitr) - -# Select the 4 variables for reliability analysis -reliability_data <- df[complete.cases(df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")]), - c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) - -# Split-half reliability -split_half <- splitHalf(reliability_data) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -html_output <- paste0( - "EHI Reliability Analysis", - "

<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<p>Maximum split half reliability: ", round(split_half$maxrb, 5), "</p>", - - "<h2>Item-Level Statistics</h2>", - kable(alpha_result$item.stats, format = "html"), - - "<h2>Alpha if Item Dropped</h2>
", - kable(alpha_dropped$alpha.drop, format = "html"), - - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability - ehi_20251028180136.r b/.history/eohi2/reliability - ehi_20251028180136.r deleted file mode 100644 index d059220..0000000 --- a/.history/eohi2/reliability - ehi_20251028180136.r +++ /dev/null @@ -1,56 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(knitr) - -# Select the 4 variables for reliability analysis -reliability_data <- df[complete.cases(df[, c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")]), - c("ehiDGEN_5_mean", "ehiDGEN_10_mean", "ehi5_global_mean", "ehi10_global_mean")] - -# Cronbach's Alpha -alpha_result <- alpha(reliability_data) - -# Split-half reliability -split_half <- splitHalf(reliability_data) - -# Alpha if item dropped -alpha_dropped <- alpha(reliability_data, check.keys = TRUE) - -# Inter-item correlations -cor_matrix <- cor(reliability_data, use = "complete.obs") - -# Create a summary table -summary_table <- data.frame( - Variable = names(reliability_data), - n = nrow(reliability_data), - Mean = round(colMeans(reliability_data, na.rm = TRUE), 5), - SD = round(apply(reliability_data, 2, sd, na.rm = TRUE), 5), - Min = round(apply(reliability_data, 2, min, na.rm = TRUE), 5), - Max = round(apply(reliability_data, 2, max, na.rm = TRUE), 5), - Median = round(apply(reliability_data, 2, median, na.rm = TRUE), 5) -) - -html_output <- paste0( - "EHI Reliability Analysis", - "

<h1>EHI Reliability Analysis</h1>", - - "<h2>Cronbach's Alpha</h2>", - kable(alpha_result$total, format = "html"), - - "<h2>Split-Half Reliability</h2>", - "<p>Maximum split half reliability: ", round(split_half$maxrb, 5), "</p>", - - "<h2>Item-Level Statistics</h2>", - kable(alpha_result$item.stats, format = "html"), - - "<h2>Alpha if Item Dropped</h2>
", - kable(alpha_dropped$alpha.drop, format = "html"), - - - "" -) - -writeLines(html_output, "EHI reliability.html") \ No newline at end of file diff --git a/.history/eohi2/reliability analysis_20251027165337.r b/.history/eohi2/reliability analysis_20251027165337.r deleted file mode 100644 index e69de29..0000000 diff --git a/.history/eohi2/reliability analysis_20251027165338.r b/.history/eohi2/reliability analysis_20251027165338.r deleted file mode 100644 index 51562a9..0000000 --- a/.history/eohi2/reliability analysis_20251027165338.r +++ /dev/null @@ -1,7 +0,0 @@ -options(scipen = 999) - -library(dplyr) - -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -df <- read.csv("eohi2.csv") \ No newline at end of file diff --git a/.history/eohi2/reliability analysis_20251028141004.r b/.history/eohi2/reliability analysis_20251028141004.r deleted file mode 100644 index a47b227..0000000 --- a/.history/eohi2/reliability analysis_20251028141004.r +++ /dev/null @@ -1,90 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") - -library(psych) -library(dplyr) -library(knitr) - -# variable sets -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_tv", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_tv", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_tv", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_tv", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" 
= fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# reliability -reliability_results <- data.frame(Scale=character(), Alpha=double(), Omega=double(), stringsAsFactors=FALSE) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - if(!all(vars %in% names(df))) next - dat <- df[, vars] - alpha_val <- tryCatch({psych::alpha(dat)$total$raw_alpha}, error=function(e) NA) - omega_val <- tryCatch({psych::omega(dat, nfactors=1)$omega.tot}, error=function(e) NA) - reliability_results <- rbind(reliability_results, data.frame(Scale=scale_name, Alpha=alpha_val, Omega=omega_val)) -} - -print(kable(reliability_results, digits=3, caption="Internal Consistency (Alpha & Omega)")) - -# ICC test-retest -df$pref_present <- rowMeans(df[, present_pref_vars], na.rm=TRUE) -df$pref_past5 <- rowMeans(df[, past_5_pref_vars], na.rm=TRUE) -df$pref_past10 <- rowMeans(df[, past_10_pref_vars], na.rm=TRUE) -df$pref_fut5 <- rowMeans(df[, fut_5_pref_vars], na.rm=TRUE) -df$pref_fut10 <- rowMeans(df[, fut_10_pref_vars], na.rm=TRUE) -pref_mat <- df[, c("pref_present","pref_past5","pref_past10","pref_fut5","pref_fut10")] - -df$pers_present <- rowMeans(df[, present_pers_vars], na.rm=TRUE) -df$pers_past5 <- rowMeans(df[, past_5_pers_vars], na.rm=TRUE) -df$pers_past10 <- rowMeans(df[, past_10_pers_vars], na.rm=TRUE) -df$pers_fut5 <- rowMeans(df[, fut_5_pers_vars], na.rm=TRUE) -df$pers_fut10 <- rowMeans(df[, fut_10_pers_vars], na.rm=TRUE) -pers_mat <- df[, c("pers_present","pers_past5","pers_past10","pers_fut5","pers_fut10")] - -df$val_present <- rowMeans(df[, present_val_vars], na.rm=TRUE) -df$val_past5 <- rowMeans(df[, past_5_val_vars], na.rm=TRUE) -df$val_past10 <- rowMeans(df[, past_10_val_vars], na.rm=TRUE) -df$val_fut5 <- rowMeans(df[, fut_5_val_vars], na.rm=TRUE) -df$val_fut10 <- rowMeans(df[, fut_10_val_vars], na.rm=TRUE) -val_mat <- df[, c("val_present","val_past5","val_past10","val_fut5","val_fut10")] - -icc_pref <- psych::ICC(pref_mat) -icc_pers <- psych::ICC(pers_mat) -icc_val <- psych::ICC(val_mat) - -print(kable(icc_pref$results["Single_raters_absolute", , drop=FALSE], caption="Test–Retest ICC: Preferences")) -print(kable(icc_pers$results["Single_raters_absolute", , drop=FALSE], caption="Test–Retest ICC: Personality")) -print(kable(icc_val$results["Single_raters_absolute", , drop=FALSE], caption="Test–Retest ICC: Values")) diff --git a/.history/eohi2/reliability analysis_20251028141504.r b/.history/eohi2/reliability analysis_20251028141504.r deleted file mode 100644 index 3b7fa32..0000000 --- a/.history/eohi2/reliability analysis_20251028141504.r +++ /dev/null @@ -1,130 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") - -library(psych) -library(dplyr) -library(knitr) - -# CORRECTED variable sets with exact names from your data -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- 
c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Verify all variables exist in dataset -cat("Checking variable availability:\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - missing_vars <- vars[!vars %in% names(df)] - if(length(missing_vars) > 0) { - cat("MISSING in", scale_name, ":", paste(missing_vars, collapse=", "), "\n") - } else { - cat("✓ All variables found for", scale_name, "\n") - } -} - -# Reliability analysis -reliability_results <- data.frame(Scale=character(), Alpha=double(), Omega=double(), stringsAsFactors=FALSE) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - - # Check if all variables exist - missing_vars <- vars[!vars %in% names(df)] - if(length(missing_vars) > 0) { - cat("Skipping", scale_name, "- missing variables:", paste(missing_vars, collapse=", "), "\n") - next - } - - dat <- df[, vars] - - # Calculate Alpha - alpha_val <- tryCatch({ - alpha_result <- psych::alpha(dat, check.keys = TRUE) - alpha_result$total$raw_alpha - }, error = function(e) { - cat("Alpha calculation failed for", scale_name, ":", e$message, "\n") - NA - }) - - # Calculate Omega with proper settings - omega_val <- tryCatch({ - omega_result <- psych::omega(dat, nfactors = 1, check.keys = TRUE) - omega_result$omega.tot - }, error = function(e) { - cat("Omega calculation failed for", scale_name, ":", e$message, "\n") - NA - }) - - reliability_results <- rbind(reliability_results, - data.frame(Scale = 
scale_name, - Alpha = alpha_val, - Omega = omega_val)) -} - -print(kable(reliability_results, digits = 3, caption = "Internal Consistency (Alpha & Omega)")) - -# Test-retest reliability -df$pref_present <- rowMeans(df[, present_pref_vars], na.rm = TRUE) -df$pref_past5 <- rowMeans(df[, past_5_pref_vars], na.rm = TRUE) -df$pref_past10 <- rowMeans(df[, past_10_pref_vars], na.rm = TRUE) -df$pref_fut5 <- rowMeans(df[, fut_5_pref_vars], na.rm = TRUE) -df$pref_fut10 <- rowMeans(df[, fut_10_pref_vars], na.rm = TRUE) -pref_mat <- df[, c("pref_present", "pref_past5", "pref_past10", "pref_fut5", "pref_fut10")] - -df$pers_present <- rowMeans(df[, present_pers_vars], na.rm = TRUE) -df$pers_past5 <- rowMeans(df[, past_5_pers_vars], na.rm = TRUE) -df$pers_past10 <- rowMeans(df[, past_10_pers_vars], na.rm = TRUE) -df$pers_fut5 <- rowMeans(df[, fut_5_pers_vars], na.rm = TRUE) -df$pers_fut10 <- rowMeans(df[, fut_10_pers_vars], na.rm = TRUE) -pers_mat <- df[, c("pers_present", "pers_past5", "pers_past10", "pers_fut5", "pers_fut10")] - -df$val_present <- rowMeans(df[, present_val_vars], na.rm = TRUE) -df$val_past5 <- rowMeans(df[, past_5_val_vars], na.rm = TRUE) -df$val_past10 <- rowMeans(df[, past_10_val_vars], na.rm = TRUE) -df$val_fut5 <- rowMeans(df[, fut_5_val_vars], na.rm = TRUE) -df$val_fut10 <- rowMeans(df[, fut_10_val_vars], na.rm = TRUE) -val_mat <- df[, c("val_present", "val_past5", "val_past10", "val_fut5", "val_fut10")] - -# Calculate ICC -icc_pref <- psych::ICC(pref_mat) -icc_pers <- psych::ICC(pers_mat) -icc_val <- psych::ICC(val_mat) - -print(kable(icc_pref$results["Single_raters_absolute", , drop = FALSE], caption = "Test–Retest ICC: Preferences")) -print(kable(icc_pers$results["Single_raters_absolute", , drop = FALSE], caption = "Test–Retest ICC: Personality")) -print(kable(icc_val$results["Single_raters_absolute", , drop = FALSE], caption = "Test–Retest ICC: Values")) \ No newline at end of file diff --git a/.history/eohi2/reliability analysis_20251028144222.r b/.history/eohi2/reliability analysis_20251028144222.r deleted file mode 100644 index cbbe5d9..0000000 --- a/.history/eohi2/reliability analysis_20251028144222.r +++ /dev/null @@ -1,87 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", 
"fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_tbl <- data.frame( - Scale = character(), - RawAlpha = numeric(), - StdAlpha = numeric(), - AvgItemRestCor = numeric() -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics (can be suppressed) - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary stats for table - raw_alpha <- round(alpha_out$total$raw_alpha, 3) - std_alpha <- round(alpha_out$total$std.alpha, 3) - avg_ir <- round(mean(alpha_out$item.stats$r.drop, na.rm=TRUE), 3) - summary_tbl <- rbind(summary_tbl, data.frame( - Scale = scale_name, - RawAlpha = raw_alpha, - StdAlpha = std_alpha, - AvgItemRestCor = avg_ir - )) - } else { - cat("\n", scale_name, ": Not enough items for alpha analysis.\n") - } -} - -# Summary table for reporting -knitr::kable(summary_tbl) diff --git a/.history/eohi2/reliability analysis_20251028151252.r b/.history/eohi2/reliability analysis_20251028151252.r deleted file mode 100644 index 32ec68b..0000000 --- a/.history/eohi2/reliability analysis_20251028151252.r +++ /dev/null @@ -1,121 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", 
"past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_tbl <- data.frame( - Scale = character(), - RawAlpha = numeric(), - StdAlpha = numeric(), - AvgItemRestCor = numeric() -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics (can be suppressed) - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary stats for table - raw_alpha <- round(alpha_out$total$raw_alpha, 3) - std_alpha <- round(alpha_out$total$std.alpha, 3) - avg_ir <- round(mean(alpha_out$item.stats$r.drop, na.rm=TRUE), 3) - summary_tbl <- rbind(summary_tbl, data.frame( - Scale = scale_name, - RawAlpha = raw_alpha, - StdAlpha = std_alpha, - AvgItemRestCor = avg_ir - )) - } else { - cat("\n", scale_name, ": Not enough items for alpha analysis.\n") - } -} - -# Create detailed alpha if dropped table -alpha_dropped_tbl <- data.frame( - Scale = character(), - Item = character(), - 
AlphaIfDropped = numeric(), - stringsAsFactors = FALSE -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Extract alpha if dropped for each item - for(i in 1:length(vars)) { - item_name <- vars[i] - alpha_dropped <- alpha_out$alpha.drop$raw_alpha[i] - - alpha_dropped_tbl <- rbind(alpha_dropped_tbl, data.frame( - Scale = scale_name, - Item = item_name, - AlphaIfDropped = round(alpha_dropped, 4), - stringsAsFactors = FALSE - )) - } - } -} - -# Summary table for reporting -cat("\n=== SUMMARY TABLE ===\n") -knitr::kable(summary_tbl) - -cat("\n=== ALPHA IF DROPPED TABLE ===\n") -knitr::kable(alpha_dropped_tbl) diff --git a/.history/eohi2/reliability analysis_20251028151259.r b/.history/eohi2/reliability analysis_20251028151259.r deleted file mode 100644 index 3191324..0000000 --- a/.history/eohi2/reliability analysis_20251028151259.r +++ /dev/null @@ -1,129 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - 
"Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_tbl <- data.frame( - Scale = character(), - RawAlpha = numeric(), - StdAlpha = numeric(), - AvgItemRestCor = numeric() -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics (can be suppressed) - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary stats for table - raw_alpha <- round(alpha_out$total$raw_alpha, 3) - std_alpha <- round(alpha_out$total$std.alpha, 3) - avg_ir <- round(mean(alpha_out$item.stats$r.drop, na.rm=TRUE), 3) - summary_tbl <- rbind(summary_tbl, data.frame( - Scale = scale_name, - RawAlpha = raw_alpha, - StdAlpha = std_alpha, - AvgItemRestCor = avg_ir - )) - } else { - cat("\n", scale_name, ": Not enough items for alpha analysis.\n") - } -} - -# Create detailed alpha if dropped table -alpha_dropped_tbl <- data.frame( - Scale = character(), - Item = character(), - AlphaIfDropped = numeric(), - stringsAsFactors = FALSE -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Extract alpha if dropped for each item - for(i in 1:length(vars)) { - item_name <- vars[i] - alpha_dropped <- alpha_out$alpha.drop$raw_alpha[i] - - alpha_dropped_tbl <- rbind(alpha_dropped_tbl, data.frame( - Scale = scale_name, - Item = item_name, - AlphaIfDropped = round(alpha_dropped, 4), - stringsAsFactors = FALSE - )) - } - } -} - -# Summary table for reporting -cat("\n=== SUMMARY TABLE ===\n") -knitr::kable(summary_tbl) - -cat("\n=== ALPHA IF DROPPED TABLE ===\n") -knitr::kable(alpha_dropped_tbl) - -# Export tables to CSV -write.csv(summary_tbl, "reliability_summary_table.csv", row.names = FALSE) -write.csv(alpha_dropped_tbl, "alpha_if_dropped_table.csv", row.names = FALSE) - -cat("\nTables exported to CSV files:\n") -cat("- reliability_summary_table.csv\n") -cat("- alpha_if_dropped_table.csv\n") diff --git a/.history/eohi2/reliability analysis_20251028151305.r b/.history/eohi2/reliability analysis_20251028151305.r deleted file mode 100644 index 3191324..0000000 --- a/.history/eohi2/reliability analysis_20251028151305.r +++ /dev/null @@ -1,129 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") 
-fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_tbl <- data.frame( - Scale = character(), - RawAlpha = numeric(), - StdAlpha = numeric(), - AvgItemRestCor = numeric() -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics (can be suppressed) - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary stats for table - raw_alpha <- round(alpha_out$total$raw_alpha, 3) - std_alpha <- round(alpha_out$total$std.alpha, 3) - avg_ir <- round(mean(alpha_out$item.stats$r.drop, na.rm=TRUE), 3) - summary_tbl <- rbind(summary_tbl, data.frame( - Scale = scale_name, - RawAlpha = raw_alpha, - StdAlpha = std_alpha, - AvgItemRestCor = avg_ir - )) - } else { - cat("\n", scale_name, ": Not enough items for alpha analysis.\n") - } -} - -# Create detailed alpha if dropped table -alpha_dropped_tbl <- data.frame( - Scale = character(), - Item = character(), - AlphaIfDropped = numeric(), - 
stringsAsFactors = FALSE -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Extract alpha if dropped for each item - for(i in 1:length(vars)) { - item_name <- vars[i] - alpha_dropped <- alpha_out$alpha.drop$raw_alpha[i] - - alpha_dropped_tbl <- rbind(alpha_dropped_tbl, data.frame( - Scale = scale_name, - Item = item_name, - AlphaIfDropped = round(alpha_dropped, 4), - stringsAsFactors = FALSE - )) - } - } -} - -# Summary table for reporting -cat("\n=== SUMMARY TABLE ===\n") -knitr::kable(summary_tbl) - -cat("\n=== ALPHA IF DROPPED TABLE ===\n") -knitr::kable(alpha_dropped_tbl) - -# Export tables to CSV -write.csv(summary_tbl, "reliability_summary_table.csv", row.names = FALSE) -write.csv(alpha_dropped_tbl, "alpha_if_dropped_table.csv", row.names = FALSE) - -cat("\nTables exported to CSV files:\n") -cat("- reliability_summary_table.csv\n") -cat("- alpha_if_dropped_table.csv\n") diff --git a/.history/eohi2/reliability analysis_20251028151323.r b/.history/eohi2/reliability analysis_20251028151323.r deleted file mode 100644 index 63088e9..0000000 --- a/.history/eohi2/reliability analysis_20251028151323.r +++ /dev/null @@ -1,129 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- 
list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_tbl <- data.frame( - Scale = character(), - RawAlpha = numeric(), - StdAlpha = numeric(), - AvgItemRestCor = numeric() -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary stats for table - raw_alpha <- round(alpha_out$total$raw_alpha, 3) - std_alpha <- round(alpha_out$total$std.alpha, 3) - avg_ir <- round(mean(alpha_out$item.stats$r.drop, na.rm=TRUE), 3) - summary_tbl <- rbind(summary_tbl, data.frame( - Scale = scale_name, - RawAlpha = raw_alpha, - StdAlpha = std_alpha, - AvgItemRestCor = avg_ir - )) - } else { - cat("\n", scale_name, ": Not enough items for alpha analysis.\n") - } -} - -# Create detailed alpha if dropped table -alpha_dropped_tbl <- data.frame( - Scale = character(), - Item = character(), - AlphaIfDropped = numeric(), - stringsAsFactors = FALSE -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Extract alpha if dropped for each item - for(i in 1:length(vars)) { - item_name <- vars[i] - alpha_dropped <- alpha_out$alpha.drop$raw_alpha[i] - - alpha_dropped_tbl <- rbind(alpha_dropped_tbl, data.frame( - Scale = scale_name, - Item = item_name, - AlphaIfDropped = round(alpha_dropped, 4), - stringsAsFactors = FALSE - )) - } - } -} - -# Summary table for reporting -cat("\n=== SUMMARY TABLE ===\n") -knitr::kable(summary_tbl) - -cat("\n=== ALPHA IF DROPPED TABLE ===\n") -knitr::kable(alpha_dropped_tbl) - -# Export tables to CSV -write.csv(summary_tbl, "reliability_summary_table.csv", row.names = FALSE) -write.csv(alpha_dropped_tbl, "alpha_if_dropped_table.csv", row.names = FALSE) - -cat("\nTables exported to CSV files:\n") -cat("- reliability_summary_table.csv\n") -cat("- alpha_if_dropped_table.csv\n") diff --git a/.history/eohi2/reliability analysis_20251028151326.r b/.history/eohi2/reliability analysis_20251028151326.r deleted file mode 100644 index 8eb5f45..0000000 --- a/.history/eohi2/reliability analysis_20251028151326.r +++ /dev/null @@ -1,127 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") 
-past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel")
-past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel")
-fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel")
-fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel")
-
-present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex")
-past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex")
-past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex")
-fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex")
-fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex")
-
-present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice")
-past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice")
-past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice")
-fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice")
-fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice")
-
-all_scales <- list(
-  "Present_Preferences" = present_pref_vars,
-  "Past5_Preferences" = past_5_pref_vars,
-  "Past10_Preferences" = past_10_pref_vars,
-  "Fut5_Preferences" = fut_5_pref_vars,
-  "Fut10_Preferences" = fut_10_pref_vars,
-  "Present_Personality" = present_pers_vars,
-  "Past5_Personality" = past_5_pers_vars,
-  "Past10_Personality" = past_10_pers_vars,
-  "Fut5_Personality" = fut_5_pers_vars,
-  "Fut10_Personality" = fut_10_pers_vars,
-  "Present_Values" = present_val_vars,
-  "Past5_Values" = past_5_val_vars,
-  "Past10_Values" = past_10_val_vars,
-  "Fut5_Values" = fut_5_val_vars,
-  "Fut10_Values" = fut_10_val_vars
-)
-
-# Reliability analysis loop
-alpha_results <- list()
-summary_tbl <- data.frame(
-  Scale = character(),
-  RawAlpha = numeric(),
-  StdAlpha = numeric(),
-  AvgItemRestCor = numeric()
-)
-
-for(scale_name in names(all_scales)) {
-  vars <- all_scales[[scale_name]]
-  scale_data <- df %>% select(all_of(vars))
-
-  # Only run if there is more than one column present
-  if(ncol(scale_data) > 1) {
-    alpha_out <- psych::alpha(scale_data, check.keys = TRUE)
-    alpha_results[[scale_name]] <- alpha_out
-
-    # Print full output for diagnostics
-    cat("\n----------", scale_name, "----------\n")
-    print(alpha_out$total)
-    print(alpha_out$item.stats)
-    print(alpha_out$alpha.drop)
-
-    # Collect summary stats for table
-    raw_alpha <- round(alpha_out$total$raw_alpha, 3)
-    std_alpha <- round(alpha_out$total$std.alpha, 3)
-    avg_ir <- round(mean(alpha_out$item.stats$r.drop, na.rm=TRUE), 3)
-    summary_tbl <- rbind(summary_tbl, data.frame(
-      Scale = scale_name,
-      RawAlpha = raw_alpha,
-      StdAlpha = std_alpha,
-      AvgItemRestCor = avg_ir
-    ))
-  }
-}
-
-# Create detailed alpha if dropped table
-alpha_dropped_tbl <- data.frame(
-  Scale = character(),
-  Item = character(),
-  AlphaIfDropped = numeric(),
-  stringsAsFactors = FALSE
-)
-
-for(scale_name in names(all_scales)) {
-  vars <- all_scales[[scale_name]]
-  scale_data <- df %>% select(all_of(vars))
-
-  if(ncol(scale_data) > 1) {
-    alpha_out <- alpha_results[[scale_name]]
-
-    # Extract alpha if dropped for each item
-    for(i in 1:length(vars)) {
-      item_name <- vars[i]
-      alpha_dropped <- alpha_out$alpha.drop$raw_alpha[i]
-
-      alpha_dropped_tbl <- rbind(alpha_dropped_tbl, data.frame(
-        Scale = scale_name,
-        Item = item_name,
-        AlphaIfDropped = round(alpha_dropped, 4),
-        stringsAsFactors = FALSE
-      ))
-    }
-  }
-}
-
-# Summary table for reporting
-cat("\n=== SUMMARY TABLE ===\n")
-knitr::kable(summary_tbl)
-
-cat("\n=== ALPHA IF DROPPED TABLE ===\n")
-knitr::kable(alpha_dropped_tbl)
-
-# Export tables to CSV
-write.csv(summary_tbl, "reliability_summary_table.csv", row.names = FALSE)
-write.csv(alpha_dropped_tbl, "alpha_if_dropped_table.csv", row.names = FALSE)
-
-cat("\nTables exported to CSV files:\n")
-cat("- reliability_summary_table.csv\n")
-cat("- alpha_if_dropped_table.csv\n")
diff --git a/.history/eohi2/reliability analysis_20251028151330.r b/.history/eohi2/reliability analysis_20251028151330.r
deleted file mode 100644
index 3596b78..0000000
--- a/.history/eohi2/reliability analysis_20251028151330.r
+++ /dev/null
@@ -1,121 +0,0 @@
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-options(scipen = 999)
-
-df <- read.csv("eohi2.csv")
-library(psych)
-library(dplyr)
-library(knitr)
-
-# Your named variable sets (replace df with your actual dataframe name)
-present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel")
-past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel")
-past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel")
-fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel")
-fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel")
-
-present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex")
-past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex")
-past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex")
-fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex")
-fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex")
-
-present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice")
-past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice")
-past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice")
-fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice")
-fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice")
-
-all_scales <- list(
-  "Present_Preferences" = present_pref_vars,
-  "Past5_Preferences" = past_5_pref_vars,
-  "Past10_Preferences" = past_10_pref_vars,
-  "Fut5_Preferences" = fut_5_pref_vars,
-  "Fut10_Preferences" = fut_10_pref_vars,
-  "Present_Personality" = present_pers_vars,
-  "Past5_Personality" = past_5_pers_vars,
-  "Past10_Personality" = past_10_pers_vars,
-  "Fut5_Personality" = fut_5_pers_vars,
-  "Fut10_Personality" = fut_10_pers_vars,
-  "Present_Values" = present_val_vars,
-  "Past5_Values" = past_5_val_vars,
-  "Past10_Values" = past_10_val_vars,
-  "Fut5_Values" = fut_5_val_vars,
-  "Fut10_Values" = fut_10_val_vars
-)
-
-# Reliability analysis loop
-alpha_results <- list()
-summary_tbl <- data.frame(
-  Scale = character(),
-  RawAlpha = numeric(),
-  StdAlpha = numeric(),
-  AvgItemRestCor = numeric()
-)
-
-for(scale_name in names(all_scales)) {
-  vars <- all_scales[[scale_name]]
-  scale_data <- df %>% select(all_of(vars))
-
-  # Only run if there is more than one column present
-  if(ncol(scale_data) > 1) {
-    alpha_out <- psych::alpha(scale_data, check.keys = TRUE)
-    alpha_results[[scale_name]] <- alpha_out
-
-    # Print full output for diagnostics
-    cat("\n----------", scale_name, "----------\n")
-    print(alpha_out$total)
-    print(alpha_out$item.stats)
-    print(alpha_out$alpha.drop)
-
-    # Collect summary stats for table
-    raw_alpha <- round(alpha_out$total$raw_alpha, 3)
-    std_alpha <- round(alpha_out$total$std.alpha, 3)
-    avg_ir <- round(mean(alpha_out$item.stats$r.drop, na.rm=TRUE), 3)
-    summary_tbl <- rbind(summary_tbl, data.frame(
-      Scale = scale_name,
-      RawAlpha = raw_alpha,
-      StdAlpha = std_alpha,
-      AvgItemRestCor = avg_ir
-    ))
-  }
-}
-
-# Create detailed alpha if dropped table
-alpha_dropped_tbl <- data.frame(
-  Scale = character(),
-  Item = character(),
-  AlphaIfDropped = numeric(),
-  stringsAsFactors = FALSE
-)
-
-for(scale_name in names(all_scales)) {
-  vars <- all_scales[[scale_name]]
-  scale_data <- df %>% select(all_of(vars))
-
-  if(ncol(scale_data) > 1) {
-    alpha_out <- alpha_results[[scale_name]]
-
-    # Extract alpha if dropped for each item
-    for(i in 1:length(vars)) {
-      item_name <- vars[i]
-      alpha_dropped <- alpha_out$alpha.drop$raw_alpha[i]
-
-      alpha_dropped_tbl <- rbind(alpha_dropped_tbl, data.frame(
-        Scale = scale_name,
-        Item = item_name,
-        AlphaIfDropped = round(alpha_dropped, 4),
-        stringsAsFactors = FALSE
-      ))
-    }
-  }
-}
-
-# Summary table for reporting
-knitr::kable(summary_tbl)
-
-knitr::kable(alpha_dropped_tbl)
-
-# Export tables to CSV
-write.csv(summary_tbl, "reliability_summary_table.csv", row.names = FALSE)
-write.csv(alpha_dropped_tbl, "alpha_if_dropped_table.csv", row.names = FALSE)
diff --git a/.history/eohi2/reliability analysis_20251028151358.r b/.history/eohi2/reliability analysis_20251028151358.r
deleted file mode 100644
index 4ec8bc2..0000000
--- a/.history/eohi2/reliability analysis_20251028151358.r
+++ /dev/null
@@ -1,121 +0,0 @@
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-options(scipen = 999)
-
-df <- read.csv("eohi2.csv")
-library(psych)
-library(dplyr)
-library(knitr)
-
-# Your named variable sets (replace df with your actual dataframe name)
-present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel")
-past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel")
-past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel")
-fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel")
-fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel")
-
-present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex")
-past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex")
-past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex")
-fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex")
-fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex")
-
-present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice")
-past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice")
-past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice")
-fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice")
-fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice")
-
-all_scales <- list(
-  "Present_Preferences" = present_pref_vars,
-  "Past5_Preferences" = past_5_pref_vars,
-  "Past10_Preferences" = past_10_pref_vars,
-  "Fut5_Preferences" = fut_5_pref_vars,
-  "Fut10_Preferences" = fut_10_pref_vars,
-  "Present_Personality" = present_pers_vars,
-  "Past5_Personality" = past_5_pers_vars,
-  "Past10_Personality" = past_10_pers_vars,
-  "Fut5_Personality" = fut_5_pers_vars,
-  "Fut10_Personality" = fut_10_pers_vars,
-  "Present_Values" = present_val_vars,
-  "Past5_Values" = past_5_val_vars,
-  "Past10_Values" = past_10_val_vars,
-  "Fut5_Values" = fut_5_val_vars,
-  "Fut10_Values" = fut_10_val_vars
-)
-
-# Reliability analysis loop
-alpha_results <- list()
-summary_tbl <- data.frame(
-  Scale = character(),
-  RawAlpha = numeric(),
-  StdAlpha = numeric(),
-  AvgItemRestCor = numeric()
-)
-
-for(scale_name in names(all_scales)) {
-  vars <- all_scales[[scale_name]]
-  scale_data <- df %>% select(all_of(vars))
-
-  # Only run if there is more than one column present
-  if(ncol(scale_data) > 1) {
-    alpha_out <- psych::alpha(scale_data, check.keys = TRUE)
-    alpha_results[[scale_name]] <- alpha_out
-
-    # Print full output for diagnostics
-    cat("\n----------", scale_name, "----------\n")
-    print(alpha_out$total)
-    print(alpha_out$item.stats)
-    print(alpha_out$alpha.drop)
-
-    # Collect summary stats for table
-    raw_alpha <- round(alpha_out$total$raw_alpha, 3)
-    std_alpha <- round(alpha_out$total$std.alpha, 3)
-    avg_ir <- round(mean(alpha_out$item.stats$r.drop, na.rm=TRUE), 3)
-    summary_tbl <- rbind(summary_tbl, data.frame(
-      Scale = scale_name,
-      RawAlpha = raw_alpha,
-      StdAlpha = std_alpha,
-      AvgItemRestCor = avg_ir
-    ))
-  }
-}
-
-# Create detailed alpha if dropped table
-alpha_dropped_tbl <- data.frame(
-  Scale = character(),
-  Item = character(),
-  AlphaIfDropped = numeric(),
-  stringsAsFactors = FALSE
-)
-
-for(scale_name in names(all_scales)) {
-  vars <- all_scales[[scale_name]]
-  scale_data <- df %>% select(all_of(vars))
-
-  if(ncol(scale_data) > 1) {
-    alpha_out <- alpha_results[[scale_name]]
-
-    # Extract alpha if dropped for each item
-    for(i in 1:length(vars)) {
-      item_name <- vars[i]
-      alpha_dropped <- alpha_out$alpha.drop$raw_alpha[i]
-
-      alpha_dropped_tbl <- rbind(alpha_dropped_tbl, data.frame(
-        Scale = scale_name,
-        Item = item_name,
-        AlphaIfDropped = round(alpha_dropped, 4),
-        stringsAsFactors = FALSE
-      ))
-    }
-  }
-}
-
-# Summary table for reporting
-knitr::kable(summary_tbl)
-
-knitr::kable(alpha_dropped_tbl)
-
-# Export tables to CSV
-#write.csv(summary_tbl, "reliability_summary_table.csv", row.names = FALSE)
-#write.csv(alpha_dropped_tbl, "alpha_if_dropped_table.csv", row.names = FALSE)
diff --git a/.history/eohi2/reliability analysis_20251028151540.r b/.history/eohi2/reliability analysis_20251028151540.r
deleted file mode 100644
index e052909..0000000
--- a/.history/eohi2/reliability analysis_20251028151540.r
+++ /dev/null
@@ -1,123 +0,0 @@
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-options(scipen = 999)
-
-df <- read.csv("eohi2.csv")
-library(psych)
-library(dplyr)
-library(knitr)
-
-# Your named variable sets (replace df with your actual dataframe name)
-present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel")
-past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel")
-past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel")
-fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel")
-fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel")
-
-present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex")
-past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex")
-past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex")
-fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex")
-fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex")
-
-present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice")
-past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice")
-past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice")
-fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice")
-fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice")
-
-all_scales <- list(
-  "Present_Preferences" = present_pref_vars,
-  "Past5_Preferences" = past_5_pref_vars,
-  "Past10_Preferences" = past_10_pref_vars,
-  "Fut5_Preferences" = fut_5_pref_vars,
-  "Fut10_Preferences" = fut_10_pref_vars,
-  "Present_Personality" = present_pers_vars,
-  "Past5_Personality" = past_5_pers_vars,
-  "Past10_Personality" = past_10_pers_vars,
-  "Fut5_Personality" = fut_5_pers_vars,
-  "Fut10_Personality" = fut_10_pers_vars,
-  "Present_Values" = present_val_vars,
-  "Past5_Values" = past_5_val_vars,
-  "Past10_Values" = past_10_val_vars,
-  "Fut5_Values" = fut_5_val_vars,
-  "Fut10_Values" = fut_10_val_vars
-)
-
-# Reliability analysis loop
-alpha_results <- list()
-summary_tbl <- data.frame(
-  Scale = character(),
-  RawAlpha = numeric(),
-  StdAlpha = numeric(),
-  AvgItemRestCor = numeric()
-)
-
-for(scale_name in names(all_scales)) {
-  vars <- all_scales[[scale_name]]
-  scale_data <- df %>% select(all_of(vars))
-
-  # Only run if there is more than one column present
-  if(ncol(scale_data) > 1) {
-    alpha_out <- psych::alpha(scale_data, check.keys = TRUE)
-    alpha_results[[scale_name]] <- alpha_out
-
-    # Print full output for diagnostics
-    cat("\n----------", scale_name, "----------\n")
-    print(alpha_out$total)
-    print(alpha_out$item.stats)
-    print(alpha_out$alpha.drop)
-
-    # Collect summary stats for table
-    raw_alpha <- round(alpha_out$total$raw_alpha, 3)
-    std_alpha <- round(alpha_out$total$std.alpha, 3)
-    avg_ir <- round(mean(alpha_out$item.stats$r.drop, na.rm=TRUE), 3)
-    summary_tbl <- rbind(summary_tbl, data.frame(
-      Scale = scale_name,
-      RawAlpha = raw_alpha,
-      StdAlpha = std_alpha,
-      AvgItemRestCor = avg_ir
-    ))
-  }
-}
-
-# Create detailed alpha if dropped table
-alpha_dropped_tbl <- data.frame(
-  Scale = character(),
-  Item = character(),
-  AlphaIfDropped = numeric(),
-  stringsAsFactors = FALSE
-)
-
-for(scale_name in names(all_scales)) {
-  vars <- all_scales[[scale_name]]
-  scale_data <- df %>% select(all_of(vars))
-
-  if(ncol(scale_data) > 1) {
-    alpha_out <- alpha_results[[scale_name]]
-
-    # Extract alpha if dropped for each item
-    for(i in 1:length(vars)) {
-      item_name <- vars[i]
-      alpha_dropped <- alpha_out$alpha.drop$raw_alpha[i]
-
-      alpha_dropped_tbl <- rbind(alpha_dropped_tbl, data.frame(
-        Scale = scale_name,
-        Item = item_name,
-        AlphaIfDropped = round(alpha_dropped, 4),
-        stringsAsFactors = FALSE
-      ))
-    }
-  }
-}
-
-# Summary table for reporting
-cat("\n=== SUMMARY TABLE ===\n")
-print(summary_tbl)
-
-cat("\n=== ALPHA IF DROPPED TABLE ===\n")
-print(alpha_dropped_tbl)
-
-# Export tables to CSV
-#write.csv(summary_tbl, "reliability_summary_table.csv", row.names = FALSE)
-#write.csv(alpha_dropped_tbl, "alpha_if_dropped_table.csv", row.names = FALSE)
"past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_tbl <- data.frame( - Scale = character(), - RawAlpha = numeric(), - StdAlpha = numeric(), - AvgItemRestCor = numeric() -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary stats for table - raw_alpha <- round(alpha_out$total$raw_alpha, 3) - std_alpha <- round(alpha_out$total$std.alpha, 3) - avg_ir <- round(mean(alpha_out$item.stats$r.drop, na.rm=TRUE), 3) - summary_tbl <- rbind(summary_tbl, data.frame( - Scale = scale_name, - RawAlpha = raw_alpha, - StdAlpha = std_alpha, - AvgItemRestCor = avg_ir - )) - } -} - -# Create detailed alpha if dropped table -alpha_dropped_tbl <- data.frame( - Scale = character(), - Item = character(), - AlphaIfDropped = numeric(), - stringsAsFactors = FALSE -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Extract alpha if dropped for each item - for(i in 1:length(vars)) { - item_name <- vars[i] - alpha_dropped <- alpha_out$alpha.drop$raw_alpha[i] - - alpha_dropped_tbl <- rbind(alpha_dropped_tbl, data.frame( - Scale = scale_name, - Item = item_name, - AlphaIfDropped = round(alpha_dropped, 4), - stringsAsFactors = FALSE - )) - } - } -} - -# 
diff --git a/.history/eohi2/reliability analysis_20251028151808.r b/.history/eohi2/reliability analysis_20251028151808.r
deleted file mode 100644
index a003e09..0000000
--- a/.history/eohi2/reliability analysis_20251028151808.r
+++ /dev/null
@@ -1,121 +0,0 @@
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-options(scipen = 999)
-
-df <- read.csv("eohi2.csv")
-library(psych)
-library(dplyr)
-library(knitr)
-
-# Your named variable sets (replace df with your actual dataframe name)
-present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel")
-past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel")
-past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel")
-fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel")
-fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel")
-
-present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex")
-past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex")
-past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex")
-fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex")
-fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex")
-
-present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice")
-past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice")
-past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice")
-fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice")
-fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice")
-
-all_scales <- list(
-  "Present_Preferences" = present_pref_vars,
-  "Past5_Preferences" = past_5_pref_vars,
-  "Past10_Preferences" = past_10_pref_vars,
-  "Fut5_Preferences" = fut_5_pref_vars,
-  "Fut10_Preferences" = fut_10_pref_vars,
-  "Present_Personality" = present_pers_vars,
-  "Past5_Personality" = past_5_pers_vars,
-  "Past10_Personality" = past_10_pers_vars,
-  "Fut5_Personality" = fut_5_pers_vars,
-  "Fut10_Personality" = fut_10_pers_vars,
-  "Present_Values" = present_val_vars,
-  "Past5_Values" = past_5_val_vars,
-  "Past10_Values" = past_10_val_vars,
-  "Fut5_Values" = fut_5_val_vars,
-  "Fut10_Values" = fut_10_val_vars
-)
-
-# Reliability analysis loop
-alpha_results <- list()
-summary_tbl <- data.frame(
-  Scale = character(),
-  RawAlpha = numeric(),
-  StdAlpha = numeric(),
-  AvgItemRestCor = numeric()
-)
-
-for(scale_name in names(all_scales)) {
-  vars <- all_scales[[scale_name]]
-  scale_data <- df %>% select(all_of(vars))
-
-  # Only run if there is more than one column present
-  if(ncol(scale_data) > 1) {
-    alpha_out <- psych::alpha(scale_data, check.keys = TRUE)
-    alpha_results[[scale_name]] <- alpha_out
-
-    # Print full output for diagnostics
-    cat("\n----------", scale_name, "----------\n")
-    print(alpha_out$total)
-    print(alpha_out$item.stats)
-    print(alpha_out$alpha.drop)
-
-    # Collect summary stats for table
-    raw_alpha <- round(alpha_out$total$raw_alpha, 3)
-    std_alpha <- round(alpha_out$total$std.alpha, 3)
-    avg_ir <- round(mean(alpha_out$item.stats$r.drop, na.rm=TRUE), 3)
-    summary_tbl <- rbind(summary_tbl, data.frame(
-      Scale = scale_name,
-      RawAlpha = raw_alpha,
-      StdAlpha = std_alpha,
-      AvgItemRestCor = avg_ir
-    ))
-  }
-}
-
-# Create detailed alpha if dropped table
-alpha_dropped_tbl <- data.frame(
-  Scale = character(),
-  Item = character(),
-  AlphaIfDropped = numeric(),
-  stringsAsFactors = FALSE
-)
-
-for(scale_name in names(all_scales)) {
-  vars <- all_scales[[scale_name]]
-  scale_data <- df %>% select(all_of(vars))
-
-  if(ncol(scale_data) > 1) {
-    alpha_out <- alpha_results[[scale_name]]
-
-    # Extract alpha if dropped for each item
-    for(i in 1:length(vars)) {
-      item_name <- vars[i]
-      alpha_dropped <- alpha_out$alpha.drop$raw_alpha[i]
-
-      alpha_dropped_tbl <- rbind(alpha_dropped_tbl, data.frame(
-        Scale = scale_name,
-        Item = item_name,
-        AlphaIfDropped = round(alpha_dropped, 4),
-        stringsAsFactors = FALSE
-      ))
-    }
-  }
-}
-
-# Summary table for reporting
-print(summary_tbl)
-
-print(alpha_dropped_tbl)
-
-# Export tables to CSV
-#write.csv(summary_tbl, "reliability_summary_table.csv", row.names = FALSE)
-#write.csv(alpha_dropped_tbl, "alpha_if_dropped_table.csv", row.names = FALSE)
"fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_tbl <- data.frame( - Scale = character(), - RawAlpha = numeric(), - StdAlpha = numeric(), - AvgItemRestCor = numeric() -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary stats for table - raw_alpha <- round(alpha_out$total$raw_alpha, 3) - std_alpha <- round(alpha_out$total$std.alpha, 3) - avg_ir <- round(mean(alpha_out$item.stats$r.drop, na.rm=TRUE), 3) - summary_tbl <- rbind(summary_tbl, data.frame( - Scale = scale_name, - RawAlpha = raw_alpha, - StdAlpha = std_alpha, - AvgItemRestCor = avg_ir - )) - } -} - -# Create detailed alpha if dropped table -alpha_dropped_tbl <- data.frame( - Scale = character(), - Item = character(), - AlphaIfDropped = numeric(), - stringsAsFactors = FALSE -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Extract alpha if dropped for each item - for(i in 1:length(vars)) { - item_name <- vars[i] - alpha_dropped <- alpha_out$alpha.drop$raw_alpha[i] - - alpha_dropped_tbl <- rbind(alpha_dropped_tbl, data.frame( - Scale = scale_name, - Item = item_name, - AlphaIfDropped = round(alpha_dropped, 4), - stringsAsFactors = FALSE - )) - } - } -} - -# Summary table for reporting -print(summary_tbl) - -print(alpha_dropped_tbl) - -# Export tables to CSV -#write.csv(summary_tbl, "reliability_summary_table.csv", row.names = FALSE) -#write.csv(alpha_dropped_tbl, "alpha_if_dropped_table.csv", row.names = FALSE) diff 
--git a/.history/eohi2/reliability analysis_20251028151900.r b/.history/eohi2/reliability analysis_20251028151900.r deleted file mode 100644 index a003e09..0000000 --- a/.history/eohi2/reliability analysis_20251028151900.r +++ /dev/null @@ -1,121 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_tbl <- data.frame( - Scale = character(), - RawAlpha = numeric(), - StdAlpha = numeric(), - AvgItemRestCor = numeric() -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- 
diff --git a/.history/eohi2/reliability analysis_20251028151917.r b/.history/eohi2/reliability analysis_20251028151917.r
deleted file mode 100644
index a7303b3..0000000
--- a/.history/eohi2/reliability analysis_20251028151917.r
+++ /dev/null
@@ -1,140 +0,0 @@
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-options(scipen = 999)
-
-df <- read.csv("eohi2.csv")
-library(psych)
-library(dplyr)
-library(knitr)
-
-# Your named variable sets (replace df with your actual dataframe name)
-present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel")
-past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel")
-past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel")
-fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel")
-fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel")
-
-present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex")
-past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex")
-past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex")
-fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex")
-fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex")
-
-present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice")
-past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice")
-past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice")
-fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice")
-fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice")
-
-all_scales <- list(
-  "Present_Preferences" = present_pref_vars,
-  "Past5_Preferences" = past_5_pref_vars,
-  "Past10_Preferences" = past_10_pref_vars,
-  "Fut5_Preferences" = fut_5_pref_vars,
-  "Fut10_Preferences" = fut_10_pref_vars,
-  "Present_Personality" = present_pers_vars,
-  "Past5_Personality" = past_5_pers_vars,
-  "Past10_Personality" = past_10_pers_vars,
-  "Fut5_Personality" = fut_5_pers_vars,
-  "Fut10_Personality" = fut_10_pers_vars,
-  "Present_Values" = present_val_vars,
-  "Past5_Values" = past_5_val_vars,
-  "Past10_Values" = past_10_val_vars,
-  "Fut5_Values" = fut_5_val_vars,
-  "Fut10_Values" = fut_10_val_vars
-)
-
-# Reliability analysis loop
-alpha_results <- list()
-summary_tbl <- data.frame(
-  Scale = character(),
-  RawAlpha = numeric(),
-  StdAlpha = numeric(),
-  AvgItemRestCor = numeric()
-)
-
-for(scale_name in names(all_scales)) {
-  vars <- all_scales[[scale_name]]
-  scale_data <- df %>% select(all_of(vars))
-
-  # Only run if there is more than one column present
-  if(ncol(scale_data) > 1) {
-    alpha_out <- psych::alpha(scale_data, check.keys = TRUE)
-    alpha_results[[scale_name]] <- alpha_out
-
-    # Print full output for diagnostics
-    cat("\n----------", scale_name, "----------\n")
-    print(alpha_out$total)
-    print(alpha_out$item.stats)
-    print(alpha_out$alpha.drop)
-
-    # Collect summary stats for table
-    raw_alpha <- round(alpha_out$total$raw_alpha, 3)
-    std_alpha <- round(alpha_out$total$std.alpha, 3)
-    avg_ir <- round(mean(alpha_out$item.stats$r.drop, na.rm=TRUE), 3)
-    summary_tbl <- rbind(summary_tbl, data.frame(
-      Scale = scale_name,
-      RawAlpha = raw_alpha,
-      StdAlpha = std_alpha,
-      AvgItemRestCor = avg_ir
-    ))
-  }
-}
-
-# Create detailed alpha if dropped table
-alpha_dropped_tbl <- data.frame(
-  Scale = character(),
-  Item = character(),
-  AlphaIfDropped = numeric(),
-  stringsAsFactors = FALSE
-)
-
-for(scale_name in names(all_scales)) {
-  vars <- all_scales[[scale_name]]
-  scale_data <- df %>% select(all_of(vars))
-
-  if(ncol(scale_data) > 1) {
-    alpha_out <- alpha_results[[scale_name]]
-
-    # Extract alpha if dropped for each item
-    for(i in 1:length(vars)) {
-      item_name <- vars[i]
-      alpha_dropped <- alpha_out$alpha.drop$raw_alpha[i]
-
-      alpha_dropped_tbl <- rbind(alpha_dropped_tbl, data.frame(
-        Scale = scale_name,
-        Item = item_name,
-        AlphaIfDropped = round(alpha_dropped, 4),
-        stringsAsFactors = FALSE
-      ))
-    }
-  }
-}
-
-# Check for reversed items
-cat("\n=== REVERSED ITEMS CHECK ===\n")
-for(scale_name in names(all_scales)) {
-  vars <- all_scales[[scale_name]]
-  scale_data <- df %>% select(all_of(vars))
-
-  if(ncol(scale_data) > 1) {
-    alpha_out <- alpha_results[[scale_name]]
-
-    # Check if any items were reversed (indicated by negative signs in keys)
-    reversed_items <- names(alpha_out$keys)[alpha_out$keys < 0]
-    if(length(reversed_items) > 0) {
-      cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n")
-    } else {
-      cat(scale_name, ": No items reversed\n")
-    }
-  }
-}
-
-# Summary table for reporting
-print(summary_tbl)
-
-print(alpha_dropped_tbl)
-
-# Export tables to CSV
-#write.csv(summary_tbl, "reliability_summary_table.csv", row.names = FALSE)
-#write.csv(alpha_dropped_tbl, "alpha_if_dropped_table.csv", row.names = FALSE)
alpha_dropped <- alpha_out$alpha.drop$raw_alpha[i] - - alpha_dropped_tbl <- rbind(alpha_dropped_tbl, data.frame( - Scale = scale_name, - Item = item_name, - AlphaIfDropped = round(alpha_dropped, 4), - stringsAsFactors = FALSE - )) - } - } -} - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed (indicated by negative signs in keys) - reversed_items <- names(alpha_out$keys)[alpha_out$keys < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } -} - -# Summary table for reporting -print(summary_tbl) - -print(alpha_dropped_tbl) - -# Export tables to CSV -#write.csv(summary_tbl, "reliability_summary_table.csv", row.names = FALSE) -#write.csv(alpha_dropped_tbl, "alpha_if_dropped_table.csv", row.names = FALSE) diff --git a/.history/eohi2/reliability analysis_20251028152017.r b/.history/eohi2/reliability analysis_20251028152017.r deleted file mode 100644 index c6660b5..0000000 --- a/.history/eohi2/reliability analysis_20251028152017.r +++ /dev/null @@ -1,141 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", 
"fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_tbl <- data.frame( - Scale = character(), - RawAlpha = numeric(), - StdAlpha = numeric(), - AvgItemRestCor = numeric() -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary stats for table - raw_alpha <- round(alpha_out$total$raw_alpha, 3) - std_alpha <- round(alpha_out$total$std.alpha, 3) - avg_ir <- round(mean(alpha_out$item.stats$r.drop, na.rm=TRUE), 3) - summary_tbl <- rbind(summary_tbl, data.frame( - Scale = scale_name, - RawAlpha = raw_alpha, - StdAlpha = std_alpha, - AvgItemRestCor = avg_ir - )) - } -} - -# Create detailed alpha if dropped table -alpha_dropped_tbl <- data.frame( - Scale = character(), - Item = character(), - AlphaIfDropped = numeric(), - stringsAsFactors = FALSE -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Extract alpha if dropped for each item - for(i in 1:length(vars)) { - item_name <- vars[i] - alpha_dropped <- alpha_out$alpha.drop$raw_alpha[i] - - alpha_dropped_tbl <- rbind(alpha_dropped_tbl, data.frame( - Scale = scale_name, - Item = item_name, - AlphaIfDropped = round(alpha_dropped, 4), - stringsAsFactors = FALSE - )) - } - } -} - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed (indicated by negative signs in keys) - keys_vector <- as.numeric(alpha_out$keys) - reversed_items <- names(alpha_out$keys)[keys_vector < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } -} - -# Summary table for reporting -print(summary_tbl) - -print(alpha_dropped_tbl) - -# Export tables to CSV -#write.csv(summary_tbl, "reliability_summary_table.csv", row.names = FALSE) -#write.csv(alpha_dropped_tbl, "alpha_if_dropped_table.csv", row.names = FALSE) diff --git a/.history/eohi2/reliability analysis_20251028152020.r b/.history/eohi2/reliability analysis_20251028152020.r deleted file mode 100644 index c6660b5..0000000 --- a/.history/eohi2/reliability analysis_20251028152020.r 
+++ /dev/null @@ -1,141 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_tbl <- data.frame( - Scale = character(), - RawAlpha = numeric(), - StdAlpha = numeric(), - AvgItemRestCor = numeric() -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - 
print(alpha_out$alpha.drop) - - # Collect summary stats for table - raw_alpha <- round(alpha_out$total$raw_alpha, 3) - std_alpha <- round(alpha_out$total$std.alpha, 3) - avg_ir <- round(mean(alpha_out$item.stats$r.drop, na.rm=TRUE), 3) - summary_tbl <- rbind(summary_tbl, data.frame( - Scale = scale_name, - RawAlpha = raw_alpha, - StdAlpha = std_alpha, - AvgItemRestCor = avg_ir - )) - } -} - -# Create detailed alpha if dropped table -alpha_dropped_tbl <- data.frame( - Scale = character(), - Item = character(), - AlphaIfDropped = numeric(), - stringsAsFactors = FALSE -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Extract alpha if dropped for each item - for(i in 1:length(vars)) { - item_name <- vars[i] - alpha_dropped <- alpha_out$alpha.drop$raw_alpha[i] - - alpha_dropped_tbl <- rbind(alpha_dropped_tbl, data.frame( - Scale = scale_name, - Item = item_name, - AlphaIfDropped = round(alpha_dropped, 4), - stringsAsFactors = FALSE - )) - } - } -} - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed (indicated by negative signs in keys) - keys_vector <- as.numeric(alpha_out$keys) - reversed_items <- names(alpha_out$keys)[keys_vector < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } -} - -# Summary table for reporting -print(summary_tbl) - -print(alpha_dropped_tbl) - -# Export tables to CSV -#write.csv(summary_tbl, "reliability_summary_table.csv", row.names = FALSE) -#write.csv(alpha_dropped_tbl, "alpha_if_dropped_table.csv", row.names = FALSE) diff --git a/.history/eohi2/reliability analysis_20251028152033.r b/.history/eohi2/reliability analysis_20251028152033.r deleted file mode 100644 index c6660b5..0000000 --- a/.history/eohi2/reliability analysis_20251028152033.r +++ /dev/null @@ -1,141 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", 
"past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_tbl <- data.frame( - Scale = character(), - RawAlpha = numeric(), - StdAlpha = numeric(), - AvgItemRestCor = numeric() -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary stats for table - raw_alpha <- round(alpha_out$total$raw_alpha, 3) - std_alpha <- round(alpha_out$total$std.alpha, 3) - avg_ir <- round(mean(alpha_out$item.stats$r.drop, na.rm=TRUE), 3) - summary_tbl <- rbind(summary_tbl, data.frame( - Scale = scale_name, - RawAlpha = raw_alpha, - StdAlpha = std_alpha, - AvgItemRestCor = avg_ir - )) - } -} - -# Create detailed alpha if dropped table -alpha_dropped_tbl <- data.frame( - Scale = character(), - Item = character(), - AlphaIfDropped = numeric(), - stringsAsFactors = FALSE -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Extract alpha if dropped for each item - for(i in 1:length(vars)) { - item_name <- vars[i] - alpha_dropped <- alpha_out$alpha.drop$raw_alpha[i] - - alpha_dropped_tbl <- rbind(alpha_dropped_tbl, data.frame( - Scale = scale_name, - Item = item_name, - AlphaIfDropped = round(alpha_dropped, 4), - stringsAsFactors = FALSE - )) - } - } -} - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - 
if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed (indicated by negative signs in keys) - keys_vector <- as.numeric(alpha_out$keys) - reversed_items <- names(alpha_out$keys)[keys_vector < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } -} - -# Summary table for reporting -print(summary_tbl) - -print(alpha_dropped_tbl) - -# Export tables to CSV -#write.csv(summary_tbl, "reliability_summary_table.csv", row.names = FALSE) -#write.csv(alpha_dropped_tbl, "alpha_if_dropped_table.csv", row.names = FALSE) diff --git a/.history/eohi2/reliability analysis_20251028152127.r b/.history/eohi2/reliability analysis_20251028152127.r deleted file mode 100644 index 7bb25dc..0000000 --- a/.history/eohi2/reliability analysis_20251028152127.r +++ /dev/null @@ -1,145 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = 
past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_tbl <- data.frame( - Scale = character(), - RawAlpha = numeric(), - StdAlpha = numeric(), - AvgItemRestCor = numeric() -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary stats for table - raw_alpha <- round(alpha_out$total$raw_alpha, 3) - std_alpha <- round(alpha_out$total$std.alpha, 3) - avg_ir <- round(mean(alpha_out$item.stats$r.drop, na.rm=TRUE), 3) - summary_tbl <- rbind(summary_tbl, data.frame( - Scale = scale_name, - RawAlpha = raw_alpha, - StdAlpha = std_alpha, - AvgItemRestCor = avg_ir - )) - } -} - -# Create detailed alpha if dropped table -alpha_dropped_tbl <- data.frame( - Scale = character(), - Item = character(), - AlphaIfDropped = numeric(), - stringsAsFactors = FALSE -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Extract alpha if dropped for each item - for(i in 1:length(vars)) { - item_name <- vars[i] - alpha_dropped <- alpha_out$alpha.drop$raw_alpha[i] - - alpha_dropped_tbl <- rbind(alpha_dropped_tbl, data.frame( - Scale = scale_name, - Item = item_name, - AlphaIfDropped = round(alpha_dropped, 4), - stringsAsFactors = FALSE - )) - } - } -} - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - -# Summary table for reporting -print(summary_tbl) - -print(alpha_dropped_tbl) - -# Export tables to CSV -#write.csv(summary_tbl, "reliability_summary_table.csv", row.names = FALSE) -#write.csv(alpha_dropped_tbl, "alpha_if_dropped_table.csv", row.names = FALSE) diff --git a/.history/eohi2/reliability analysis_20251028152132.r b/.history/eohi2/reliability analysis_20251028152132.r deleted file mode 100644 index b32a37f..0000000 --- a/.history/eohi2/reliability analysis_20251028152132.r +++ /dev/null @@ -1,148 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", 
"present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_tbl <- data.frame( - Scale = character(), - RawAlpha = numeric(), - StdAlpha = numeric(), - AvgItemRestCor = numeric() -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary stats for table - raw_alpha <- round(alpha_out$total$raw_alpha, 3) - std_alpha <- round(alpha_out$total$std.alpha, 3) - avg_ir <- round(mean(alpha_out$item.stats$r.drop, na.rm=TRUE), 3) - summary_tbl <- rbind(summary_tbl, data.frame( - Scale = scale_name, - RawAlpha 
= raw_alpha, - StdAlpha = std_alpha, - AvgItemRestCor = avg_ir - )) - } -} - -# Create detailed alpha if dropped table -alpha_dropped_tbl <- data.frame( - Scale = character(), - Item = character(), - AlphaIfDropped = numeric(), - stringsAsFactors = FALSE -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Extract alpha if dropped for each item - for(i in 1:length(vars)) { - item_name <- vars[i] - alpha_dropped <- alpha_out$alpha.drop$raw_alpha[i] - - alpha_dropped_tbl <- rbind(alpha_dropped_tbl, data.frame( - Scale = scale_name, - Item = item_name, - AlphaIfDropped = round(alpha_dropped, 4), - stringsAsFactors = FALSE - )) - } - } -} - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - -# Summary table for reporting -cat("\n=== SUMMARY TABLE ===\n") -print(summary_tbl) - -cat("\n=== ALPHA IF DROPPED TABLE ===\n") -print(alpha_dropped_tbl) -cat("Alpha dropped table has", nrow(alpha_dropped_tbl), "rows\n") - -# Export tables to CSV -#write.csv(summary_tbl, "reliability_summary_table.csv", row.names = FALSE) -#write.csv(alpha_dropped_tbl, "alpha_if_dropped_table.csv", row.names = FALSE) diff --git a/.history/eohi2/reliability analysis_20251028152134.r b/.history/eohi2/reliability analysis_20251028152134.r deleted file mode 100644 index b32a37f..0000000 --- a/.history/eohi2/reliability analysis_20251028152134.r +++ /dev/null @@ -1,148 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", 
"fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_tbl <- data.frame( - Scale = character(), - RawAlpha = numeric(), - StdAlpha = numeric(), - AvgItemRestCor = numeric() -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary stats for table - raw_alpha <- round(alpha_out$total$raw_alpha, 3) - std_alpha <- round(alpha_out$total$std.alpha, 3) - avg_ir <- round(mean(alpha_out$item.stats$r.drop, na.rm=TRUE), 3) - summary_tbl <- rbind(summary_tbl, data.frame( - Scale = scale_name, - RawAlpha = raw_alpha, - StdAlpha = std_alpha, - AvgItemRestCor = avg_ir - )) - } -} - -# Create detailed alpha if dropped table -alpha_dropped_tbl <- data.frame( - Scale = character(), - Item = character(), - AlphaIfDropped = numeric(), - stringsAsFactors = FALSE -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Extract alpha if dropped for each item - for(i in 1:length(vars)) { - item_name <- vars[i] - alpha_dropped <- alpha_out$alpha.drop$raw_alpha[i] - - alpha_dropped_tbl <- rbind(alpha_dropped_tbl, data.frame( - Scale = scale_name, - Item = item_name, - AlphaIfDropped = round(alpha_dropped, 4), - stringsAsFactors = FALSE - )) - } - } -} - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the 
keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - -# Summary table for reporting -cat("\n=== SUMMARY TABLE ===\n") -print(summary_tbl) - -cat("\n=== ALPHA IF DROPPED TABLE ===\n") -print(alpha_dropped_tbl) -cat("Alpha dropped table has", nrow(alpha_dropped_tbl), "rows\n") - -# Export tables to CSV -#write.csv(summary_tbl, "reliability_summary_table.csv", row.names = FALSE) -#write.csv(alpha_dropped_tbl, "alpha_if_dropped_table.csv", row.names = FALSE) diff --git a/.history/eohi2/reliability analysis_20251028152156.r b/.history/eohi2/reliability analysis_20251028152156.r deleted file mode 100644 index b32a37f..0000000 --- a/.history/eohi2/reliability analysis_20251028152156.r +++ /dev/null @@ -1,148 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = 
present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_tbl <- data.frame( - Scale = character(), - RawAlpha = numeric(), - StdAlpha = numeric(), - AvgItemRestCor = numeric() -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary stats for table - raw_alpha <- round(alpha_out$total$raw_alpha, 3) - std_alpha <- round(alpha_out$total$std.alpha, 3) - avg_ir <- round(mean(alpha_out$item.stats$r.drop, na.rm=TRUE), 3) - summary_tbl <- rbind(summary_tbl, data.frame( - Scale = scale_name, - RawAlpha = raw_alpha, - StdAlpha = std_alpha, - AvgItemRestCor = avg_ir - )) - } -} - -# Create detailed alpha if dropped table -alpha_dropped_tbl <- data.frame( - Scale = character(), - Item = character(), - AlphaIfDropped = numeric(), - stringsAsFactors = FALSE -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Extract alpha if dropped for each item - for(i in 1:length(vars)) { - item_name <- vars[i] - alpha_dropped <- alpha_out$alpha.drop$raw_alpha[i] - - alpha_dropped_tbl <- rbind(alpha_dropped_tbl, data.frame( - Scale = scale_name, - Item = item_name, - AlphaIfDropped = round(alpha_dropped, 4), - stringsAsFactors = FALSE - )) - } - } -} - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - -# Summary table for reporting -cat("\n=== SUMMARY TABLE ===\n") -print(summary_tbl) - -cat("\n=== ALPHA IF DROPPED TABLE ===\n") -print(alpha_dropped_tbl) -cat("Alpha dropped table has", nrow(alpha_dropped_tbl), "rows\n") - -# Export tables to CSV -#write.csv(summary_tbl, "reliability_summary_table.csv", row.names = FALSE) -#write.csv(alpha_dropped_tbl, "alpha_if_dropped_table.csv", row.names = FALSE) diff --git a/.history/eohi2/reliability analysis_20251028161222.r b/.history/eohi2/reliability analysis_20251028161222.r deleted file mode 100644 index 1ad0cc7..0000000 --- a/.history/eohi2/reliability analysis_20251028161222.r +++ /dev/null @@ -1,140 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - 
-options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_tbl <- data.frame( - Scale = character(), - RawAlpha = numeric(), - StdAlpha = numeric(), - AvgItemRestCor = numeric() -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary stats for table - raw_alpha <- 
round(alpha_out$total$raw_alpha, 3) - std_alpha <- round(alpha_out$total$std.alpha, 3) - avg_ir <- round(mean(alpha_out$item.stats$r.drop, na.rm=TRUE), 3) - summary_tbl <- rbind(summary_tbl, data.frame( - Scale = scale_name, - RawAlpha = raw_alpha, - StdAlpha = std_alpha, - AvgItemRestCor = avg_ir - )) - } -} - -# Create detailed alpha if dropped table -alpha_dropped_tbl <- data.frame( - Scale = character(), - Item = character(), - AlphaIfDropped = numeric(), - stringsAsFactors = FALSE -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Extract alpha if dropped for each item - for(i in 1:length(vars)) { - item_name <- vars[i] - alpha_dropped <- alpha_out$alpha.drop$raw_alpha[i] - - alpha_dropped_tbl <- rbind(alpha_dropped_tbl, data.frame( - Scale = scale_name, - Item = item_name, - AlphaIfDropped = round(alpha_dropped, 4), - stringsAsFactors = FALSE - )) - } - } -} - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - -# Export tables to CSV -#write.csv(summary_tbl, "reliability_summary_table.csv", row.names = FALSE) -#write.csv(alpha_dropped_tbl, "alpha_if_dropped_table.csv", row.names = FALSE) diff --git a/.history/eohi2/reliability analysis_20251028161229.r b/.history/eohi2/reliability analysis_20251028161229.r deleted file mode 100644 index 34352f7..0000000 --- a/.history/eohi2/reliability analysis_20251028161229.r +++ /dev/null @@ -1,134 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", 
"fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary stats for table - raw_alpha <- round(alpha_out$total$raw_alpha, 3) - std_alpha <- round(alpha_out$total$std.alpha, 3) - avg_ir <- round(mean(alpha_out$item.stats$r.drop, na.rm=TRUE), 3) - summary_tbl <- rbind(summary_tbl, data.frame( - Scale = scale_name, - RawAlpha = raw_alpha, - StdAlpha = std_alpha, - AvgItemRestCor = avg_ir - )) - } -} - -# Create detailed alpha if dropped table -alpha_dropped_tbl <- data.frame( - Scale = character(), - Item = character(), - AlphaIfDropped = numeric(), - stringsAsFactors = FALSE -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Extract alpha if dropped for each item - for(i in 1:length(vars)) { - item_name <- vars[i] - alpha_dropped <- alpha_out$alpha.drop$raw_alpha[i] - - alpha_dropped_tbl <- rbind(alpha_dropped_tbl, data.frame( - Scale = scale_name, - Item = item_name, - AlphaIfDropped = round(alpha_dropped, 4), - stringsAsFactors = FALSE - )) - } - } -} - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- 
rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - -# Export tables to CSV -#write.csv(summary_tbl, "reliability_summary_table.csv", row.names = FALSE) -#write.csv(alpha_dropped_tbl, "alpha_if_dropped_table.csv", row.names = FALSE) diff --git a/.history/eohi2/reliability analysis_20251028161236.r b/.history/eohi2/reliability analysis_20251028161236.r deleted file mode 100644 index 3679439..0000000 --- a/.history/eohi2/reliability analysis_20251028161236.r +++ /dev/null @@ -1,123 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - 
"Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - } -} - -# Create detailed alpha if dropped table -alpha_dropped_tbl <- data.frame( - Scale = character(), - Item = character(), - AlphaIfDropped = numeric(), - stringsAsFactors = FALSE -) - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Extract alpha if dropped for each item - for(i in 1:length(vars)) { - item_name <- vars[i] - alpha_dropped <- alpha_out$alpha.drop$raw_alpha[i] - - alpha_dropped_tbl <- rbind(alpha_dropped_tbl, data.frame( - Scale = scale_name, - Item = item_name, - AlphaIfDropped = round(alpha_dropped, 4), - stringsAsFactors = FALSE - )) - } - } -} - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - -# Export tables to CSV -#write.csv(summary_tbl, "reliability_summary_table.csv", row.names = FALSE) -#write.csv(alpha_dropped_tbl, "alpha_if_dropped_table.csv", row.names = FALSE) diff --git a/.history/eohi2/reliability analysis_20251028161240.r b/.history/eohi2/reliability analysis_20251028161240.r deleted file mode 100644 index 62366f2..0000000 --- a/.history/eohi2/reliability analysis_20251028161240.r +++ /dev/null @@ -1,94 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", 
"past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - } -} - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - -# Export tables to CSV -#write.csv(summary_tbl, "reliability_summary_table.csv", row.names = FALSE) -#write.csv(alpha_dropped_tbl, "alpha_if_dropped_table.csv", row.names = FALSE) diff --git a/.history/eohi2/reliability analysis_20251028161242.r b/.history/eohi2/reliability analysis_20251028161242.r deleted file mode 100644 index 0da2aba..0000000 --- a/.history/eohi2/reliability analysis_20251028161242.r +++ /dev/null @@ -1,91 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- 
read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - } -} - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { 
- alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028161247.r b/.history/eohi2/reliability analysis_20251028161247.r deleted file mode 100644 index 0da2aba..0000000 --- a/.history/eohi2/reliability analysis_20251028161247.r +++ /dev/null @@ -1,91 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = 
fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - } -} - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028161950.r b/.history/eohi2/reliability analysis_20251028161950.r deleted file mode 100644 index 0da2aba..0000000 --- a/.history/eohi2/reliability analysis_20251028161950.r +++ /dev/null @@ -1,91 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") 
-fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - } -} - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028162118.r b/.history/eohi2/reliability analysis_20251028162118.r deleted file mode 100644 index c09b4e9..0000000 --- a/.history/eohi2/reliability analysis_20251028162118.r +++ /dev/null @@ -1,110 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") 
-past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -all_results <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Combine results for CSV export - scale_total <- alpha_out$total - scale_items <- alpha_out$item.stats - scale_alpha_drop <- alpha_out$alpha.drop - - # Add scale name to each row - scale_total$Scale <- scale_name - scale_items$Scale <- scale_name - scale_alpha_drop$Scale <- scale_name - - # Add row type identifier - scale_total$Output_Type <- "Total" - scale_items$Output_Type <- "Item_Stats" - scale_alpha_drop$Output_Type <- "Alpha_Drop" - - # Combine all results - all_results <- rbind(all_results, scale_total, scale_items, scale_alpha_drop) - } -} - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - 
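A note on the reversed-items check that recurs in each of these snapshots: it coerces alpha_out$keys to a data frame and tests keys_df[,1] < 0, which assumes psych::alpha() returns a numeric vector of +1/-1 keys. Depending on the psych version, $keys may instead come back as a character vector of item names with reversed items prefixed by "-", in which case the numeric comparison misbehaves. Below is a minimal sketch of a version-agnostic check; it assumes only that alpha_out is the result of psych::alpha(..., check.keys = TRUE), and the helper name reversed_from_keys and the simulated data are illustrative, not part of the original scripts:

library(psych)

# Return the names of reverse-keyed items from a psych::alpha() result,
# handling both the numeric (+1/-1) and character ("-item") forms of $keys.
reversed_from_keys <- function(alpha_out, item_names) {
  keys <- alpha_out$keys
  if (is.null(keys)) return(character(0))
  if (is.numeric(keys)) return(item_names[keys < 0])  # numeric +1/-1 form
  sub("^-", "", keys[grepl("^-", keys)])              # character "-item" form
}

# Illustrative check: q3 is built to correlate negatively with q1 and q2,
# so check.keys = TRUE should reverse it.
set.seed(1)
q1 <- rnorm(100)
d  <- data.frame(q1 = q1,
                 q2 = q1 + rnorm(100, sd = 0.5),
                 q3 = -q1 + rnorm(100, sd = 0.5))
a  <- psych::alpha(d, check.keys = TRUE)
reversed_from_keys(a, names(d))  # expected: "q3"

Substituting something like this for the keys_df comparison would make the reversed-items block behave the same across psych releases.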
diff --git a/.history/eohi2/reliability analysis_20251028162136.r b/.history/eohi2/reliability analysis_20251028162136.r deleted file mode 100644 index 594eb57..0000000 --- a/.history/eohi2/reliability analysis_20251028162136.r +++ /dev/null @@ -1,113 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -all_results <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for 
diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Combine results for CSV export - scale_total <- alpha_out$total - scale_items <- alpha_out$item.stats - scale_alpha_drop <- alpha_out$alpha.drop - - # Add scale name to each row - scale_total$Scale <- scale_name - scale_items$Scale <- scale_name - scale_alpha_drop$Scale <- scale_name - - # Add row type identifier - scale_total$Output_Type <- "Total" - scale_items$Output_Type <- "Item_Stats" - scale_alpha_drop$Output_Type <- "Alpha_Drop" - - # Combine all results - all_results <- rbind(all_results, scale_total, scale_items, scale_alpha_drop) - } -} - -# Export results to CSV -write.csv(all_results, "reliability_analysis_results.csv", row.names = TRUE) -cat("\nResults exported to: reliability_analysis_results.csv\n") - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028162145.r b/.history/eohi2/reliability analysis_20251028162145.r deleted file mode 100644 index 594eb57..0000000 --- a/.history/eohi2/reliability analysis_20251028162145.r +++ /dev/null @@ -1,113 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") 
-past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -all_results <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Combine results for CSV export - scale_total <- alpha_out$total - scale_items <- alpha_out$item.stats - scale_alpha_drop <- alpha_out$alpha.drop - - # Add scale name to each row - scale_total$Scale <- scale_name - scale_items$Scale <- scale_name - scale_alpha_drop$Scale <- scale_name - - # Add row type identifier - scale_total$Output_Type <- "Total" - scale_items$Output_Type <- "Item_Stats" - scale_alpha_drop$Output_Type <- "Alpha_Drop" - - # Combine all results - all_results <- rbind(all_results, scale_total, scale_items, scale_alpha_drop) - } -} - -# Export results to CSV -write.csv(all_results, "reliability_analysis_results.csv", row.names = TRUE) -cat("\nResults exported to: reliability_analysis_results.csv\n") - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028162201.r b/.history/eohi2/reliability analysis_20251028162201.r deleted file mode 100644 index d8a7b60..0000000 --- a/.history/eohi2/reliability analysis_20251028162201.r +++ /dev/null @@ -1,113 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) 
-library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -all_results <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Combine results for CSV export - scale_total <- alpha_out$total - scale_items <- alpha_out$item.stats - scale_alpha_drop <- alpha_out$alpha.drop - - # Add scale name to each row - scale_total$Scale <- scale_name - scale_items$Scale <- scale_name - 
scale_alpha_drop$Scale <- scale_name - - # Add row type identifier - scale_total$Output_Type <- "Total" - scale_items$Output_Type <- "Item_Stats" - scale_alpha_drop$Output_Type <- "Alpha_Drop" - - # Combine all results - all_results <- rbind(all_results, scale_total, scale_items, scale_alpha_drop) - } -} - -# Export results to CSV -write.csv(all_results, "reliability_analysis_results.csv", row.names = TRUE) - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028162240.r b/.history/eohi2/reliability analysis_20251028162240.r deleted file mode 100644 index 67eed4a..0000000 --- a/.history/eohi2/reliability analysis_20251028162240.r +++ /dev/null @@ -1,114 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", 
"fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -total_results <- data.frame() -item_stats_results <- data.frame() -alpha_drop_results <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Prepare results for CSV export - scale_total <- alpha_out$total - scale_items <- alpha_out$item.stats - scale_alpha_drop <- alpha_out$alpha.drop - - # Add scale name to each row - scale_total$Scale <- scale_name - scale_items$Scale <- scale_name - scale_alpha_drop$Scale <- scale_name - - # Combine results by type - total_results <- rbind(total_results, scale_total) - item_stats_results <- rbind(item_stats_results, scale_items) - alpha_drop_results <- rbind(alpha_drop_results, scale_alpha_drop) - } -} - -# Export results to separate CSV files -write.csv(total_results, "reliability_total_stats.csv", row.names = TRUE) -write.csv(item_stats_results, "reliability_item_stats.csv", row.names = TRUE) -write.csv(alpha_drop_results, "reliability_alpha_drop.csv", row.names = TRUE) - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028162243.r b/.history/eohi2/reliability analysis_20251028162243.r deleted file mode 100644 index 67eed4a..0000000 --- a/.history/eohi2/reliability analysis_20251028162243.r +++ /dev/null @@ -1,114 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", 
"past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -total_results <- data.frame() -item_stats_results <- data.frame() -alpha_drop_results <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Prepare results for CSV export - scale_total <- alpha_out$total - scale_items <- alpha_out$item.stats - scale_alpha_drop <- alpha_out$alpha.drop - - # Add scale name to each row - scale_total$Scale <- scale_name - scale_items$Scale <- scale_name - scale_alpha_drop$Scale <- scale_name - - # Combine results by type - total_results <- rbind(total_results, scale_total) - item_stats_results <- rbind(item_stats_results, scale_items) - alpha_drop_results <- rbind(alpha_drop_results, 
scale_alpha_drop) - } -} - -# Export results to separate CSV files -write.csv(total_results, "reliability_total_stats.csv", row.names = TRUE) -write.csv(item_stats_results, "reliability_item_stats.csv", row.names = TRUE) -write.csv(alpha_drop_results, "reliability_alpha_drop.csv", row.names = TRUE) - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028162251.r b/.history/eohi2/reliability analysis_20251028162251.r deleted file mode 100644 index 67eed4a..0000000 --- a/.history/eohi2/reliability analysis_20251028162251.r +++ /dev/null @@ -1,114 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = 
present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -total_results <- data.frame() -item_stats_results <- data.frame() -alpha_drop_results <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Prepare results for CSV export - scale_total <- alpha_out$total - scale_items <- alpha_out$item.stats - scale_alpha_drop <- alpha_out$alpha.drop - - # Add scale name to each row - scale_total$Scale <- scale_name - scale_items$Scale <- scale_name - scale_alpha_drop$Scale <- scale_name - - # Combine results by type - total_results <- rbind(total_results, scale_total) - item_stats_results <- rbind(item_stats_results, scale_items) - alpha_drop_results <- rbind(alpha_drop_results, scale_alpha_drop) - } -} - -# Export results to separate CSV files -write.csv(total_results, "reliability_total_stats.csv", row.names = TRUE) -write.csv(item_stats_results, "reliability_item_stats.csv", row.names = TRUE) -write.csv(alpha_drop_results, "reliability_alpha_drop.csv", row.names = TRUE) - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028162450.r b/.history/eohi2/reliability analysis_20251028162450.r deleted file mode 100644 index 02e2aa3..0000000 --- a/.history/eohi2/reliability analysis_20251028162450.r +++ /dev/null @@ -1,175 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", 
"past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -total_results <- data.frame() -item_stats_results <- data.frame() -alpha_drop_results <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Prepare results for CSV export - scale_total <- alpha_out$total - scale_items <- alpha_out$item.stats - scale_alpha_drop <- alpha_out$alpha.drop - - # Add scale name to each row - scale_total$Scale <- scale_name - scale_items$Scale <- scale_name - scale_alpha_drop$Scale <- scale_name - - # Combine results by type - total_results <- rbind(total_results, scale_total) - item_stats_results <- rbind(item_stats_results, scale_items) - alpha_drop_results <- rbind(alpha_drop_results, scale_alpha_drop) - } -} - -# Create HTML report -html_content <- " - - - Reliability Analysis Results - - - -

-<h1>Reliability Analysis Results</h1>
-<p>Generated on: " %+% Sys.time() %+% "</p>
-"
-
-# Add total statistics
-html_content <- html_content %+% "
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-"
-
-for(i in 1:nrow(total_results)) {
-  row <- total_results[i,]
-  html_content <- html_content %+% sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r)
-}
-
-html_content <- html_content %+% "</table>"
-
-# Add item statistics
-html_content <- html_content %+% "
-<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th></tr>
-"
-
-for(i in 1:nrow(item_stats_results)) {
-  row <- item_stats_results[i,]
-  html_content <- html_content %+% sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd)
-}
-
-html_content <- html_content %+% "</table>"
-
-# Add alpha if dropped
-html_content <- html_content %+% "
-<h2>Alpha If Item Dropped</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>Alpha SE</th><th>Var r</th><th>Med r</th></tr>
-"
-
-for(i in 1:nrow(alpha_drop_results)) {
-  row <- alpha_drop_results[i,]
-  html_content <- html_content %+% sprintf("<tr><td>%s</td><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$alpha.se, row$var.r, row$med.r)
-}
-
-html_content <- html_content %+% "</table>
-</body>
-</html>
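# Note: %+% is not a base-R operator, so the snapshots above only run if an
# infix string-concatenation helper was defined earlier in the session. A
# minimal sketch of the assumed definition (later snapshots switch to
# paste0() and need no helper):
"%+%" <- function(lhs, rhs) paste0(lhs, rhs)   # "a" %+% "b" returns "ab"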
- -" - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028162455.r b/.history/eohi2/reliability analysis_20251028162455.r deleted file mode 100644 index 02e2aa3..0000000 --- a/.history/eohi2/reliability analysis_20251028162455.r +++ /dev/null @@ -1,175 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = 
present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -total_results <- data.frame() -item_stats_results <- data.frame() -alpha_drop_results <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Prepare results for CSV export - scale_total <- alpha_out$total - scale_items <- alpha_out$item.stats - scale_alpha_drop <- alpha_out$alpha.drop - - # Add scale name to each row - scale_total$Scale <- scale_name - scale_items$Scale <- scale_name - scale_alpha_drop$Scale <- scale_name - - # Combine results by type - total_results <- rbind(total_results, scale_total) - item_stats_results <- rbind(item_stats_results, scale_items) - alpha_drop_results <- rbind(alpha_drop_results, scale_alpha_drop) - } -} - -# Create HTML report -html_content <- " - - - Reliability Analysis Results - - - -

-<h1>Reliability Analysis Results</h1>
-<p>Generated on: " %+% Sys.time() %+% "</p>
-"
-
-# Add total statistics
-html_content <- html_content %+% "
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-"
-
-for(i in 1:nrow(total_results)) {
-  row <- total_results[i,]
-  html_content <- html_content %+% sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r)
-}
-
-html_content <- html_content %+% "</table>"
-
-# Add item statistics
-html_content <- html_content %+% "
-<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th></tr>
-"
-
-for(i in 1:nrow(item_stats_results)) {
-  row <- item_stats_results[i,]
-  html_content <- html_content %+% sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd)
-}
-
-html_content <- html_content %+% "</table>"
-
-# Add alpha if dropped
-html_content <- html_content %+% "
-<h2>Alpha If Item Dropped</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>Alpha SE</th><th>Var r</th><th>Med r</th></tr>
-"
-
-for(i in 1:nrow(alpha_drop_results)) {
-  row <- alpha_drop_results[i,]
-  html_content <- html_content %+% sprintf("<tr><td>%s</td><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$alpha.se, row$var.r, row$med.r)
-}
-
-html_content <- html_content %+% "</table>
-</body>
-</html>
- -" - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028162504.r b/.history/eohi2/reliability analysis_20251028162504.r deleted file mode 100644 index 02e2aa3..0000000 --- a/.history/eohi2/reliability analysis_20251028162504.r +++ /dev/null @@ -1,175 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = 
present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -total_results <- data.frame() -item_stats_results <- data.frame() -alpha_drop_results <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Prepare results for CSV export - scale_total <- alpha_out$total - scale_items <- alpha_out$item.stats - scale_alpha_drop <- alpha_out$alpha.drop - - # Add scale name to each row - scale_total$Scale <- scale_name - scale_items$Scale <- scale_name - scale_alpha_drop$Scale <- scale_name - - # Combine results by type - total_results <- rbind(total_results, scale_total) - item_stats_results <- rbind(item_stats_results, scale_items) - alpha_drop_results <- rbind(alpha_drop_results, scale_alpha_drop) - } -} - -# Create HTML report -html_content <- " - - - Reliability Analysis Results - - - -

-<h1>Reliability Analysis Results</h1>
-<p>Generated on: " %+% Sys.time() %+% "</p>
-"
-
-# Add total statistics
-html_content <- html_content %+% "
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-"
-
-for(i in 1:nrow(total_results)) {
-  row <- total_results[i,]
-  html_content <- html_content %+% sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r)
-}
-
-html_content <- html_content %+% "</table>"
-
-# Add item statistics
-html_content <- html_content %+% "
-<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th></tr>
-"
-
-for(i in 1:nrow(item_stats_results)) {
-  row <- item_stats_results[i,]
-  html_content <- html_content %+% sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd)
-}
-
-html_content <- html_content %+% "</table>"
-
-# Add alpha if dropped
-html_content <- html_content %+% "
-<h2>Alpha If Item Dropped</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>Alpha SE</th><th>Var r</th><th>Med r</th></tr>
-"
-
-for(i in 1:nrow(alpha_drop_results)) {
-  row <- alpha_drop_results[i,]
-  html_content <- html_content %+% sprintf("<tr><td>%s</td><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$alpha.se, row$var.r, row$med.r)
-}
-
-html_content <- html_content %+% "</table>
-</body>
-</html>
- -" - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028162602.r b/.history/eohi2/reliability analysis_20251028162602.r deleted file mode 100644 index 32fbc51..0000000 --- a/.history/eohi2/reliability analysis_20251028162602.r +++ /dev/null @@ -1,175 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = 
present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -total_results <- data.frame() -item_stats_results <- data.frame() -alpha_drop_results <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Prepare results for CSV export - scale_total <- alpha_out$total - scale_items <- alpha_out$item.stats - scale_alpha_drop <- alpha_out$alpha.drop - - # Add scale name to each row - scale_total$Scale <- scale_name - scale_items$Scale <- scale_name - scale_alpha_drop$Scale <- scale_name - - # Combine results by type - total_results <- rbind(total_results, scale_total) - item_stats_results <- rbind(item_stats_results, scale_items) - alpha_drop_results <- rbind(alpha_drop_results, scale_alpha_drop) - } -} - -# Create HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-")
-
-# Add total statistics
-html_content <- paste0(html_content, "
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-for(i in 1:nrow(total_results)) {
-  row <- total_results[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r))
-}
-
-html_content <- paste0(html_content, "</table>")
-
-# Add item statistics
-html_content <- paste0(html_content, "
-<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th></tr>
-")
-
-for(i in 1:nrow(item_stats_results)) {
-  row <- item_stats_results[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd))
-}
-
-html_content <- paste0(html_content, "</table>")
-
-# Add alpha if dropped
-html_content <- paste0(html_content, "
-<h2>Alpha If Item Dropped</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>Alpha SE</th><th>Var r</th><th>Med r</th></tr>
-")
-
-for(i in 1:nrow(alpha_drop_results)) {
-  row <- alpha_drop_results[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$alpha.se, row$var.r, row$med.r))
-}
-
-html_content <- paste0(html_content, "</table>
-</body>
-</html>
- -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028162606.r b/.history/eohi2/reliability analysis_20251028162606.r deleted file mode 100644 index 32fbc51..0000000 --- a/.history/eohi2/reliability analysis_20251028162606.r +++ /dev/null @@ -1,175 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = 
present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -total_results <- data.frame() -item_stats_results <- data.frame() -alpha_drop_results <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Prepare results for CSV export - scale_total <- alpha_out$total - scale_items <- alpha_out$item.stats - scale_alpha_drop <- alpha_out$alpha.drop - - # Add scale name to each row - scale_total$Scale <- scale_name - scale_items$Scale <- scale_name - scale_alpha_drop$Scale <- scale_name - - # Combine results by type - total_results <- rbind(total_results, scale_total) - item_stats_results <- rbind(item_stats_results, scale_items) - alpha_drop_results <- rbind(alpha_drop_results, scale_alpha_drop) - } -} - -# Create HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-")
-
-# Add total statistics
-html_content <- paste0(html_content, "
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-for(i in 1:nrow(total_results)) {
-  row <- total_results[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r))
-}
-
-html_content <- paste0(html_content, "</table>")
-
-# Add item statistics
-html_content <- paste0(html_content, "
-<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th></tr>
-")
-
-for(i in 1:nrow(item_stats_results)) {
-  row <- item_stats_results[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd))
-}
-
-html_content <- paste0(html_content, "</table>")
-
-# Add alpha if dropped
-html_content <- paste0(html_content, "
-<h2>Alpha If Item Dropped</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>Alpha SE</th><th>Var r</th><th>Med r</th></tr>
-")
-
-for(i in 1:nrow(alpha_drop_results)) {
-  row <- alpha_drop_results[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$alpha.se, row$var.r, row$med.r))
-}
-
-html_content <- paste0(html_content, "</table>
-</body>
-</html>
- -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028162608.r b/.history/eohi2/reliability analysis_20251028162608.r deleted file mode 100644 index 32fbc51..0000000 --- a/.history/eohi2/reliability analysis_20251028162608.r +++ /dev/null @@ -1,175 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = 
present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -total_results <- data.frame() -item_stats_results <- data.frame() -alpha_drop_results <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Prepare results for CSV export - scale_total <- alpha_out$total - scale_items <- alpha_out$item.stats - scale_alpha_drop <- alpha_out$alpha.drop - - # Add scale name to each row - scale_total$Scale <- scale_name - scale_items$Scale <- scale_name - scale_alpha_drop$Scale <- scale_name - - # Combine results by type - total_results <- rbind(total_results, scale_total) - item_stats_results <- rbind(item_stats_results, scale_items) - alpha_drop_results <- rbind(alpha_drop_results, scale_alpha_drop) - } -} - -# Create HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-")
-
-# Add total statistics
-html_content <- paste0(html_content, "
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-for(i in 1:nrow(total_results)) {
-  row <- total_results[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r))
-}
-
-html_content <- paste0(html_content, "</table>")
-
-# Add item statistics
-html_content <- paste0(html_content, "
-<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th></tr>
-")
-
-for(i in 1:nrow(item_stats_results)) {
-  row <- item_stats_results[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd))
-}
-
-html_content <- paste0(html_content, "</table>")
-
-# Add alpha if dropped
-html_content <- paste0(html_content, "
-<h2>Alpha If Item Dropped</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>Alpha SE</th><th>Var r</th><th>Med r</th></tr>
-")
-
-for(i in 1:nrow(alpha_drop_results)) {
-  row <- alpha_drop_results[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$alpha.se, row$var.r, row$med.r))
-}
-
-html_content <- paste0(html_content, "</table>
-</body>
-</html>
- -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028162820.r b/.history/eohi2/reliability analysis_20251028162820.r deleted file mode 100644 index 6a63d2f..0000000 --- a/.history/eohi2/reliability analysis_20251028162820.r +++ /dev/null @@ -1,134 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = 
present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - total_row <- alpha_out$total - total_row$Scale <- scale_name - summary_data <- rbind(summary_data, total_row) - } -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-for(i in 1:nrow(summary_data)) {
-  row <- summary_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r))
-}
-
-html_content <- paste0(html_content, "</table>
-</body>
-</html>
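# Note on the guard used in every snapshot's loop: select(all_of(vars)) errors
# when any listed column is missing, so the ncol(scale_data) > 1 check can
# never catch an absent item. If the intent is to skip scales whose columns
# are missing from df, tidyselect's any_of() drops unknown names silently
# (assumed intent; a minimal sketch):
scale_data <- df %>% select(any_of(vars))   # keeps only the columns that exist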
- -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028162852.r b/.history/eohi2/reliability analysis_20251028162852.r deleted file mode 100644 index 6a63d2f..0000000 --- a/.history/eohi2/reliability analysis_20251028162852.r +++ /dev/null @@ -1,134 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = 
present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - total_row <- alpha_out$total - total_row$Scale <- scale_name - summary_data <- rbind(summary_data, total_row) - } -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-for(i in 1:nrow(summary_data)) {
-  row <- summary_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r))
-}
-
-html_content <- paste0(html_content, "</table>
-</body>
-</html>
- -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028162935.r b/.history/eohi2/reliability analysis_20251028162935.r deleted file mode 100644 index f1c45d1..0000000 --- a/.history/eohi2/reliability analysis_20251028162935.r +++ /dev/null @@ -1,142 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = 
present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - total_row <- alpha_out$total - total_row$Scale <- scale_name - summary_data <- rbind(summary_data, total_row) - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-for(i in 1:nrow(summary_data)) {
-  row <- summary_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r))
-}
-
-html_content <- paste0(html_content, "</table>
-</body>
-</html>
- -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028162951.r b/.history/eohi2/reliability analysis_20251028162951.r deleted file mode 100644 index f5f7c39..0000000 --- a/.history/eohi2/reliability analysis_20251028162951.r +++ /dev/null @@ -1,155 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = 
present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - total_row <- alpha_out$total - total_row$Scale <- scale_name - summary_data <- rbind(summary_data, total_row) - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-for(i in 1:nrow(summary_data)) {
-  row <- summary_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r))
-}
-
-html_content <- paste0(html_content, "</table>")
-
-# Add item statistics with alpha if dropped
-html_content <- paste0(html_content, "
-<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
-")
-
-for(i in 1:nrow(item_data)) {
-  row <- item_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
-}
-
-html_content <- paste0(html_content, "</table>
-</body>
-</html>
- -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028162955.r b/.history/eohi2/reliability analysis_20251028162955.r deleted file mode 100644 index f5f7c39..0000000 --- a/.history/eohi2/reliability analysis_20251028162955.r +++ /dev/null @@ -1,155 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = 
present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - total_row <- alpha_out$total - total_row$Scale <- scale_name - summary_data <- rbind(summary_data, total_row) - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-for(i in 1:nrow(summary_data)) {
-  row <- summary_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r))
-}
-
-html_content <- paste0(html_content, "</table>")
-
-# Add item statistics with alpha if dropped
-html_content <- paste0(html_content, "
-<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
-")
-
-for(i in 1:nrow(item_data)) {
-  row <- item_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
-}
-
-html_content <- paste0(html_content, "</table>
-</body>
-</html>
- -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028163006.r b/.history/eohi2/reliability analysis_20251028163006.r deleted file mode 100644 index f5f7c39..0000000 --- a/.history/eohi2/reliability analysis_20251028163006.r +++ /dev/null @@ -1,155 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = 
present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - total_row <- alpha_out$total - total_row$Scale <- scale_name - summary_data <- rbind(summary_data, total_row) - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-for(i in 1:nrow(summary_data)) {
-  row <- summary_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r))
-}
-
-html_content <- paste0(html_content, "</table>")
-
-# Add item statistics with alpha if dropped
-html_content <- paste0(html_content, "
-<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
-")
-
-for(i in 1:nrow(item_data)) {
-  row <- item_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
-}
-
-html_content <- paste0(html_content, "</table>
-</body>
-</html>
- -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028163158.r b/.history/eohi2/reliability analysis_20251028163158.r deleted file mode 100644 index 51f7cf5..0000000 --- a/.history/eohi2/reliability analysis_20251028163158.r +++ /dev/null @@ -1,158 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = 
present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - total_row <- alpha_out$total - total_row$Scale <- scale_name - summary_data <- rbind(summary_data, total_row) - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- rows in summary_data:", nrow(summary_data), "\n") - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-for(i in 1:nrow(summary_data)) {
-  row <- summary_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r))
-}
-
-html_content <- paste0(html_content, "</table>")
-
-# Add item statistics with alpha if dropped
-html_content <- paste0(html_content, "
-<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
-")
-
-for(i in 1:nrow(item_data)) {
-  row <- item_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
-}
-
-html_content <- paste0(html_content, "</table>
-</body>
-</html>
- -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028163211.r b/.history/eohi2/reliability analysis_20251028163211.r deleted file mode 100644 index 9877efc..0000000 --- a/.history/eohi2/reliability analysis_20251028163211.r +++ /dev/null @@ -1,164 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = 
present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - total_row <- alpha_out$total - total_row$Scale <- scale_name - summary_data <- rbind(summary_data, total_row) - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- rows in summary_data:", nrow(summary_data), "\n") - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-for(i in 1:nrow(summary_data)) {
-  row <- summary_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r))
-}
-
-html_content <- paste0(html_content, "</table>")
-
-# Add item statistics with alpha if dropped
-html_content <- paste0(html_content, "
-<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
-")
-
-for(i in 1:nrow(item_data)) {
-  row <- item_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
-}
-
-html_content <- paste0(html_content, "</table>
-</body>
-</html>
- -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028163215.r b/.history/eohi2/reliability analysis_20251028163215.r deleted file mode 100644 index 9877efc..0000000 --- a/.history/eohi2/reliability analysis_20251028163215.r +++ /dev/null @@ -1,164 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = 
present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - total_row <- alpha_out$total - total_row$Scale <- scale_name - summary_data <- rbind(summary_data, total_row) - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- rows in summary_data:", nrow(summary_data), "\n") - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-for(i in 1:nrow(summary_data)) {
-  row <- summary_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r))
-}
-
-html_content <- paste0(html_content, "</table>")
-
-# Add item statistics with alpha if dropped
-html_content <- paste0(html_content, "
-<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
-")
-
-for(i in 1:nrow(item_data)) {
-  row <- item_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
-}
-
-html_content <- paste0(html_content, "</table>
-</body>
-</html>
- -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028163222.r b/.history/eohi2/reliability analysis_20251028163222.r deleted file mode 100644 index 9877efc..0000000 --- a/.history/eohi2/reliability analysis_20251028163222.r +++ /dev/null @@ -1,164 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = 
present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - total_row <- alpha_out$total - total_row$Scale <- scale_name - summary_data <- rbind(summary_data, total_row) - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- rows in summary_data:", nrow(summary_data), "\n") - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-for(i in 1:nrow(summary_data)) {
-  row <- summary_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r))
-}
-
-html_content <- paste0(html_content, "</table>")
-
-# Add item statistics with alpha if dropped
-html_content <- paste0(html_content, "
-<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
-")
-
-for(i in 1:nrow(item_data)) {
-  row <- item_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
-}
-
-html_content <- paste0(html_content, "</table>
-</body>
-</html>
- -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028163310.r b/.history/eohi2/reliability analysis_20251028163310.r deleted file mode 100644 index bb2754d..0000000 --- a/.history/eohi2/reliability analysis_20251028163310.r +++ /dev/null @@ -1,165 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - 
"Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - total_row <- alpha_out$total - total_row$Scale <- scale_name - summary_data <- rbind(summary_data, total_row) - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- rows in summary_data:", nrow(summary_data), "\n") - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-for(i in 1:nrow(summary_data)) {
-  row <- summary_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r))
-}
-
-html_content <- paste0(html_content, "</table>")
-
-# Add item statistics with alpha if dropped
-html_content <- paste0(html_content, "
-<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
-")
-
-for(i in 1:nrow(item_data)) {
-  row <- item_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
-}
-
-html_content <- paste0(html_content, "</table>
-</body>
-</html>
- -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028163331.r b/.history/eohi2/reliability analysis_20251028163331.r deleted file mode 100644 index 7f2abb0..0000000 --- a/.history/eohi2/reliability analysis_20251028163331.r +++ /dev/null @@ -1,186 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - 
"Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() -icc_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Calculate ICC(2,1) - tryCatch({ - icc_result <- icc(scale_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", scale_name, ":", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Scale = scale_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - icc_data <- rbind(icc_data, icc_row) - }, error = function(e) { - cat("ICC calculation failed for", scale_name, ":", e$message, "\n") - }) - - # Collect summary data for HTML - total_row <- alpha_out$total - total_row$Scale <- scale_name - summary_data <- rbind(summary_data, total_row) - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- rows in summary_data:", nrow(summary_data), "\n") - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-for(i in 1:nrow(summary_data)) {
-  row <- summary_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r))
-}
-
-html_content <- paste0(html_content, "</table>")
-
-# Add item statistics with alpha if dropped
-html_content <- paste0(html_content, "
-<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
-")
-
-for(i in 1:nrow(item_data)) {
-  row <- item_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
-}
-
-html_content <- paste0(html_content, "</table>
-</body>
-</html>
- -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028163339.r b/.history/eohi2/reliability analysis_20251028163339.r deleted file mode 100644 index ebaa792..0000000 --- a/.history/eohi2/reliability analysis_20251028163339.r +++ /dev/null @@ -1,203 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - "Fut10_Preferences" = fut_10_pref_vars, - 
"Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() -icc_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Calculate ICC(2,1) - tryCatch({ - icc_result <- icc(scale_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", scale_name, ":", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Scale = scale_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - icc_data <- rbind(icc_data, icc_row) - }, error = function(e) { - cat("ICC calculation failed for", scale_name, ":", e$message, "\n") - }) - - # Collect summary data for HTML - total_row <- alpha_out$total - total_row$Scale <- scale_name - summary_data <- rbind(summary_data, total_row) - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- rows in summary_data:", nrow(summary_data), "\n") - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-for(i in 1:nrow(summary_data)) {
-  row <- summary_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r))
-}
-
-html_content <- paste0(html_content, "</table>")
-
-# Add item statistics with alpha if dropped
-html_content <- paste0(html_content, "
-<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
-")
-
-for(i in 1:nrow(item_data)) {
-  row <- item_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
-}
-
-html_content <- paste0(html_content, "</table>")
-
-# Add ICC(2,1) results
-if(nrow(icc_data) > 0) {
-  html_content <- paste0(html_content, "
-<h2>ICC(2,1) Results</h2>
-<table>
-<tr><th>Scale</th><th>ICC(2,1)</th><th>95% CI Lower</th><th>95% CI Upper</th><th>F Value</th><th>p Value</th></tr>
-")
-
-  for(i in 1:nrow(icc_data)) {
-    row <- icc_data[i,]
-    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-      row$Scale, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, row$p_Value))
-  }
-
-  html_content <- paste0(html_content, "</table>
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028163348.r b/.history/eohi2/reliability analysis_20251028163348.r deleted file mode 100644 index ebaa792..0000000 --- a/.history/eohi2/reliability analysis_20251028163348.r +++ /dev/null @@ -1,203 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() -icc_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Calculate ICC(2,1) - tryCatch({ - icc_result <- icc(scale_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", scale_name, ":", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Scale = scale_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - icc_data <- rbind(icc_data, icc_row) - }, error = function(e) { - cat("ICC calculation failed for", scale_name, ":", e$message, "\n") - }) - - # Collect summary data for HTML - total_row <- alpha_out$total - total_row$Scale <- scale_name - summary_data <- rbind(summary_data, total_row) - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- rows in summary_data:", nrow(summary_data), "\n") - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-for(i in 1:nrow(summary_data)) {
-  row <- summary_data[i,]
-  html_content <- paste0(html_content, sprintf("
-<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r))
-}
-
-html_content <- paste0(html_content, "
-</table>
-")
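# A sketch of an alternative to the hand-built header and row loop above,
# assuming the same summary_data frame: knitr is already loaded by this
# script, and kable() emits the whole table as HTML in one call (the column
# subset and digits below are illustrative, not from the snapshot).
# html_content <- paste0(html_content,
#   knitr::kable(summary_data[, c("Scale", "raw_alpha", "std.alpha", "average_r")],
#                format = "html", digits = 4))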
") - -# Add item statistics with alpha if dropped -html_content <- paste0(html_content, "

Item Statistics

- - ") - -for(i in 1:nrow(item_data)) { - row <- item_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped)) -} - -html_content <- paste0(html_content, "
ScaleItemnRaw rStd rr.corr.dropMeanSDAlpha if Dropped
%s%s%.0f%.4f%.4f%.4f%.4f%.4f%.4f%.4f
") - -# Add ICC(2,1) results -if(nrow(icc_data) > 0) { - html_content <- paste0(html_content, "

ICC(2,1) Results

- - ") - - for(i in 1:nrow(icc_data)) { - row <- icc_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Scale, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, row$p_Value)) - } - - html_content <- paste0(html_content, "
ScaleICC(2,1)95% CI Lower95% CI UpperF Valuep Value
%s%.4f%.4f%.4f%.4f%.4f
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028163622.r b/.history/eohi2/reliability analysis_20251028163622.r deleted file mode 100644 index ebaa792..0000000 --- a/.history/eohi2/reliability analysis_20251028163622.r +++ /dev/null @@ -1,203 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() -icc_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Calculate ICC(2,1) - tryCatch({ - icc_result <- icc(scale_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", scale_name, ":", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Scale = scale_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - icc_data <- rbind(icc_data, icc_row) - }, error = function(e) { - cat("ICC calculation failed for", scale_name, ":", e$message, "\n") - }) - - # Collect summary data for HTML - total_row <- alpha_out$total - total_row$Scale <- scale_name - summary_data <- rbind(summary_data, total_row) - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- rows in summary_data:", nrow(summary_data), "\n") - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

diff --git a/.history/eohi2/reliability analysis_20251028164143.r b/.history/eohi2/reliability analysis_20251028164143.r
deleted file mode 100644
index cbf7b12..0000000
--- a/.history/eohi2/reliability analysis_20251028164143.r
+++ /dev/null
@@ -1,231 +0,0 @@
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-options(scipen = 999)
-
-df <- read.csv("eohi2.csv")
-library(psych)
-library(dplyr)
-library(knitr)
-library(irr)
-
-# Your named variable sets (replace df with your actual dataframe name)
-present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel")
-past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel")
-past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel")
-fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel")
-fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel")
-
-present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex")
-past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex")
-past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex")
-fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex")
-fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex")
-
-present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice")
-past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice")
-past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice")
-fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice")
-fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice")
-
-all_scales <- list(
-  "Present_Preferences" = present_pref_vars,
-  "Past5_Preferences" = past_5_pref_vars,
-  "Past10_Preferences" = past_10_pref_vars,
-  "Fut5_Preferences" = fut_5_pref_vars,
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - summary_data <- rbind(summary_data, total_row) - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - icc_data <- rbind(icc_data, icc_row) - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability 
Analysis Results - - - -

-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-for(i in 1:nrow(summary_data)) {
-  row <- summary_data[i,]
-  html_content <- paste0(html_content, sprintf("
-<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r))
-}
-
-html_content <- paste0(html_content, "
-</table>
-")
-
-# Add item statistics with alpha if dropped
-html_content <- paste0(html_content, "
-<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
-")
-
-for(i in 1:nrow(item_data)) {
-  row <- item_data[i,]
-  html_content <- paste0(html_content, sprintf("
-<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
-}
-
-html_content <- paste0(html_content, "
-</table>
-")
-
-# Add ICC(2,1) results
-if(nrow(icc_data) > 0) {
-  html_content <- paste0(html_content, "
-<h2>ICC(2,1) Results</h2>
-<table>
-<tr><th>Construct</th><th>ICC(2,1)</th><th>95% CI Lower</th><th>95% CI Upper</th><th>F Value</th><th>p Value</th></tr>
-")
-
-  for(i in 1:nrow(icc_data)) {
-    row <- icc_data[i,]
-    html_content <- paste0(html_content, sprintf("
-<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-      row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, row$p_Value))
-  }
-
-  html_content <- paste0(html_content, "
-</table>
-")
-}
-
-html_content <- paste0(html_content, "
-</body>
-</html>
-")
-
-# Write HTML file
-writeLines(html_content, "reliability_analysis_report.html")
-
-
-# Check for reversed items
-cat("\n=== REVERSED ITEMS CHECK ===\n")
-for(scale_name in names(all_scales)) {
-  vars <- all_scales[[scale_name]]
-  scale_data <- df %>% select(all_of(vars))
-
-  if(ncol(scale_data) > 1) {
-    alpha_out <- alpha_results[[scale_name]]
-
-    # Check if any items were reversed by looking at the keys
-    if(!is.null(alpha_out$keys)) {
-      keys_df <- as.data.frame(alpha_out$keys)
-      reversed_items <- rownames(keys_df)[keys_df[,1] < 0]
-      if(length(reversed_items) > 0) {
-        cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n")
-      } else {
-        cat(scale_name, ": No items reversed\n")
-      }
-    } else {
-      cat(scale_name, ": No keys available\n")
-    }
-  }
-}
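These snapshots drift between Scale and Construct as the icc_data column name, and in base R row$Scale on a frame that only has Construct silently returns NULL; sprintf() then yields a zero-length result and paste0() recycles it away, so the ICC rows quietly never reach the report. A cheap guard, assuming the column names used above, fails loudly at the source instead:

# Sketch: verify the columns the report loop is about to reference
needed <- c("Construct", "ICC_2_1", "ICC_CI_Lower", "ICC_CI_Upper", "F_Value", "p_Value")
stopifnot(all(needed %in% colnames(icc_data)))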
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028164153.r b/.history/eohi2/reliability analysis_20251028164153.r deleted file mode 100644 index 2e1611f..0000000 --- a/.history/eohi2/reliability analysis_20251028164153.r +++ /dev/null @@ -1,231 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - summary_data <- rbind(summary_data, total_row) - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - icc_data <- rbind(icc_data, icc_row) - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability 
Analysis Results - - - -

-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-for(i in 1:nrow(summary_data)) {
-  row <- summary_data[i,]
-  html_content <- paste0(html_content, sprintf("
-<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r))
-}
-
-html_content <- paste0(html_content, "
-</table>
-")
-
-# Add item statistics with alpha if dropped
-html_content <- paste0(html_content, "
-<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
-")
-
-for(i in 1:nrow(item_data)) {
-  row <- item_data[i,]
-  html_content <- paste0(html_content, sprintf("
-<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
-}
-
-html_content <- paste0(html_content, "
-</table>
-")
-
-# Add ICC(2,1) results across time points
-if(nrow(icc_data) > 0) {
-  html_content <- paste0(html_content, "
-<h2>ICC(2,1) Results (Temporal Stability)</h2>
-<table>
-<tr><th>Construct</th><th>ICC(2,1)</th><th>95% CI Lower</th><th>95% CI Upper</th><th>F Value</th><th>p Value</th></tr>
-")
-
-  for(i in 1:nrow(icc_data)) {
-    row <- icc_data[i,]
-    html_content <- paste0(html_content, sprintf("
-<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-      row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, row$p_Value))
  }
-
-  html_content <- paste0(html_content, "
-</table>
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028164158.r b/.history/eohi2/reliability analysis_20251028164158.r deleted file mode 100644 index 2e1611f..0000000 --- a/.history/eohi2/reliability analysis_20251028164158.r +++ /dev/null @@ -1,231 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - summary_data <- rbind(summary_data, total_row) - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - icc_data <- rbind(icc_data, icc_row) - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability 
Analysis Results - - - -
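The construct-level ICC above assembles a respondent-by-time-point matrix with an explicit cbind() loop; the same assembly is a one-liner with sapply(), which returns a matrix irr::icc() accepts directly. A sketch assuming the df and all_scales objects defined in these scripts (the scale-name subset is illustrative):

pref_scales <- c("Present_Preferences", "Past5_Preferences", "Past10_Preferences",
                 "Fut5_Preferences", "Fut10_Preferences")
# One column of mean scores per time point; rows are respondents
construct_data <- sapply(pref_scales, function(s)
  rowMeans(df[, all_scales[[s]], drop = FALSE], na.rm = TRUE))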

diff --git a/.history/eohi2/reliability analysis_20251028164342.r b/.history/eohi2/reliability analysis_20251028164342.r
deleted file mode 100644
index bcad1c5..0000000
--- a/.history/eohi2/reliability analysis_20251028164342.r
+++ /dev/null
@@ -1,235 +0,0 @@
-setwd("C:/Users/irina/Documents/DND/EOHI/eohi2")
-
-options(scipen = 999)
-
-df <- read.csv("eohi2.csv")
-library(psych)
-library(dplyr)
-library(knitr)
-library(irr)
-
-# Your named variable sets (replace df with your actual dataframe name)
-present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel")
-past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel")
-past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel")
-fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel")
-fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel")
-
-present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex")
-past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex")
-past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex")
-fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex")
-fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex")
-
-present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice")
-past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice")
-past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice")
-fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice")
-fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice")
-
-all_scales <- list(
-  "Present_Preferences" = present_pref_vars,
-  "Past5_Preferences" = past_5_pref_vars,
-  "Past10_Preferences" = past_10_pref_vars,
-  "Fut5_Preferences" = fut_5_pref_vars,
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - icc_data <- rbind(icc_data, icc_row) - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n") -} - -# 
Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-for(i in 1:nrow(summary_data)) {
-  row <- summary_data[i,]
-  html_content <- paste0(html_content, sprintf("
-<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r))
-}
-
-html_content <- paste0(html_content, "
-</table>
-")
-
-# Add item statistics with alpha if dropped
-html_content <- paste0(html_content, "
-<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
-")
-
-for(i in 1:nrow(item_data)) {
-  row <- item_data[i,]
-  html_content <- paste0(html_content, sprintf("
-<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
-}
-
-html_content <- paste0(html_content, "
-</table>
-")
-
-# Add ICC(2,1) results across time points
-if(nrow(icc_data) > 0) {
-  html_content <- paste0(html_content, "
-<h2>ICC(2,1) Results (Temporal Stability)</h2>
-<table>
-<tr><th>Construct</th><th>ICC(2,1)</th><th>95% CI Lower</th><th>95% CI Upper</th><th>F Value</th><th>p Value</th></tr>
-")
-
-  for(i in 1:nrow(icc_data)) {
-    row <- icc_data[i,]
-    html_content <- paste0(html_content, sprintf("
-<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-      row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, row$p_Value))
-  }
-
-  html_content <- paste0(html_content, "
-</table>
-")
-}
-
-html_content <- paste0(html_content, "
-</body>
-</html>
-")
-
-# Write HTML file
-writeLines(html_content, "reliability_analysis_report.html")
-
-
-# Check for reversed items
-cat("\n=== REVERSED ITEMS CHECK ===\n")
-for(scale_name in names(all_scales)) {
-  vars <- all_scales[[scale_name]]
-  scale_data <- df %>% select(all_of(vars))
-
-  if(ncol(scale_data) > 1) {
-    alpha_out <- alpha_results[[scale_name]]
-
-    # Check if any items were reversed by looking at the keys
-    if(!is.null(alpha_out$keys)) {
-      keys_df <- as.data.frame(alpha_out$keys)
-      reversed_items <- rownames(keys_df)[keys_df[,1] < 0]
-      if(length(reversed_items) > 0) {
-        cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n")
-      } else {
-        cat(scale_name, ": No items reversed\n")
-      }
-    } else {
-      cat(scale_name, ": No keys available\n")
-    }
-  }
-}
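The last two snapshots wrap each rbind() in an if(nrow(...) == 0) guard; collecting the per-scale rows in a list and binding once at the end removes that special case and the repeated copying. A sketch under the same names the script uses:

summary_rows <- lapply(names(all_scales), function(scale_name) {
  scale_data <- df[, all_scales[[scale_name]], drop = FALSE]
  if (ncol(scale_data) < 2) return(NULL)   # same more-than-one-column rule as above
  total_row <- as.data.frame(psych::alpha(scale_data, check.keys = TRUE)$total)
  total_row$Scale <- scale_name
  total_row
})
summary_data <- do.call(rbind, summary_rows)  # NULL entries are dropped automatically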
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028164345.r b/.history/eohi2/reliability analysis_20251028164345.r deleted file mode 100644 index 4f834cc..0000000 --- a/.history/eohi2/reliability analysis_20251028164345.r +++ /dev/null @@ -1,239 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - if(nrow(item_data) == 0) { - item_data <- item_row - } else { - item_data <- rbind(item_data, item_row) - } - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - icc_data <- rbind(icc_data, icc_row) - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column 
names:", paste(colnames(summary_data), collapse = ", "), "\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

Reliability Analysis Results

-

Generated on: ", Sys.time(), "

-

Overall Scale Statistics

-
- - ") - -for(i in 1:nrow(summary_data)) { - row <- summary_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r)) -} - -html_content <- paste0(html_content, "
ScaleRaw AlphaStd AlphaG6(SMC)Average rS/NASEMeanSDMedian r
%s%.4f%.4f%.4f%.4f%.4f%.4f%.4f%.4f%.4f
") - -# Add item statistics with alpha if dropped -html_content <- paste0(html_content, "

Item Statistics

- - ") - -for(i in 1:nrow(item_data)) { - row <- item_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped)) -} - -html_content <- paste0(html_content, "
ScaleItemnRaw rStd rr.corr.dropMeanSDAlpha if Dropped
%s%s%.0f%.4f%.4f%.4f%.4f%.4f%.4f%.4f
") - -# Add ICC(2,1) results across time points -if(nrow(icc_data) > 0) { - html_content <- paste0(html_content, "

ICC(2,1) Results (Temporal Stability)

- - ") - - for(i in 1:nrow(icc_data)) { - row <- icc_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, row$p_Value)) - } - - html_content <- paste0(html_content, "
ConstructICC(2,1)95% CI Lower95% CI UpperF Valuep Value
%s%.4f%.4f%.4f%.4f%.4f
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028164350.r b/.history/eohi2/reliability analysis_20251028164350.r deleted file mode 100644 index 4f834cc..0000000 --- a/.history/eohi2/reliability analysis_20251028164350.r +++ /dev/null @@ -1,239 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - if(nrow(item_data) == 0) { - item_data <- item_row - } else { - item_data <- rbind(item_data, item_row) - } - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - icc_data <- rbind(icc_data, icc_row) - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column 
names:", paste(colnames(summary_data), collapse = ", "), "\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

Reliability Analysis Results

-

Generated on: ", Sys.time(), "

-

Overall Scale Statistics

-
- - ") - -for(i in 1:nrow(summary_data)) { - row <- summary_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r)) -} - -html_content <- paste0(html_content, "
ScaleRaw AlphaStd AlphaG6(SMC)Average rS/NASEMeanSDMedian r
%s%.4f%.4f%.4f%.4f%.4f%.4f%.4f%.4f%.4f
") - -# Add item statistics with alpha if dropped -html_content <- paste0(html_content, "

Item Statistics

- - ") - -for(i in 1:nrow(item_data)) { - row <- item_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped)) -} - -html_content <- paste0(html_content, "
ScaleItemnRaw rStd rr.corr.dropMeanSDAlpha if Dropped
%s%s%.0f%.4f%.4f%.4f%.4f%.4f%.4f%.4f
") - -# Add ICC(2,1) results across time points -if(nrow(icc_data) > 0) { - html_content <- paste0(html_content, "

ICC(2,1) Results (Temporal Stability)

- - ") - - for(i in 1:nrow(icc_data)) { - row <- icc_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, row$p_Value)) - } - - html_content <- paste0(html_content, "
ConstructICC(2,1)95% CI Lower95% CI UpperF Valuep Value
%s%.4f%.4f%.4f%.4f%.4f
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028164357.r b/.history/eohi2/reliability analysis_20251028164357.r deleted file mode 100644 index 4f834cc..0000000 --- a/.history/eohi2/reliability analysis_20251028164357.r +++ /dev/null @@ -1,239 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - if(nrow(item_data) == 0) { - item_data <- item_row - } else { - item_data <- rbind(item_data, item_row) - } - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - icc_data <- rbind(icc_data, icc_row) - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column 
names:", paste(colnames(summary_data), collapse = ", "), "\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

Reliability Analysis Results

-

Generated on: ", Sys.time(), "

-

Overall Scale Statistics

-
- - ") - -for(i in 1:nrow(summary_data)) { - row <- summary_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r)) -} - -html_content <- paste0(html_content, "
ScaleRaw AlphaStd AlphaG6(SMC)Average rS/NASEMeanSDMedian r
%s%.4f%.4f%.4f%.4f%.4f%.4f%.4f%.4f%.4f
") - -# Add item statistics with alpha if dropped -html_content <- paste0(html_content, "

Item Statistics

- - ") - -for(i in 1:nrow(item_data)) { - row <- item_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped)) -} - -html_content <- paste0(html_content, "
ScaleItemnRaw rStd rr.corr.dropMeanSDAlpha if Dropped
%s%s%.0f%.4f%.4f%.4f%.4f%.4f%.4f%.4f
") - -# Add ICC(2,1) results across time points -if(nrow(icc_data) > 0) { - html_content <- paste0(html_content, "

ICC(2,1) Results (Temporal Stability)

- - ") - - for(i in 1:nrow(icc_data)) { - row <- icc_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, row$p_Value)) - } - - html_content <- paste0(html_content, "
ConstructICC(2,1)95% CI Lower95% CI UpperF Valuep Value
%s%.4f%.4f%.4f%.4f%.4f
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028164422.r b/.history/eohi2/reliability analysis_20251028164422.r deleted file mode 100644 index 0e6545e..0000000 --- a/.history/eohi2/reliability analysis_20251028164422.r +++ /dev/null @@ -1,243 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - if(nrow(item_data) == 0) { - item_data <- item_row - } else { - item_data <- rbind(item_data, item_row) - } - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - if(nrow(icc_data) == 0) { - icc_data <- icc_row - } else { - icc_data <- rbind(icc_data, icc_row) - } - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# Debug: check summary_data -cat("Final summary_data has", 
nrow(summary_data), "rows\n")
-if(nrow(summary_data) > 0) {
-  cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n")
-}
-
-# Create simple HTML report
-html_content <- paste0("
-<html>
-<head>
-<title>Reliability Analysis Results</title>
-</head>
-<body>
-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-for(i in 1:nrow(summary_data)) {
-  row <- summary_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r))
-}
-
-html_content <- paste0(html_content, "</table>")
-
-# Add item statistics with alpha if dropped
-html_content <- paste0(html_content, "
-<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
-")
-
-for(i in 1:nrow(item_data)) {
-  row <- item_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
-}
-
-html_content <- paste0(html_content, "</table>")
-
-# Add ICC(2,1) results across time points
-if(nrow(icc_data) > 0) {
-  html_content <- paste0(html_content, "
-<h2>ICC(2,1) Results (Temporal Stability)</h2>
-<table>
-<tr><th>Construct</th><th>ICC(2,1)</th><th>95% CI Lower</th><th>95% CI Upper</th><th>F Value</th><th>p Value</th></tr>
-")
-
-  for(i in 1:nrow(icc_data)) {
-    row <- icc_data[i,]
-    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-      row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, row$p_Value))
-  }
-
-  html_content <- paste0(html_content, "
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028164426.r b/.history/eohi2/reliability analysis_20251028164426.r deleted file mode 100644 index 0e6545e..0000000 --- a/.history/eohi2/reliability analysis_20251028164426.r +++ /dev/null @@ -1,243 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - if(nrow(item_data) == 0) { - item_data <- item_row - } else { - item_data <- rbind(item_data, item_row) - } - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - if(nrow(icc_data) == 0) { - icc_data <- icc_row - } else { - icc_data <- rbind(icc_data, icc_row) - } - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# Debug: check summary_data -cat("Final summary_data has", 
nrow(summary_data), "rows\n")
-if(nrow(summary_data) > 0) {
-  cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n")
-}
-
-# Create simple HTML report
-html_content <- paste0("
-<html>
-<head>
-<title>Reliability Analysis Results</title>
-</head>
-<body>
-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-for(i in 1:nrow(summary_data)) {
-  row <- summary_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r))
-}
-
-html_content <- paste0(html_content, "</table>")
-
-# Add item statistics with alpha if dropped
-html_content <- paste0(html_content, "
-<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
-")
-
-for(i in 1:nrow(item_data)) {
-  row <- item_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
-}
-
-html_content <- paste0(html_content, "</table>")
-
-# Add ICC(2,1) results across time points
-if(nrow(icc_data) > 0) {
-  html_content <- paste0(html_content, "
-<h2>ICC(2,1) Results (Temporal Stability)</h2>
-<table>
-<tr><th>Construct</th><th>ICC(2,1)</th><th>95% CI Lower</th><th>95% CI Upper</th><th>F Value</th><th>p Value</th></tr>
-")
-
-  for(i in 1:nrow(icc_data)) {
-    row <- icc_data[i,]
-    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-      row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, row$p_Value))
-  }
-
-  html_content <- paste0(html_content, "
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028164514.r b/.history/eohi2/reliability analysis_20251028164514.r deleted file mode 100644 index 0e6545e..0000000 --- a/.history/eohi2/reliability analysis_20251028164514.r +++ /dev/null @@ -1,243 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - if(nrow(item_data) == 0) { - item_data <- item_row - } else { - item_data <- rbind(item_data, item_row) - } - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - if(nrow(icc_data) == 0) { - icc_data <- icc_row - } else { - icc_data <- rbind(icc_data, icc_row) - } - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# Debug: check summary_data -cat("Final summary_data has", 
nrow(summary_data), "rows\n")
-if(nrow(summary_data) > 0) {
-  cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n")
-}
-
-# Create simple HTML report
-html_content <- paste0("
-<html>
-<head>
-<title>Reliability Analysis Results</title>
-</head>
-<body>
-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-for(i in 1:nrow(summary_data)) {
-  row <- summary_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r))
-}
-
-html_content <- paste0(html_content, "</table>")
-
-# Add item statistics with alpha if dropped
-html_content <- paste0(html_content, "
-<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
-")
-
-for(i in 1:nrow(item_data)) {
-  row <- item_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
-}
-
-html_content <- paste0(html_content, "</table>")
-
-# Add ICC(2,1) results across time points
-if(nrow(icc_data) > 0) {
-  html_content <- paste0(html_content, "
-<h2>ICC(2,1) Results (Temporal Stability)</h2>
-<table>
-<tr><th>Construct</th><th>ICC(2,1)</th><th>95% CI Lower</th><th>95% CI Upper</th><th>F Value</th><th>p Value</th></tr>
-")
-
-  for(i in 1:nrow(icc_data)) {
-    row <- icc_data[i,]
-    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-      row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, row$p_Value))
-  }
-
-  html_content <- paste0(html_content, "
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028164546.r b/.history/eohi2/reliability analysis_20251028164546.r deleted file mode 100644 index d8d4d29..0000000 --- a/.history/eohi2/reliability analysis_20251028164546.r +++ /dev/null @@ -1,239 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - if(nrow(icc_data) == 0) { - icc_data <- icc_row - } else { - icc_data <- rbind(icc_data, icc_row) - } - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column 
names:", paste(colnames(summary_data), collapse = ", "), "\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

Reliability Analysis Results

-

Generated on: ", Sys.time(), "

-

Overall Scale Statistics

-
- - ") - -for(i in 1:nrow(summary_data)) { - row <- summary_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r)) -} - -html_content <- paste0(html_content, "
ScaleRaw AlphaStd AlphaG6(SMC)Average rS/NASEMeanSDMedian r
%s%.4f%.4f%.4f%.4f%.4f%.4f%.4f%.4f%.4f
") - -# Add item statistics with alpha if dropped -html_content <- paste0(html_content, "

Item Statistics

- - ") - -for(i in 1:nrow(item_data)) { - row <- item_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped)) -} - -html_content <- paste0(html_content, "
ScaleItemnRaw rStd rr.corr.dropMeanSDAlpha if Dropped
%s%s%.0f%.4f%.4f%.4f%.4f%.4f%.4f%.4f
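-
-# Aside on the label: irr::icc() with model = "twoway", type = "consistency",
-# unit = "single" returns ICC(C,1), which corresponds to Shrout-Fleiss
-# ICC(3,1), not ICC(2,1). A sketch of the absolute-agreement form the heading
-# names would be:
-#   icc(construct_data, model = "twoway", type = "agreement", unit = "single")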
") - -# Add ICC(2,1) results across time points -if(nrow(icc_data) > 0) { - html_content <- paste0(html_content, "

ICC(2,1) Results (Temporal Stability)

- - ") - - for(i in 1:nrow(icc_data)) { - row <- icc_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, row$p_Value)) - } - - html_content <- paste0(html_content, "
ConstructICC(2,1)95% CI Lower95% CI UpperF Valuep Value
%s%.4f%.4f%.4f%.4f%.4f
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028164551.r b/.history/eohi2/reliability analysis_20251028164551.r deleted file mode 100644 index d8d4d29..0000000 --- a/.history/eohi2/reliability analysis_20251028164551.r +++ /dev/null @@ -1,239 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - if(nrow(icc_data) == 0) { - icc_data <- icc_row - } else { - icc_data <- rbind(icc_data, icc_row) - } - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column 
names:", paste(colnames(summary_data), collapse = ", "), "\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

Reliability Analysis Results

-

Generated on: ", Sys.time(), "

-

Overall Scale Statistics

-
- - ") - -for(i in 1:nrow(summary_data)) { - row <- summary_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r)) -} - -html_content <- paste0(html_content, "
ScaleRaw AlphaStd AlphaG6(SMC)Average rS/NASEMeanSDMedian r
%s%.4f%.4f%.4f%.4f%.4f%.4f%.4f%.4f%.4f
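-
-# Aside: because psych::alpha() is called with check.keys = TRUE, items that
-# correlate negatively with the total are reverse-scored automatically (with a
-# warning); the "REVERSED ITEMS CHECK" at the end of this script reads
-# alpha_out$keys to report which ones. A sketch without auto-reversal:
-#   psych::alpha(scale_data, check.keys = FALSE)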
") - -# Add item statistics with alpha if dropped -html_content <- paste0(html_content, "

Item Statistics

- - ") - -for(i in 1:nrow(item_data)) { - row <- item_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped)) -} - -html_content <- paste0(html_content, "
ScaleItemnRaw rStd rr.corr.dropMeanSDAlpha if Dropped
%s%s%.0f%.4f%.4f%.4f%.4f%.4f%.4f%.4f
") - -# Add ICC(2,1) results across time points -if(nrow(icc_data) > 0) { - html_content <- paste0(html_content, "

ICC(2,1) Results (Temporal Stability)

- - ") - - for(i in 1:nrow(icc_data)) { - row <- icc_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, row$p_Value)) - } - - html_content <- paste0(html_content, "
ConstructICC(2,1)95% CI Lower95% CI UpperF Valuep Value
%s%.4f%.4f%.4f%.4f%.4f
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028164558.r b/.history/eohi2/reliability analysis_20251028164558.r deleted file mode 100644 index d8d4d29..0000000 --- a/.history/eohi2/reliability analysis_20251028164558.r +++ /dev/null @@ -1,239 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - if(nrow(icc_data) == 0) { - icc_data <- icc_row - } else { - icc_data <- rbind(icc_data, icc_row) - } - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column 
names:", paste(colnames(summary_data), collapse = ", "), "\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

Reliability Analysis Results

-

Generated on: ", Sys.time(), "

-

Overall Scale Statistics

-
- - ") - -for(i in 1:nrow(summary_data)) { - row <- summary_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r)) -} - -html_content <- paste0(html_content, "
ScaleRaw AlphaStd AlphaG6(SMC)Average rS/NASEMeanSDMedian r
%s%.4f%.4f%.4f%.4f%.4f%.4f%.4f%.4f%.4f
") - -# Add item statistics with alpha if dropped -html_content <- paste0(html_content, "

Item Statistics

- - ") - -for(i in 1:nrow(item_data)) { - row <- item_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped)) -} - -html_content <- paste0(html_content, "
ScaleItemnRaw rStd rr.corr.dropMeanSDAlpha if Dropped
%s%s%.0f%.4f%.4f%.4f%.4f%.4f%.4f%.4f
") - -# Add ICC(2,1) results across time points -if(nrow(icc_data) > 0) { - html_content <- paste0(html_content, "

ICC(2,1) Results (Temporal Stability)

- - ") - - for(i in 1:nrow(icc_data)) { - row <- icc_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, row$p_Value)) - } - - html_content <- paste0(html_content, "
ConstructICC(2,1)95% CI Lower95% CI UpperF Valuep Value
%s%.4f%.4f%.4f%.4f%.4f
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028164641.r b/.history/eohi2/reliability analysis_20251028164641.r deleted file mode 100644 index 8f85e3d..0000000 --- a/.history/eohi2/reliability analysis_20251028164641.r +++ /dev/null @@ -1,244 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - if(ncol(construct_data) == 0) { - construct_data <- data.frame(scale_mean) - colnames(construct_data)[1] <- scale_name - } else { - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - if(nrow(icc_data) == 0) { - icc_data <- icc_row - } else { - icc_data <- rbind(icc_data, icc_row) - } - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# 
Debug: check summary_data
-cat("Final summary_data has", nrow(summary_data), "rows\n")
-if(nrow(summary_data) > 0) {
-  cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n")
-}
-
-# Create simple HTML report
-html_content <- paste0("
-<html>
-<head>
-<title>Reliability Analysis Results</title>
-</head>
-<body>
-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-for(i in 1:nrow(summary_data)) {
-  row <- summary_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r))
-}
-
-html_content <- paste0(html_content, "</table>")
-
-# Add item statistics with alpha if dropped
-html_content <- paste0(html_content, "
-<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
-")
-
-for(i in 1:nrow(item_data)) {
-  row <- item_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
-}
-
-html_content <- paste0(html_content, "</table>")
-
-# Add ICC(2,1) results across time points
-if(nrow(icc_data) > 0) {
-  html_content <- paste0(html_content, "
-<h2>ICC(2,1) Results (Temporal Stability)</h2>
-<table>
-<tr><th>Construct</th><th>ICC(2,1)</th><th>95% CI Lower</th><th>95% CI Upper</th><th>F Value</th><th>p Value</th></tr>
-")
-
-  for(i in 1:nrow(icc_data)) {
-    row <- icc_data[i,]
-    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-      row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, row$p_Value))
-  }
-
-  html_content <- paste0(html_content, "</table>
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028164650.r b/.history/eohi2/reliability analysis_20251028164650.r deleted file mode 100644 index 8f85e3d..0000000 --- a/.history/eohi2/reliability analysis_20251028164650.r +++ /dev/null @@ -1,244 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - if(ncol(construct_data) == 0) { - construct_data <- data.frame(scale_mean) - colnames(construct_data)[1] <- scale_name - } else { - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - if(nrow(icc_data) == 0) { - icc_data <- icc_row - } else { - icc_data <- rbind(icc_data, icc_row) - } - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# 
Debug: check summary_data
-cat("Final summary_data has", nrow(summary_data), "rows\n")
-if(nrow(summary_data) > 0) {
-  cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n")
-}
-
-# Create simple HTML report
-html_content <- paste0("
-<html>
-<head>
-<title>Reliability Analysis Results</title>
-</head>
-<body>
-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-for(i in 1:nrow(summary_data)) {
-  row <- summary_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r))
-}
-
-html_content <- paste0(html_content, "</table>")
-
-# Add item statistics with alpha if dropped
-html_content <- paste0(html_content, "
-<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
-")
-
-for(i in 1:nrow(item_data)) {
-  row <- item_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
-}
-
-html_content <- paste0(html_content, "</table>")
-
-# Add ICC(2,1) results across time points
-if(nrow(icc_data) > 0) {
-  html_content <- paste0(html_content, "
-<h2>ICC(2,1) Results (Temporal Stability)</h2>
-<table>
-<tr><th>Construct</th><th>ICC(2,1)</th><th>95% CI Lower</th><th>95% CI Upper</th><th>F Value</th><th>p Value</th></tr>
-")
-
-  for(i in 1:nrow(icc_data)) {
-    row <- icc_data[i,]
-    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-      row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, row$p_Value))
-  }
-
-  html_content <- paste0(html_content, "</table>
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028164657.r b/.history/eohi2/reliability analysis_20251028164657.r deleted file mode 100644 index 8f85e3d..0000000 --- a/.history/eohi2/reliability analysis_20251028164657.r +++ /dev/null @@ -1,244 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - if(ncol(construct_data) == 0) { - construct_data <- data.frame(scale_mean) - colnames(construct_data)[1] <- scale_name - } else { - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - if(nrow(icc_data) == 0) { - icc_data <- icc_row - } else { - icc_data <- rbind(icc_data, icc_row) - } - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# 
Debug: check summary_data
-cat("Final summary_data has", nrow(summary_data), "rows\n")
-if(nrow(summary_data) > 0) {
-  cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n")
-}
-
-# Create simple HTML report
-html_content <- paste0("
-<html>
-<head>
-<title>Reliability Analysis Results</title>
-</head>
-<body>
-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-for(i in 1:nrow(summary_data)) {
-  row <- summary_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r))
-}
-
-html_content <- paste0(html_content, "</table>")
-
-# Add item statistics with alpha if dropped
-html_content <- paste0(html_content, "
-<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
-")
-
-for(i in 1:nrow(item_data)) {
-  row <- item_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
-}
-
-html_content <- paste0(html_content, "</table>")
-
-# Add ICC(2,1) results across time points
-if(nrow(icc_data) > 0) {
-  html_content <- paste0(html_content, "
-<h2>ICC(2,1) Results (Temporal Stability)</h2>
-<table>
-<tr><th>Construct</th><th>ICC(2,1)</th><th>95% CI Lower</th><th>95% CI Upper</th><th>F Value</th><th>p Value</th></tr>
-")
-
-  for(i in 1:nrow(icc_data)) {
-    row <- icc_data[i,]
-    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-      row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, row$p_Value))
-  }
-
-  html_content <- paste0(html_content, "</table>
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028164813.r b/.history/eohi2/reliability analysis_20251028164813.r deleted file mode 100644 index 1eec1a6..0000000 --- a/.history/eohi2/reliability analysis_20251028164813.r +++ /dev/null @@ -1,248 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- columns:", paste(colnames(total_row), collapse = ", "), "\n") - - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - if(ncol(construct_data) == 0) { - construct_data <- data.frame(scale_mean) - colnames(construct_data)[1] <- scale_name - } else { - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - if(nrow(icc_data) == 0) { - icc_data <- icc_row - } else { - icc_data 
<- rbind(icc_data, icc_row)
-      }
-    }, error = function(e) {
-      cat("ICC calculation failed for", construct_name, ":", e$message, "\n")
-    })
-  }
-}
-
-# Debug: check summary_data
-cat("Final summary_data has", nrow(summary_data), "rows\n")
-if(nrow(summary_data) > 0) {
-  cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n")
-}
-
-# Create simple HTML report
-html_content <- paste0("
-<html>
-<head>
-<title>Reliability Analysis Results</title>
-</head>
-<body>
-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-for(i in 1:nrow(summary_data)) {
-  row <- summary_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r))
-}
-
-html_content <- paste0(html_content, "</table>")
-
-# Add item statistics with alpha if dropped
-html_content <- paste0(html_content, "
-<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
-")
-
-for(i in 1:nrow(item_data)) {
-  row <- item_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
-}
-
-html_content <- paste0(html_content, "</table>")
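-
-# Aside: the ICC table below prints p values through %.4f, so anything under
-# .00005 renders as 0.0000. A hedged alternative that keeps small p readable:
-#   sprintf("%.3g", row$p_Value)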
") - -# Add ICC(2,1) results across time points -if(nrow(icc_data) > 0) { - html_content <- paste0(html_content, "

ICC(2,1) Results (Temporal Stability)

- - ") - - for(i in 1:nrow(icc_data)) { - row <- icc_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, row$p_Value)) - } - - html_content <- paste0(html_content, "
ConstructICC(2,1)95% CI Lower95% CI UpperF Valuep Value
%s%.4f%.4f%.4f%.4f%.4f
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028164819.r b/.history/eohi2/reliability analysis_20251028164819.r deleted file mode 100644 index 8c25f77..0000000 --- a/.history/eohi2/reliability analysis_20251028164819.r +++ /dev/null @@ -1,253 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- columns:", paste(colnames(total_row), collapse = ", "), "\n") - - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - if(ncol(construct_data) == 0) { - construct_data <- data.frame(scale_mean) - colnames(construct_data)[1] <- scale_name - } else { - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - if(nrow(icc_data) == 0) { - icc_data <- icc_row - } else { - icc_data 
<- rbind(icc_data, icc_row) - } - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

Reliability Analysis Results

-

Generated on: ", Sys.time(), "

-

Overall Scale Statistics

-
- - ") - -for(i in 1:nrow(summary_data)) { - row <- summary_data[i,] - - # Debug: print what we're trying to access - cat("Row", i, "- trying to access columns:", paste(colnames(row), collapse = ", "), "\n") - cat("Raw alpha value:", row$raw_alpha, "\n") - - html_content <- paste0(html_content, sprintf("", - row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r)) -} - -html_content <- paste0(html_content, "
ScaleRaw AlphaStd AlphaG6(SMC)Average rS/NASEMeanSDMedian r
%s%.4f%.4f%.4f%.4f%.4f%.4f%.4f%.4f%.4f
") - -# Add item statistics with alpha if dropped -html_content <- paste0(html_content, "

Item Statistics

- - ") - -for(i in 1:nrow(item_data)) { - row <- item_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped)) -} - -html_content <- paste0(html_content, "
ScaleItemnRaw rStd rr.corr.dropMeanSDAlpha if Dropped
%s%s%.0f%.4f%.4f%.4f%.4f%.4f%.4f%.4f
") - -# Add ICC(2,1) results across time points -if(nrow(icc_data) > 0) { - html_content <- paste0(html_content, "

ICC(2,1) Results (Temporal Stability)

- - ") - - for(i in 1:nrow(icc_data)) { - row <- icc_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, row$p_Value)) - } - - html_content <- paste0(html_content, "
ConstructICC(2,1)95% CI Lower95% CI UpperF Valuep Value
%s%.4f%.4f%.4f%.4f%.4f
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028164821.r b/.history/eohi2/reliability analysis_20251028164821.r deleted file mode 100644 index 8c25f77..0000000 --- a/.history/eohi2/reliability analysis_20251028164821.r +++ /dev/null @@ -1,253 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- columns:", paste(colnames(total_row), collapse = ", "), "\n") - - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - if(ncol(construct_data) == 0) { - construct_data <- data.frame(scale_mean) - colnames(construct_data)[1] <- scale_name - } else { - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - if(nrow(icc_data) == 0) { - icc_data <- icc_row - } else { - icc_data 
<- rbind(icc_data, icc_row) - } - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

Reliability Analysis Results

-

Generated on: ", Sys.time(), "

-

Overall Scale Statistics

-
- - ") - -for(i in 1:nrow(summary_data)) { - row <- summary_data[i,] - - # Debug: print what we're trying to access - cat("Row", i, "- trying to access columns:", paste(colnames(row), collapse = ", "), "\n") - cat("Raw alpha value:", row$raw_alpha, "\n") - - html_content <- paste0(html_content, sprintf("", - row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r)) -} - -html_content <- paste0(html_content, "
ScaleRaw AlphaStd AlphaG6(SMC)Average rS/NASEMeanSDMedian r
%s%.4f%.4f%.4f%.4f%.4f%.4f%.4f%.4f%.4f
") - -# Add item statistics with alpha if dropped -html_content <- paste0(html_content, "

Item Statistics

- - ") - -for(i in 1:nrow(item_data)) { - row <- item_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped)) -} - -html_content <- paste0(html_content, "
ScaleItemnRaw rStd rr.corr.dropMeanSDAlpha if Dropped
%s%s%.0f%.4f%.4f%.4f%.4f%.4f%.4f%.4f
") - -# Add ICC(2,1) results across time points -if(nrow(icc_data) > 0) { - html_content <- paste0(html_content, "

ICC(2,1) Results (Temporal Stability)

- - ") - - for(i in 1:nrow(icc_data)) { - row <- icc_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, row$p_Value)) - } - - html_content <- paste0(html_content, "
ConstructICC(2,1)95% CI Lower95% CI UpperF Valuep Value
%s%.4f%.4f%.4f%.4f%.4f
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028165006.r b/.history/eohi2/reliability analysis_20251028165006.r deleted file mode 100644 index 8c25f77..0000000 --- a/.history/eohi2/reliability analysis_20251028165006.r +++ /dev/null @@ -1,253 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- columns:", paste(colnames(total_row), collapse = ", "), "\n") - - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - if(ncol(construct_data) == 0) { - construct_data <- data.frame(scale_mean) - colnames(construct_data)[1] <- scale_name - } else { - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - if(nrow(icc_data) == 0) { - icc_data <- icc_row - } else { - icc_data 
<- rbind(icc_data, icc_row) - } - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

Reliability Analysis Results

-

Generated on: ", Sys.time(), "

-

Overall Scale Statistics

-
- - ") - -for(i in 1:nrow(summary_data)) { - row <- summary_data[i,] - - # Debug: print what we're trying to access - cat("Row", i, "- trying to access columns:", paste(colnames(row), collapse = ", "), "\n") - cat("Raw alpha value:", row$raw_alpha, "\n") - - html_content <- paste0(html_content, sprintf("", - row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r)) -} - -html_content <- paste0(html_content, "
ScaleRaw AlphaStd AlphaG6(SMC)Average rS/NASEMeanSDMedian r
%s%.4f%.4f%.4f%.4f%.4f%.4f%.4f%.4f%.4f
") - -# Add item statistics with alpha if dropped -html_content <- paste0(html_content, "

Item Statistics

- - ") - -for(i in 1:nrow(item_data)) { - row <- item_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped)) -} - -html_content <- paste0(html_content, "
ScaleItemnRaw rStd rr.corr.dropMeanSDAlpha if Dropped
%s%s%.0f%.4f%.4f%.4f%.4f%.4f%.4f%.4f
") - -# Add ICC(2,1) results across time points -if(nrow(icc_data) > 0) { - html_content <- paste0(html_content, "

ICC(2,1) Results (Temporal Stability)

- - ") - - for(i in 1:nrow(icc_data)) { - row <- icc_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, row$p_Value)) - } - - html_content <- paste0(html_content, "
ConstructICC(2,1)95% CI Lower95% CI UpperF Valuep Value
%s%.4f%.4f%.4f%.4f%.4f
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028165109.r b/.history/eohi2/reliability analysis_20251028165109.r deleted file mode 100644 index db0a695..0000000 --- a/.history/eohi2/reliability analysis_20251028165109.r +++ /dev/null @@ -1,257 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- columns:", paste(colnames(total_row), collapse = ", "), "\n") - - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - if(ncol(construct_data) == 0) { - construct_data <- data.frame(scale_mean) - colnames(construct_data)[1] <- scale_name - } else { - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - if(nrow(icc_data) == 0) { - icc_data <- icc_row - } else { - icc_data 
<- rbind(icc_data, icc_row) - } - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n") - cat("First few rows:\n") - print(head(summary_data)) -} else { - cat("ERROR: summary_data is empty!\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

Reliability Analysis Results

-

Generated on: ", Sys.time(), "

-

Overall Scale Statistics

-
- - ") - -for(i in 1:nrow(summary_data)) { - row <- summary_data[i,] - - # Debug: print what we're trying to access - cat("Row", i, "- trying to access columns:", paste(colnames(row), collapse = ", "), "\n") - cat("Raw alpha value:", row$raw_alpha, "\n") - - html_content <- paste0(html_content, sprintf("", - row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r)) -} - -html_content <- paste0(html_content, "
ScaleRaw AlphaStd AlphaG6(SMC)Average rS/NASEMeanSDMedian r
%s%.4f%.4f%.4f%.4f%.4f%.4f%.4f%.4f%.4f
") - -# Add item statistics with alpha if dropped -html_content <- paste0(html_content, "

Item Statistics

- - ") - -for(i in 1:nrow(item_data)) { - row <- item_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped)) -} - -html_content <- paste0(html_content, "
ScaleItemnRaw rStd rr.corr.dropMeanSDAlpha if Dropped
%s%s%.0f%.4f%.4f%.4f%.4f%.4f%.4f%.4f
") - -# Add ICC(2,1) results across time points -if(nrow(icc_data) > 0) { - html_content <- paste0(html_content, "

ICC(2,1) Results (Temporal Stability)

- - ") - - for(i in 1:nrow(icc_data)) { - row <- icc_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, row$p_Value)) - } - - html_content <- paste0(html_content, "
ConstructICC(2,1)95% CI Lower95% CI UpperF Valuep Value
%s%.4f%.4f%.4f%.4f%.4f
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028165118.r b/.history/eohi2/reliability analysis_20251028165118.r deleted file mode 100644 index b51c3f6..0000000 --- a/.history/eohi2/reliability analysis_20251028165118.r +++ /dev/null @@ -1,262 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- columns:", paste(colnames(total_row), collapse = ", "), "\n") - - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - if(ncol(construct_data) == 0) { - construct_data <- data.frame(scale_mean) - colnames(construct_data)[1] <- scale_name - } else { - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - if(nrow(icc_data) == 0) { - icc_data <- icc_row - } else { - icc_data 
<- rbind(icc_data, icc_row) - } - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n") - cat("First few rows:\n") - print(head(summary_data)) -} else { - cat("ERROR: summary_data is empty!\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

Reliability Analysis Results

-

Generated on: ", Sys.time(), "

-

Overall Scale Statistics

-
- - ") - -if(nrow(summary_data) > 0) { - for(i in 1:nrow(summary_data)) { - row <- summary_data[i,] - - # Debug: print what we're trying to access - cat("Row", i, "- trying to access columns:", paste(colnames(row), collapse = ", "), "\n") - cat("Raw alpha value:", row$raw_alpha, "\n") - - html_content <- paste0(html_content, sprintf("", - row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r)) - } -} else { - cat("ERROR: No summary data to display!\n") - html_content <- paste0(html_content, "") -} - -html_content <- paste0(html_content, "
ScaleRaw AlphaStd AlphaG6(SMC)Average rS/NASEMeanSDMedian r
%s%.4f%.4f%.4f%.4f%.4f%.4f%.4f%.4f%.4f
No data available
") - -# Add item statistics with alpha if dropped -html_content <- paste0(html_content, "

Item Statistics

- - ") - -for(i in 1:nrow(item_data)) { - row <- item_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped)) -} - -html_content <- paste0(html_content, "
ScaleItemnRaw rStd rr.corr.dropMeanSDAlpha if Dropped
%s%s%.0f%.4f%.4f%.4f%.4f%.4f%.4f%.4f
") - -# Add ICC(2,1) results across time points -if(nrow(icc_data) > 0) { - html_content <- paste0(html_content, "

ICC(2,1) Results (Temporal Stability)

- - ") - - for(i in 1:nrow(icc_data)) { - row <- icc_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, row$p_Value)) - } - - html_content <- paste0(html_content, "
ConstructICC(2,1)95% CI Lower95% CI UpperF Valuep Value
%s%.4f%.4f%.4f%.4f%.4f
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028165120.r b/.history/eohi2/reliability analysis_20251028165120.r deleted file mode 100644 index b51c3f6..0000000 --- a/.history/eohi2/reliability analysis_20251028165120.r +++ /dev/null @@ -1,262 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- columns:", paste(colnames(total_row), collapse = ", "), "\n") - - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - if(ncol(construct_data) == 0) { - construct_data <- data.frame(scale_mean) - colnames(construct_data)[1] <- scale_name - } else { - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - if(nrow(icc_data) == 0) { - icc_data <- icc_row - } else { - icc_data 
<- rbind(icc_data, icc_row) - } - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n") - cat("First few rows:\n") - print(head(summary_data)) -} else { - cat("ERROR: summary_data is empty!\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

Reliability Analysis Results

-

Generated on: ", Sys.time(), "

-

Overall Scale Statistics

-
- - ") - -if(nrow(summary_data) > 0) { - for(i in 1:nrow(summary_data)) { - row <- summary_data[i,] - - # Debug: print what we're trying to access - cat("Row", i, "- trying to access columns:", paste(colnames(row), collapse = ", "), "\n") - cat("Raw alpha value:", row$raw_alpha, "\n") - - html_content <- paste0(html_content, sprintf("", - row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r)) - } -} else { - cat("ERROR: No summary data to display!\n") - html_content <- paste0(html_content, "") -} - -html_content <- paste0(html_content, "
ScaleRaw AlphaStd AlphaG6(SMC)Average rS/NASEMeanSDMedian r
%s%.4f%.4f%.4f%.4f%.4f%.4f%.4f%.4f%.4f
No data available
") - -# Add item statistics with alpha if dropped -html_content <- paste0(html_content, "

Item Statistics

- - ") - -for(i in 1:nrow(item_data)) { - row <- item_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped)) -} - -html_content <- paste0(html_content, "
ScaleItemnRaw rStd rr.corr.dropMeanSDAlpha if Dropped
%s%s%.0f%.4f%.4f%.4f%.4f%.4f%.4f%.4f
") - -# Add ICC(2,1) results across time points -if(nrow(icc_data) > 0) { - html_content <- paste0(html_content, "

ICC(2,1) Results (Temporal Stability)

- - ") - - for(i in 1:nrow(icc_data)) { - row <- icc_data[i,] - html_content <- paste0(html_content, sprintf("", - row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, row$p_Value)) - } - - html_content <- paste0(html_content, "
ConstructICC(2,1)95% CI Lower95% CI UpperF Valuep Value
%s%.4f%.4f%.4f%.4f%.4f
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028165128.r b/.history/eohi2/reliability analysis_20251028165128.r deleted file mode 100644 index b51c3f6..0000000 --- a/.history/eohi2/reliability analysis_20251028165128.r +++ /dev/null @@ -1,262 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- columns:", paste(colnames(total_row), collapse = ", "), "\n") - - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - if(ncol(construct_data) == 0) { - construct_data <- data.frame(scale_mean) - colnames(construct_data)[1] <- scale_name - } else { - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - if(nrow(icc_data) == 0) { - icc_data <- icc_row - } else { - icc_data 
<- rbind(icc_data, icc_row)
-      }
-    }, error = function(e) {
-      cat("ICC calculation failed for", construct_name, ":", e$message, "\n")
-    })
-  }
-}
-
-# Debug: check summary_data
-cat("Final summary_data has", nrow(summary_data), "rows\n")
-if(nrow(summary_data) > 0) {
-  cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n")
-  cat("First few rows:\n")
-  print(head(summary_data))
-} else {
-  cat("ERROR: summary_data is empty!\n")
-}
-
-# Create simple HTML report
-html_content <- paste0("
-<html>
-<head>
-<title>Reliability Analysis Results</title>
-</head>
-<body>
-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-if(nrow(summary_data) > 0) {
-  for(i in 1:nrow(summary_data)) {
-    row <- summary_data[i,]
-
-    # Debug: print what we're trying to access
-    cat("Row", i, "- trying to access columns:", paste(colnames(row), collapse = ", "), "\n")
-    cat("Raw alpha value:", row$raw_alpha, "\n")
-
-    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-      row$Scale, row$raw_alpha, row$std.alpha, row$G6.smc, row$average_r, row$S.N, row$ase, row$mean, row$sd, row$median_r))
-  }
-} else {
-  cat("ERROR: No summary data to display!\n")
-  html_content <- paste0(html_content, "<tr><td colspan='10'>No data available</td></tr>")
-}
-
-html_content <- paste0(html_content, "</table>
-")
-
-# Add item statistics with alpha if dropped
-html_content <- paste0(html_content, "<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
-")
-
-for(i in 1:nrow(item_data)) {
-  row <- item_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
-}
-
-html_content <- paste0(html_content, "</table>
-")
-
-# Add ICC(2,1) results across time points
-if(nrow(icc_data) > 0) {
-  html_content <- paste0(html_content, "<h2>ICC(2,1) Results (Temporal Stability)</h2>
-<table>
-<tr><th>Construct</th><th>ICC(2,1)</th><th>95% CI Lower</th><th>95% CI Upper</th><th>F Value</th><th>p Value</th></tr>
-")
-
-  for(i in 1:nrow(icc_data)) {
-    row <- icc_data[i,]
-    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-      row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, row$p_Value))
-  }
-
-  html_content <- paste0(html_content, "</table>
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028165402.r b/.history/eohi2/reliability analysis_20251028165402.r deleted file mode 100644 index ca9dccb..0000000 --- a/.history/eohi2/reliability analysis_20251028165402.r +++ /dev/null @@ -1,262 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- columns:", paste(colnames(total_row), collapse = ", "), "\n") - - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - if(ncol(construct_data) == 0) { - construct_data <- data.frame(scale_mean) - colnames(construct_data)[1] <- scale_name - } else { - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - if(nrow(icc_data) == 0) { - icc_data <- icc_row - } else { - icc_data 
<- rbind(icc_data, icc_row)
-      }
-    }, error = function(e) {
-      cat("ICC calculation failed for", construct_name, ":", e$message, "\n")
-    })
-  }
-}
-
-# Debug: check summary_data
-cat("Final summary_data has", nrow(summary_data), "rows\n")
-if(nrow(summary_data) > 0) {
-  cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n")
-  cat("First few rows:\n")
-  print(head(summary_data))
-} else {
-  cat("ERROR: summary_data is empty!\n")
-}
-
-# Create simple HTML report
-html_content <- paste0("
-<html>
-<head>
-<title>Reliability Analysis Results</title>
-</head>
-<body>
-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-if(nrow(summary_data) > 0) {
-  for(i in 1:nrow(summary_data)) {
-    row <- summary_data[i,]
-
-    # Debug: print what we're trying to access
-    cat("Row", i, "- trying to access columns:", paste(colnames(row), collapse = ", "), "\n")
-    cat("Raw alpha value:", row$raw_alpha, "\n")
-
-    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-      row[["Scale"]], row[["raw_alpha"]], row[["std.alpha"]], row[["G6(smc)"]], row[["average_r"]], row[["S/N"]], row[["ase"]], row[["mean"]], row[["sd"]], row[["median_r"]]))
-  }
-} else {
-  cat("ERROR: No summary data to display!\n")
-  html_content <- paste0(html_content, "<tr><td colspan='10'>No data available</td></tr>")
-}
-
-html_content <- paste0(html_content, "</table>
-")
-
-# Add item statistics with alpha if dropped
-html_content <- paste0(html_content, "<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
-")
-
-for(i in 1:nrow(item_data)) {
-  row <- item_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
-}
-
-html_content <- paste0(html_content, "</table>
-")
-
-# Add ICC(2,1) results across time points
-if(nrow(icc_data) > 0) {
-  html_content <- paste0(html_content, "<h2>ICC(2,1) Results (Temporal Stability)</h2>
-<table>
-<tr><th>Construct</th><th>ICC(2,1)</th><th>95% CI Lower</th><th>95% CI Upper</th><th>F Value</th><th>p Value</th></tr>
-")
-
-  for(i in 1:nrow(icc_data)) {
-    row <- icc_data[i,]
-    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-      row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, row$p_Value))
-  }
-
-  html_content <- paste0(html_content, "</table>
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028165410.r b/.history/eohi2/reliability analysis_20251028165410.r deleted file mode 100644 index ca9dccb..0000000 --- a/.history/eohi2/reliability analysis_20251028165410.r +++ /dev/null @@ -1,262 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- columns:", paste(colnames(total_row), collapse = ", "), "\n") - - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - if(ncol(construct_data) == 0) { - construct_data <- data.frame(scale_mean) - colnames(construct_data)[1] <- scale_name - } else { - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - if(nrow(icc_data) == 0) { - icc_data <- icc_row - } else { - icc_data 
<- rbind(icc_data, icc_row)
-      }
-    }, error = function(e) {
-      cat("ICC calculation failed for", construct_name, ":", e$message, "\n")
-    })
-  }
-}
-
-# Debug: check summary_data
-cat("Final summary_data has", nrow(summary_data), "rows\n")
-if(nrow(summary_data) > 0) {
-  cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n")
-  cat("First few rows:\n")
-  print(head(summary_data))
-} else {
-  cat("ERROR: summary_data is empty!\n")
-}
-
-# Create simple HTML report
-html_content <- paste0("
-<html>
-<head>
-<title>Reliability Analysis Results</title>
-</head>
-<body>
-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-if(nrow(summary_data) > 0) {
-  for(i in 1:nrow(summary_data)) {
-    row <- summary_data[i,]
-
-    # Debug: print what we're trying to access
-    cat("Row", i, "- trying to access columns:", paste(colnames(row), collapse = ", "), "\n")
-    cat("Raw alpha value:", row$raw_alpha, "\n")
-
-    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-      row[["Scale"]], row[["raw_alpha"]], row[["std.alpha"]], row[["G6(smc)"]], row[["average_r"]], row[["S/N"]], row[["ase"]], row[["mean"]], row[["sd"]], row[["median_r"]]))
-  }
-} else {
-  cat("ERROR: No summary data to display!\n")
-  html_content <- paste0(html_content, "<tr><td colspan='10'>No data available</td></tr>")
-}
-
-html_content <- paste0(html_content, "</table>
-")
-
-# Add item statistics with alpha if dropped
-html_content <- paste0(html_content, "<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
-")
-
-for(i in 1:nrow(item_data)) {
-  row <- item_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
-}
-
-html_content <- paste0(html_content, "</table>
-")
-
-# Add ICC(2,1) results across time points
-if(nrow(icc_data) > 0) {
-  html_content <- paste0(html_content, "<h2>ICC(2,1) Results (Temporal Stability)</h2>
-<table>
-<tr><th>Construct</th><th>ICC(2,1)</th><th>95% CI Lower</th><th>95% CI Upper</th><th>F Value</th><th>p Value</th></tr>
-")
-
-  for(i in 1:nrow(icc_data)) {
-    row <- icc_data[i,]
-    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-      row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, row$p_Value))
-  }
-
-  html_content <- paste0(html_content, "</table>
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028165435.r b/.history/eohi2/reliability analysis_20251028165435.r deleted file mode 100644 index ca9dccb..0000000 --- a/.history/eohi2/reliability analysis_20251028165435.r +++ /dev/null @@ -1,262 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- columns:", paste(colnames(total_row), collapse = ", "), "\n") - - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - if(ncol(construct_data) == 0) { - construct_data <- data.frame(scale_mean) - colnames(construct_data)[1] <- scale_name - } else { - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - if(nrow(icc_data) == 0) { - icc_data <- icc_row - } else { - icc_data 
<- rbind(icc_data, icc_row)
-      }
-    }, error = function(e) {
-      cat("ICC calculation failed for", construct_name, ":", e$message, "\n")
-    })
-  }
-}
-
-# Debug: check summary_data
-cat("Final summary_data has", nrow(summary_data), "rows\n")
-if(nrow(summary_data) > 0) {
-  cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n")
-  cat("First few rows:\n")
-  print(head(summary_data))
-} else {
-  cat("ERROR: summary_data is empty!\n")
-}
-
-# Create simple HTML report
-html_content <- paste0("
-<html>
-<head>
-<title>Reliability Analysis Results</title>
-</head>
-<body>
-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-if(nrow(summary_data) > 0) {
-  for(i in 1:nrow(summary_data)) {
-    row <- summary_data[i,]
-
-    # Debug: print what we're trying to access
-    cat("Row", i, "- trying to access columns:", paste(colnames(row), collapse = ", "), "\n")
-    cat("Raw alpha value:", row$raw_alpha, "\n")
-
-    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-      row[["Scale"]], row[["raw_alpha"]], row[["std.alpha"]], row[["G6(smc)"]], row[["average_r"]], row[["S/N"]], row[["ase"]], row[["mean"]], row[["sd"]], row[["median_r"]]))
-  }
-} else {
-  cat("ERROR: No summary data to display!\n")
-  html_content <- paste0(html_content, "<tr><td colspan='10'>No data available</td></tr>")
-}
-
-html_content <- paste0(html_content, "</table>
-")
-
-# Add item statistics with alpha if dropped
-html_content <- paste0(html_content, "<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
-")
-
-for(i in 1:nrow(item_data)) {
-  row <- item_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
-}
-
-html_content <- paste0(html_content, "</table>
-")
-
-# Add ICC(2,1) results across time points
-if(nrow(icc_data) > 0) {
-  html_content <- paste0(html_content, "<h2>ICC(2,1) Results (Temporal Stability)</h2>
-<table>
-<tr><th>Construct</th><th>ICC(2,1)</th><th>95% CI Lower</th><th>95% CI Upper</th><th>F Value</th><th>p Value</th></tr>
-")
-
-  for(i in 1:nrow(icc_data)) {
-    row <- icc_data[i,]
-    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-      row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, row$p_Value))
-  }
-
-  html_content <- paste0(html_content, "</table>
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028170716.r b/.history/eohi2/reliability analysis_20251028170716.r deleted file mode 100644 index 6041ff6..0000000 --- a/.history/eohi2/reliability analysis_20251028170716.r +++ /dev/null @@ -1,265 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- columns:", paste(colnames(total_row), collapse = ", "), "\n") - - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - if(ncol(construct_data) == 0) { - construct_data <- data.frame(scale_mean) - colnames(construct_data)[1] <- scale_name - } else { - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - - # Debug: print actual p-value - cat("Actual p-value for", 
construct_name, ":", icc_result$p.value, "\n")
-      if(nrow(icc_data) == 0) {
-        icc_data <- icc_row
-      } else {
-        icc_data <- rbind(icc_data, icc_row)
-      }
-    }, error = function(e) {
-      cat("ICC calculation failed for", construct_name, ":", e$message, "\n")
-    })
-  }
-}
-
-# Debug: check summary_data
-cat("Final summary_data has", nrow(summary_data), "rows\n")
-if(nrow(summary_data) > 0) {
-  cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n")
-  cat("First few rows:\n")
-  print(head(summary_data))
-} else {
-  cat("ERROR: summary_data is empty!\n")
-}
-
-# Create simple HTML report
-html_content <- paste0("
-<html>
-<head>
-<title>Reliability Analysis Results</title>
-</head>
-<body>
-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-if(nrow(summary_data) > 0) {
-  for(i in 1:nrow(summary_data)) {
-    row <- summary_data[i,]
-
-    # Debug: print what we're trying to access
-    cat("Row", i, "- trying to access columns:", paste(colnames(row), collapse = ", "), "\n")
-    cat("Raw alpha value:", row$raw_alpha, "\n")
-
-    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-      row[["Scale"]], row[["raw_alpha"]], row[["std.alpha"]], row[["G6(smc)"]], row[["average_r"]], row[["S/N"]], row[["ase"]], row[["mean"]], row[["sd"]], row[["median_r"]]))
-  }
-} else {
-  cat("ERROR: No summary data to display!\n")
-  html_content <- paste0(html_content, "<tr><td colspan='10'>No data available</td></tr>")
-}
-
-html_content <- paste0(html_content, "</table>
-")
-
-# Add item statistics with alpha if dropped
-html_content <- paste0(html_content, "<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
-")
-
-for(i in 1:nrow(item_data)) {
-  row <- item_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
-}
-
-html_content <- paste0(html_content, "</table>
-")
-
-# Add ICC(2,1) results across time points
-if(nrow(icc_data) > 0) {
-  html_content <- paste0(html_content, "<h2>ICC(2,1) Results (Temporal Stability)</h2>
-<table>
-<tr><th>Construct</th><th>ICC(2,1)</th><th>95% CI Lower</th><th>95% CI Upper</th><th>F Value</th><th>p Value</th></tr>
-")
-
-  for(i in 1:nrow(icc_data)) {
-    row <- icc_data[i,]
-    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-      row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, row$p_Value))
-  }
-
-  html_content <- paste0(html_content, "</table>
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028170719.r b/.history/eohi2/reliability analysis_20251028170719.r deleted file mode 100644 index 0b8583e..0000000 --- a/.history/eohi2/reliability analysis_20251028170719.r +++ /dev/null @@ -1,274 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- columns:", paste(colnames(total_row), collapse = ", "), "\n") - - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - if(ncol(construct_data) == 0) { - construct_data <- data.frame(scale_mean) - colnames(construct_data)[1] <- scale_name - } else { - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - - # Debug: print actual p-value - cat("Actual p-value for", 
construct_name, ":", icc_result$p.value, "\n")
-      if(nrow(icc_data) == 0) {
-        icc_data <- icc_row
-      } else {
-        icc_data <- rbind(icc_data, icc_row)
-      }
-    }, error = function(e) {
-      cat("ICC calculation failed for", construct_name, ":", e$message, "\n")
-    })
-  }
-}
-
-# Debug: check summary_data
-cat("Final summary_data has", nrow(summary_data), "rows\n")
-if(nrow(summary_data) > 0) {
-  cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n")
-  cat("First few rows:\n")
-  print(head(summary_data))
-} else {
-  cat("ERROR: summary_data is empty!\n")
-}
-
-# Create simple HTML report
-html_content <- paste0("
-<html>
-<head>
-<title>Reliability Analysis Results</title>
-</head>
-<body>
-<h1>Reliability Analysis Results</h1>
-<p>Generated on: ", Sys.time(), "</p>
-<h2>Overall Scale Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
-")
-
-if(nrow(summary_data) > 0) {
-  for(i in 1:nrow(summary_data)) {
-    row <- summary_data[i,]
-
-    # Debug: print what we're trying to access
-    cat("Row", i, "- trying to access columns:", paste(colnames(row), collapse = ", "), "\n")
-    cat("Raw alpha value:", row$raw_alpha, "\n")
-
-    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-      row[["Scale"]], row[["raw_alpha"]], row[["std.alpha"]], row[["G6(smc)"]], row[["average_r"]], row[["S/N"]], row[["ase"]], row[["mean"]], row[["sd"]], row[["median_r"]]))
-  }
-} else {
-  cat("ERROR: No summary data to display!\n")
-  html_content <- paste0(html_content, "<tr><td colspan='10'>No data available</td></tr>")
-}
-
-html_content <- paste0(html_content, "</table>
-")
-
-# Add item statistics with alpha if dropped
-html_content <- paste0(html_content, "<h2>Item Statistics</h2>
-<table>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
-")
-
-for(i in 1:nrow(item_data)) {
-  row <- item_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
-}
-
-html_content <- paste0(html_content, "</table>
-")
-
-# Add ICC(2,1) results across time points
-if(nrow(icc_data) > 0) {
-  html_content <- paste0(html_content, "<h2>ICC(2,1) Results (Temporal Stability)</h2>
-<table>
-<tr><th>Construct</th><th>ICC(2,1)</th><th>95% CI Lower</th><th>95% CI Upper</th><th>F Value</th><th>p Value</th></tr>
-")
-
-  for(i in 1:nrow(icc_data)) {
-    row <- icc_data[i,]
-
-    # Format p-value appropriately
-    p_val <- row$p_Value
-    if(p_val < 0.001) {
-      p_display <- sprintf("%.2e", p_val)  # Scientific notation for very small values
-    } else {
-      p_display <- sprintf("%.4f", p_val)  # 4 decimal places for larger values
-    }
-
-    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%s</td></tr>",
-      row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, p_display))
-  }
-
-  html_content <- paste0(html_content, "</table>
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028170725.r b/.history/eohi2/reliability analysis_20251028170725.r deleted file mode 100644 index 0b8583e..0000000 --- a/.history/eohi2/reliability analysis_20251028170725.r +++ /dev/null @@ -1,274 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- columns:", paste(colnames(total_row), collapse = ", "), "\n") - - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - if(ncol(construct_data) == 0) { - construct_data <- data.frame(scale_mean) - colnames(construct_data)[1] <- scale_name - } else { - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - - # Debug: print actual p-value - cat("Actual p-value for", 
construct_name, ":", icc_result$p.value, "\n") - if(nrow(icc_data) == 0) { - icc_data <- icc_row - } else { - icc_data <- rbind(icc_data, icc_row) - } - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n") - cat("First few rows:\n") - print(head(summary_data)) -} else { - cat("ERROR: summary_data is empty!\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

Reliability Analysis Results

-

Generated on: ", Sys.time(), "

-

Overall Scale Statistics

-
- - ") - -if(nrow(summary_data) > 0) { - for(i in 1:nrow(summary_data)) { - row <- summary_data[i,] - - # Debug: print what we're trying to access - cat("Row", i, "- trying to access columns:", paste(colnames(row), collapse = ", "), "\n") - cat("Raw alpha value:", row$raw_alpha, "\n") - - html_content <- paste0(html_content, sprintf("", - row[["Scale"]], row[["raw_alpha"]], row[["std.alpha"]], row[["G6(smc)"]], row[["average_r"]], row[["S/N"]], row[["ase"]], row[["mean"]], row[["sd"]], row[["median_r"]])) - } -} else { - cat("ERROR: No summary data to display!\n") - html_content <- paste0(html_content, "") -} - -html_content <- paste0(html_content, "
# Create simple HTML report
html_content <- paste0("
<html>
<head>
<title>Reliability Analysis Results</title>
</head>
<body>
<h1>Reliability Analysis Results</h1>
<p>Generated on: ", Sys.time(), "</p>
<h2>Overall Scale Statistics</h2>
<table>
<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
")

if(nrow(summary_data) > 0) {
  for(i in 1:nrow(summary_data)) {
    row <- summary_data[i,]

    # Debug: print what we're trying to access
    cat("Row", i, "- trying to access columns:", paste(colnames(row), collapse = ", "), "\n")
    cat("Raw alpha value:", row$raw_alpha, "\n")

    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
      row[["Scale"]], row[["raw_alpha"]], row[["std.alpha"]], row[["G6(smc)"]], row[["average_r"]], row[["S/N"]], row[["ase"]], row[["mean"]], row[["sd"]], row[["median_r"]]))
  }
} else {
  cat("ERROR: No summary data to display!\n")
  html_content <- paste0(html_content, "<tr><td>No data available</td></tr>")
}

html_content <- paste0(html_content, "</table>")

# Add item statistics with alpha if dropped
html_content <- paste0(html_content, "
<h2>Item Statistics</h2>
<table>
<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
")

for(i in 1:nrow(item_data)) {
  row <- item_data[i,]
  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
}

html_content <- paste0(html_content, "</table>")

# Add ICC(2,1) results across time points
if(nrow(icc_data) > 0) {
  html_content <- paste0(html_content, "
<h2>ICC(2,1) Results (Temporal Stability)</h2>
<table>
<tr><th>Construct</th><th>ICC(2,1)</th><th>95% CI Lower</th><th>95% CI Upper</th><th>F Value</th><th>p Value</th></tr>
")

  for(i in 1:nrow(icc_data)) {
    row <- icc_data[i,]

    # Format p-value appropriately
    p_val <- row$p_Value
    if(p_val < 0.001) {
      p_display <- sprintf("%.2e", p_val)  # Scientific notation for very small values
    } else {
      p_display <- sprintf("%.4f", p_val)  # 4 decimal places for larger values
    }

    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%s</td></tr>",
      row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, p_display))
  }

  html_content <- paste0(html_content, "</table>
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028170741.r b/.history/eohi2/reliability analysis_20251028170741.r deleted file mode 100644 index 0b8583e..0000000 --- a/.history/eohi2/reliability analysis_20251028170741.r +++ /dev/null @@ -1,274 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- columns:", paste(colnames(total_row), collapse = ", "), "\n") - - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - if(ncol(construct_data) == 0) { - construct_data <- data.frame(scale_mean) - colnames(construct_data)[1] <- scale_name - } else { - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - - # Debug: print actual p-value - cat("Actual p-value for", 
construct_name, ":", icc_result$p.value, "\n") - if(nrow(icc_data) == 0) { - icc_data <- icc_row - } else { - icc_data <- rbind(icc_data, icc_row) - } - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n") - cat("First few rows:\n") - print(head(summary_data)) -} else { - cat("ERROR: summary_data is empty!\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

Reliability Analysis Results

-

Generated on: ", Sys.time(), "

-

Overall Scale Statistics

-
- - ") - -if(nrow(summary_data) > 0) { - for(i in 1:nrow(summary_data)) { - row <- summary_data[i,] - - # Debug: print what we're trying to access - cat("Row", i, "- trying to access columns:", paste(colnames(row), collapse = ", "), "\n") - cat("Raw alpha value:", row$raw_alpha, "\n") - - html_content <- paste0(html_content, sprintf("", - row[["Scale"]], row[["raw_alpha"]], row[["std.alpha"]], row[["G6(smc)"]], row[["average_r"]], row[["S/N"]], row[["ase"]], row[["mean"]], row[["sd"]], row[["median_r"]])) - } -} else { - cat("ERROR: No summary data to display!\n") - html_content <- paste0(html_content, "") -} - -html_content <- paste0(html_content, "
# Create simple HTML report
html_content <- paste0("
<html>
<head>
<title>Reliability Analysis Results</title>
</head>
<body>
<h1>Reliability Analysis Results</h1>
<p>Generated on: ", Sys.time(), "</p>
<h2>Overall Scale Statistics</h2>
<table>
<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
")

if(nrow(summary_data) > 0) {
  for(i in 1:nrow(summary_data)) {
    row <- summary_data[i,]

    # Debug: print what we're trying to access
    cat("Row", i, "- trying to access columns:", paste(colnames(row), collapse = ", "), "\n")
    cat("Raw alpha value:", row$raw_alpha, "\n")

    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
      row[["Scale"]], row[["raw_alpha"]], row[["std.alpha"]], row[["G6(smc)"]], row[["average_r"]], row[["S/N"]], row[["ase"]], row[["mean"]], row[["sd"]], row[["median_r"]]))
  }
} else {
  cat("ERROR: No summary data to display!\n")
  html_content <- paste0(html_content, "<tr><td>No data available</td></tr>")
}

html_content <- paste0(html_content, "</table>")

# Add item statistics with alpha if dropped
html_content <- paste0(html_content, "
<h2>Item Statistics</h2>
<table>
<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
")

for(i in 1:nrow(item_data)) {
  row <- item_data[i,]
  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
}

html_content <- paste0(html_content, "</table>")

# Add ICC(2,1) results across time points
if(nrow(icc_data) > 0) {
  html_content <- paste0(html_content, "
<h2>ICC(2,1) Results (Temporal Stability)</h2>
<table>
<tr><th>Construct</th><th>ICC(2,1)</th><th>95% CI Lower</th><th>95% CI Upper</th><th>F Value</th><th>p Value</th></tr>
")

  for(i in 1:nrow(icc_data)) {
    row <- icc_data[i,]

    # Format p-value appropriately
    p_val <- row$p_Value
    if(p_val < 0.001) {
      p_display <- sprintf("%.2e", p_val)  # Scientific notation for very small values
    } else {
      p_display <- sprintf("%.4f", p_val)  # 4 decimal places for larger values
    }

    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%s</td></tr>",
      row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, p_display))
  }

  html_content <- paste0(html_content, "</table>
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028170822.r b/.history/eohi2/reliability analysis_20251028170822.r deleted file mode 100644 index dcad3f5..0000000 --- a/.history/eohi2/reliability analysis_20251028170822.r +++ /dev/null @@ -1,280 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- columns:", paste(colnames(total_row), collapse = ", "), "\n") - - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - if(ncol(construct_data) == 0) { - construct_data <- data.frame(scale_mean) - colnames(construct_data)[1] <- scale_name - } else { - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - - # Debug: print actual p-value - cat("Actual p-value for", 
construct_name, ":", icc_result$p.value, "\n") - if(nrow(icc_data) == 0) { - icc_data <- icc_row - } else { - icc_data <- rbind(icc_data, icc_row) - } - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n") - cat("First few rows:\n") - print(head(summary_data)) -} else { - cat("ERROR: summary_data is empty!\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

Reliability Analysis Results

-

Generated on: ", Sys.time(), "

-

Overall Scale Statistics

-
- - ") - -if(nrow(summary_data) > 0) { - for(i in 1:nrow(summary_data)) { - row <- summary_data[i,] - - # Debug: print what we're trying to access - cat("Row", i, "- trying to access columns:", paste(colnames(row), collapse = ", "), "\n") - cat("Raw alpha value:", row$raw_alpha, "\n") - - html_content <- paste0(html_content, sprintf("", - row[["Scale"]], row[["raw_alpha"]], row[["std.alpha"]], row[["G6(smc)"]], row[["average_r"]], row[["S/N"]], row[["ase"]], row[["mean"]], row[["sd"]], row[["median_r"]])) - } -} else { - cat("ERROR: No summary data to display!\n") - html_content <- paste0(html_content, "") -} - -html_content <- paste0(html_content, "
# Create simple HTML report
html_content <- paste0("
<html>
<head>
<title>Reliability Analysis Results</title>
</head>
<body>
<h1>Reliability Analysis Results</h1>
<p>Generated on: ", Sys.time(), "</p>
<h2>Overall Scale Statistics</h2>
<table>
<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
")

if(nrow(summary_data) > 0) {
  for(i in 1:nrow(summary_data)) {
    row <- summary_data[i,]

    # Debug: print what we're trying to access
    cat("Row", i, "- trying to access columns:", paste(colnames(row), collapse = ", "), "\n")
    cat("Raw alpha value:", row$raw_alpha, "\n")

    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
      row[["Scale"]], row[["raw_alpha"]], row[["std.alpha"]], row[["G6(smc)"]], row[["average_r"]], row[["S/N"]], row[["ase"]], row[["mean"]], row[["sd"]], row[["median_r"]]))
  }
} else {
  cat("ERROR: No summary data to display!\n")
  html_content <- paste0(html_content, "<tr><td>No data available</td></tr>")
}

html_content <- paste0(html_content, "</table>")

# Add item statistics with alpha if dropped
html_content <- paste0(html_content, "
<h2>Item Statistics</h2>
<table>
<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
")

for(i in 1:nrow(item_data)) {
  row <- item_data[i,]
  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
}

html_content <- paste0(html_content, "</table>")

# Add ICC(2,1) results across time points
if(nrow(icc_data) > 0) {
  html_content <- paste0(html_content, "
<h2>ICC(2,1) Results (Temporal Stability)</h2>
<table>
<tr><th>Construct</th><th>ICC(2,1)</th><th>95% CI Lower</th><th>95% CI Upper</th><th>F Value</th><th>p Value</th></tr>
")

  for(i in 1:nrow(icc_data)) {
    row <- icc_data[i,]

    # Format p-value appropriately
    p_val <- row$p_Value

    # Debug: print the actual p-value
    cat("Debug - p-value for", row$Construct, ":", p_val, "\n")

    if(p_val < 1e-10) {
      p_display <- "< 1e-10"  # For extremely small values
    } else if(p_val < 0.001) {
      p_display <- sprintf("%.2e", p_val)  # Scientific notation
    } else {
      p_display <- sprintf("%.4f", p_val)  # 4 decimal places
    }

    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%s</td></tr>",
      row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, p_display))
  }

  html_content <- paste0(html_content, "</table>
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028170831.r b/.history/eohi2/reliability analysis_20251028170831.r deleted file mode 100644 index dcad3f5..0000000 --- a/.history/eohi2/reliability analysis_20251028170831.r +++ /dev/null @@ -1,280 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- columns:", paste(colnames(total_row), collapse = ", "), "\n") - - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - if(ncol(construct_data) == 0) { - construct_data <- data.frame(scale_mean) - colnames(construct_data)[1] <- scale_name - } else { - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - - # Debug: print actual p-value - cat("Actual p-value for", 
construct_name, ":", icc_result$p.value, "\n") - if(nrow(icc_data) == 0) { - icc_data <- icc_row - } else { - icc_data <- rbind(icc_data, icc_row) - } - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n") - cat("First few rows:\n") - print(head(summary_data)) -} else { - cat("ERROR: summary_data is empty!\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

Reliability Analysis Results

-

Generated on: ", Sys.time(), "

-

Overall Scale Statistics

-
- - ") - -if(nrow(summary_data) > 0) { - for(i in 1:nrow(summary_data)) { - row <- summary_data[i,] - - # Debug: print what we're trying to access - cat("Row", i, "- trying to access columns:", paste(colnames(row), collapse = ", "), "\n") - cat("Raw alpha value:", row$raw_alpha, "\n") - - html_content <- paste0(html_content, sprintf("", - row[["Scale"]], row[["raw_alpha"]], row[["std.alpha"]], row[["G6(smc)"]], row[["average_r"]], row[["S/N"]], row[["ase"]], row[["mean"]], row[["sd"]], row[["median_r"]])) - } -} else { - cat("ERROR: No summary data to display!\n") - html_content <- paste0(html_content, "") -} - -html_content <- paste0(html_content, "
# Create simple HTML report
html_content <- paste0("
<html>
<head>
<title>Reliability Analysis Results</title>
</head>
<body>
<h1>Reliability Analysis Results</h1>
<p>Generated on: ", Sys.time(), "</p>
<h2>Overall Scale Statistics</h2>
<table>
<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
")

if(nrow(summary_data) > 0) {
  for(i in 1:nrow(summary_data)) {
    row <- summary_data[i,]

    # Debug: print what we're trying to access
    cat("Row", i, "- trying to access columns:", paste(colnames(row), collapse = ", "), "\n")
    cat("Raw alpha value:", row$raw_alpha, "\n")

    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
      row[["Scale"]], row[["raw_alpha"]], row[["std.alpha"]], row[["G6(smc)"]], row[["average_r"]], row[["S/N"]], row[["ase"]], row[["mean"]], row[["sd"]], row[["median_r"]]))
  }
} else {
  cat("ERROR: No summary data to display!\n")
  html_content <- paste0(html_content, "<tr><td>No data available</td></tr>")
}

html_content <- paste0(html_content, "</table>")

# Add item statistics with alpha if dropped
html_content <- paste0(html_content, "
<h2>Item Statistics</h2>
<table>
<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
")

for(i in 1:nrow(item_data)) {
  row <- item_data[i,]
  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
}

html_content <- paste0(html_content, "</table>")

# Add ICC(2,1) results across time points
if(nrow(icc_data) > 0) {
  html_content <- paste0(html_content, "
<h2>ICC(2,1) Results (Temporal Stability)</h2>
<table>
<tr><th>Construct</th><th>ICC(2,1)</th><th>95% CI Lower</th><th>95% CI Upper</th><th>F Value</th><th>p Value</th></tr>
")

  for(i in 1:nrow(icc_data)) {
    row <- icc_data[i,]

    # Format p-value appropriately
    p_val <- row$p_Value

    # Debug: print the actual p-value
    cat("Debug - p-value for", row$Construct, ":", p_val, "\n")

    if(p_val < 1e-10) {
      p_display <- "< 1e-10"  # For extremely small values
    } else if(p_val < 0.001) {
      p_display <- sprintf("%.2e", p_val)  # Scientific notation
    } else {
      p_display <- sprintf("%.4f", p_val)  # 4 decimal places
    }

    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%s</td></tr>",
      row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, p_display))
  }

  html_content <- paste0(html_content, "</table>
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028170832.r b/.history/eohi2/reliability analysis_20251028170832.r deleted file mode 100644 index dcad3f5..0000000 --- a/.history/eohi2/reliability analysis_20251028170832.r +++ /dev/null @@ -1,280 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- columns:", paste(colnames(total_row), collapse = ", "), "\n") - - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - if(ncol(construct_data) == 0) { - construct_data <- data.frame(scale_mean) - colnames(construct_data)[1] <- scale_name - } else { - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - - # Debug: print actual p-value - cat("Actual p-value for", 
construct_name, ":", icc_result$p.value, "\n") - if(nrow(icc_data) == 0) { - icc_data <- icc_row - } else { - icc_data <- rbind(icc_data, icc_row) - } - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n") - cat("First few rows:\n") - print(head(summary_data)) -} else { - cat("ERROR: summary_data is empty!\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

Reliability Analysis Results

-

Generated on: ", Sys.time(), "

-

Overall Scale Statistics

-
- - ") - -if(nrow(summary_data) > 0) { - for(i in 1:nrow(summary_data)) { - row <- summary_data[i,] - - # Debug: print what we're trying to access - cat("Row", i, "- trying to access columns:", paste(colnames(row), collapse = ", "), "\n") - cat("Raw alpha value:", row$raw_alpha, "\n") - - html_content <- paste0(html_content, sprintf("", - row[["Scale"]], row[["raw_alpha"]], row[["std.alpha"]], row[["G6(smc)"]], row[["average_r"]], row[["S/N"]], row[["ase"]], row[["mean"]], row[["sd"]], row[["median_r"]])) - } -} else { - cat("ERROR: No summary data to display!\n") - html_content <- paste0(html_content, "") -} - -html_content <- paste0(html_content, "
# Create simple HTML report
html_content <- paste0("
<html>
<head>
<title>Reliability Analysis Results</title>
</head>
<body>
<h1>Reliability Analysis Results</h1>
<p>Generated on: ", Sys.time(), "</p>
<h2>Overall Scale Statistics</h2>
<table>
<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
")

if(nrow(summary_data) > 0) {
  for(i in 1:nrow(summary_data)) {
    row <- summary_data[i,]

    # Debug: print what we're trying to access
    cat("Row", i, "- trying to access columns:", paste(colnames(row), collapse = ", "), "\n")
    cat("Raw alpha value:", row$raw_alpha, "\n")

    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
      row[["Scale"]], row[["raw_alpha"]], row[["std.alpha"]], row[["G6(smc)"]], row[["average_r"]], row[["S/N"]], row[["ase"]], row[["mean"]], row[["sd"]], row[["median_r"]]))
  }
} else {
  cat("ERROR: No summary data to display!\n")
  html_content <- paste0(html_content, "<tr><td>No data available</td></tr>")
}

html_content <- paste0(html_content, "</table>")

# Add item statistics with alpha if dropped
html_content <- paste0(html_content, "
<h2>Item Statistics</h2>
<table>
<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
")

for(i in 1:nrow(item_data)) {
  row <- item_data[i,]
  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
}

html_content <- paste0(html_content, "</table>")

# Add ICC(2,1) results across time points
if(nrow(icc_data) > 0) {
  html_content <- paste0(html_content, "
<h2>ICC(2,1) Results (Temporal Stability)</h2>
<table>
<tr><th>Construct</th><th>ICC(2,1)</th><th>95% CI Lower</th><th>95% CI Upper</th><th>F Value</th><th>p Value</th></tr>
")

  for(i in 1:nrow(icc_data)) {
    row <- icc_data[i,]

    # Format p-value appropriately
    p_val <- row$p_Value

    # Debug: print the actual p-value
    cat("Debug - p-value for", row$Construct, ":", p_val, "\n")

    if(p_val < 1e-10) {
      p_display <- "< 1e-10"  # For extremely small values
    } else if(p_val < 0.001) {
      p_display <- sprintf("%.2e", p_val)  # Scientific notation
    } else {
      p_display <- sprintf("%.4f", p_val)  # 4 decimal places
    }

    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%s</td></tr>",
      row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, p_display))
  }

  html_content <- paste0(html_content, "</table>
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028170921.r b/.history/eohi2/reliability analysis_20251028170921.r deleted file mode 100644 index f62ee62..0000000 --- a/.history/eohi2/reliability analysis_20251028170921.r +++ /dev/null @@ -1,289 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- columns:", paste(colnames(total_row), collapse = ", "), "\n") - - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - if(ncol(construct_data) == 0) { - construct_data <- data.frame(scale_mean) - colnames(construct_data)[1] <- scale_name - } else { - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - - # Debug: print actual p-value and all ICC results - cat("ICC results 
for", construct_name, ":\n") - cat(" ICC value:", icc_result$value, "\n") - cat(" F value:", icc_result$Fvalue, "\n") - cat(" p-value:", icc_result$p.value, "\n") - cat(" p-value class:", class(icc_result$p.value), "\n") - cat(" p-value length:", length(icc_result$p.value), "\n") - cat(" p-value is.na:", is.na(icc_result$p.value), "\n") - cat(" p-value is.null:", is.null(icc_result$p.value), "\n") - cat(" p-value == 0:", icc_result$p.value == 0, "\n") - cat("\n") - if(nrow(icc_data) == 0) { - icc_data <- icc_row - } else { - icc_data <- rbind(icc_data, icc_row) - } - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n") - cat("First few rows:\n") - print(head(summary_data)) -} else { - cat("ERROR: summary_data is empty!\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

Reliability Analysis Results

-

Generated on: ", Sys.time(), "

-

Overall Scale Statistics

-
- - ") - -if(nrow(summary_data) > 0) { - for(i in 1:nrow(summary_data)) { - row <- summary_data[i,] - - # Debug: print what we're trying to access - cat("Row", i, "- trying to access columns:", paste(colnames(row), collapse = ", "), "\n") - cat("Raw alpha value:", row$raw_alpha, "\n") - - html_content <- paste0(html_content, sprintf("", - row[["Scale"]], row[["raw_alpha"]], row[["std.alpha"]], row[["G6(smc)"]], row[["average_r"]], row[["S/N"]], row[["ase"]], row[["mean"]], row[["sd"]], row[["median_r"]])) - } -} else { - cat("ERROR: No summary data to display!\n") - html_content <- paste0(html_content, "") -} - -html_content <- paste0(html_content, "
# Create simple HTML report
html_content <- paste0("
<html>
<head>
<title>Reliability Analysis Results</title>
</head>
<body>
<h1>Reliability Analysis Results</h1>
<p>Generated on: ", Sys.time(), "</p>
<h2>Overall Scale Statistics</h2>
<table>
<tr><th>Scale</th><th>Raw Alpha</th><th>Std Alpha</th><th>G6(SMC)</th><th>Average r</th><th>S/N</th><th>ASE</th><th>Mean</th><th>SD</th><th>Median r</th></tr>
")

if(nrow(summary_data) > 0) {
  for(i in 1:nrow(summary_data)) {
    row <- summary_data[i,]

    # Debug: print what we're trying to access
    cat("Row", i, "- trying to access columns:", paste(colnames(row), collapse = ", "), "\n")
    cat("Raw alpha value:", row$raw_alpha, "\n")

    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
      row[["Scale"]], row[["raw_alpha"]], row[["std.alpha"]], row[["G6(smc)"]], row[["average_r"]], row[["S/N"]], row[["ase"]], row[["mean"]], row[["sd"]], row[["median_r"]]))
  }
} else {
  cat("ERROR: No summary data to display!\n")
  html_content <- paste0(html_content, "<tr><td>No data available</td></tr>")
}

html_content <- paste0(html_content, "</table>")

# Add item statistics with alpha if dropped
html_content <- paste0(html_content, "
<h2>Item Statistics</h2>
<table>
<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
")

for(i in 1:nrow(item_data)) {
  row <- item_data[i,]
  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
}

html_content <- paste0(html_content, "</table>")

# Add ICC(2,1) results across time points
if(nrow(icc_data) > 0) {
  html_content <- paste0(html_content, "
<h2>ICC(2,1) Results (Temporal Stability)</h2>
<table>
<tr><th>Construct</th><th>ICC(2,1)</th><th>95% CI Lower</th><th>95% CI Upper</th><th>F Value</th><th>p Value</th></tr>
")

  for(i in 1:nrow(icc_data)) {
    row <- icc_data[i,]

    # Format p-value appropriately
    p_val <- row$p_Value

    # Debug: print the actual p-value
    cat("Debug - p-value for", row$Construct, ":", p_val, "\n")

    if(p_val < 1e-10) {
      p_display <- "< 1e-10"  # For extremely small values
    } else if(p_val < 0.001) {
      p_display <- sprintf("%.2e", p_val)  # Scientific notation
    } else {
      p_display <- sprintf("%.4f", p_val)  # 4 decimal places
    }

    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%s</td></tr>",
      row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, p_display))
  }

  html_content <- paste0(html_content, "</table>
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028170923.r b/.history/eohi2/reliability analysis_20251028170923.r deleted file mode 100644 index f62ee62..0000000 --- a/.history/eohi2/reliability analysis_20251028170923.r +++ /dev/null @@ -1,289 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- columns:", paste(colnames(total_row), collapse = ", "), "\n") - - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - if(ncol(construct_data) == 0) { - construct_data <- data.frame(scale_mean) - colnames(construct_data)[1] <- scale_name - } else { - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - - # Debug: print actual p-value and all ICC results - cat("ICC results 
for", construct_name, ":\n") - cat(" ICC value:", icc_result$value, "\n") - cat(" F value:", icc_result$Fvalue, "\n") - cat(" p-value:", icc_result$p.value, "\n") - cat(" p-value class:", class(icc_result$p.value), "\n") - cat(" p-value length:", length(icc_result$p.value), "\n") - cat(" p-value is.na:", is.na(icc_result$p.value), "\n") - cat(" p-value is.null:", is.null(icc_result$p.value), "\n") - cat(" p-value == 0:", icc_result$p.value == 0, "\n") - cat("\n") - if(nrow(icc_data) == 0) { - icc_data <- icc_row - } else { - icc_data <- rbind(icc_data, icc_row) - } - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n") - cat("First few rows:\n") - print(head(summary_data)) -} else { - cat("ERROR: summary_data is empty!\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

Reliability Analysis Results

-

Generated on: ", Sys.time(), "

-

Overall Scale Statistics

-
- - ") - -if(nrow(summary_data) > 0) { - for(i in 1:nrow(summary_data)) { - row <- summary_data[i,] - - # Debug: print what we're trying to access - cat("Row", i, "- trying to access columns:", paste(colnames(row), collapse = ", "), "\n") - cat("Raw alpha value:", row$raw_alpha, "\n") - - html_content <- paste0(html_content, sprintf("", - row[["Scale"]], row[["raw_alpha"]], row[["std.alpha"]], row[["G6(smc)"]], row[["average_r"]], row[["S/N"]], row[["ase"]], row[["mean"]], row[["sd"]], row[["median_r"]])) - } -} else { - cat("ERROR: No summary data to display!\n") - html_content <- paste0(html_content, "") -} - -html_content <- paste0(html_content, "
ScaleRaw AlphaStd AlphaG6(SMC)Average rS/NASEMeanSDMedian r
%s%.4f%.4f%.4f%.4f%.4f%.4f%.4f%.4f%.4f
No data available
") - -# Add item statistics with alpha if dropped -html_content <- paste0(html_content, "

-# Add item statistics with alpha if dropped
-html_content <- paste0(html_content, "
-<h2>Item Statistics</h2>
-<table border='1'>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
-")
-
-for(i in 1:nrow(item_data)) {
-  row <- item_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
-}
-
-html_content <- paste0(html_content, "</table>")

-# Add ICC(2,1) results across time points
-if(nrow(icc_data) > 0) {
-  html_content <- paste0(html_content, "
-<h2>ICC(2,1) Results (Temporal Stability)</h2>
-<table border='1'>
-<tr><th>Construct</th><th>ICC(2,1)</th><th>95% CI Lower</th><th>95% CI Upper</th><th>F Value</th><th>p Value</th></tr>
-")
-
-  for(i in 1:nrow(icc_data)) {
-    row <- icc_data[i,]
-
-    # Format p-value appropriately
-    p_val <- row$p_Value
-
-    # Debug: print the actual p-value
-    cat("Debug - p-value for", row$Construct, ":", p_val, "\n")
-
-    if(p_val < 1e-10) {
-      p_display <- "< 1e-10"  # For extremely small values
-    } else if(p_val < 0.001) {
-      p_display <- sprintf("%.2e", p_val)  # Scientific notation
-    } else {
-      p_display <- sprintf("%.4f", p_val)  # 4 decimal places
-    }
-
-    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%s</td></tr>",
-      row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, p_display))
-  }
-
-  html_content <- paste0(html_content, "</table>
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028170935.r b/.history/eohi2/reliability analysis_20251028170935.r deleted file mode 100644 index f62ee62..0000000 --- a/.history/eohi2/reliability analysis_20251028170935.r +++ /dev/null @@ -1,289 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- columns:", paste(colnames(total_row), collapse = ", "), "\n") - - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - if(ncol(construct_data) == 0) { - construct_data <- data.frame(scale_mean) - colnames(construct_data)[1] <- scale_name - } else { - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - - # Debug: print actual p-value and all ICC results - cat("ICC results 
for", construct_name, ":\n") - cat(" ICC value:", icc_result$value, "\n") - cat(" F value:", icc_result$Fvalue, "\n") - cat(" p-value:", icc_result$p.value, "\n") - cat(" p-value class:", class(icc_result$p.value), "\n") - cat(" p-value length:", length(icc_result$p.value), "\n") - cat(" p-value is.na:", is.na(icc_result$p.value), "\n") - cat(" p-value is.null:", is.null(icc_result$p.value), "\n") - cat(" p-value == 0:", icc_result$p.value == 0, "\n") - cat("\n") - if(nrow(icc_data) == 0) { - icc_data <- icc_row - } else { - icc_data <- rbind(icc_data, icc_row) - } - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n") - cat("First few rows:\n") - print(head(summary_data)) -} else { - cat("ERROR: summary_data is empty!\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

Reliability Analysis Results

-

Generated on: ", Sys.time(), "

-

Overall Scale Statistics

-
- - ") - -if(nrow(summary_data) > 0) { - for(i in 1:nrow(summary_data)) { - row <- summary_data[i,] - - # Debug: print what we're trying to access - cat("Row", i, "- trying to access columns:", paste(colnames(row), collapse = ", "), "\n") - cat("Raw alpha value:", row$raw_alpha, "\n") - - html_content <- paste0(html_content, sprintf("", - row[["Scale"]], row[["raw_alpha"]], row[["std.alpha"]], row[["G6(smc)"]], row[["average_r"]], row[["S/N"]], row[["ase"]], row[["mean"]], row[["sd"]], row[["median_r"]])) - } -} else { - cat("ERROR: No summary data to display!\n") - html_content <- paste0(html_content, "") -} - -html_content <- paste0(html_content, "
ScaleRaw AlphaStd AlphaG6(SMC)Average rS/NASEMeanSDMedian r
%s%.4f%.4f%.4f%.4f%.4f%.4f%.4f%.4f%.4f
No data available
") - -# Add item statistics with alpha if dropped -html_content <- paste0(html_content, "

-# Add item statistics with alpha if dropped
-html_content <- paste0(html_content, "
-<h2>Item Statistics</h2>
-<table border='1'>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
-")
-
-for(i in 1:nrow(item_data)) {
-  row <- item_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
-}
-
-html_content <- paste0(html_content, "</table>")

-# Add ICC(2,1) results across time points
-if(nrow(icc_data) > 0) {
-  html_content <- paste0(html_content, "
-<h2>ICC(2,1) Results (Temporal Stability)</h2>
-<table border='1'>
-<tr><th>Construct</th><th>ICC(2,1)</th><th>95% CI Lower</th><th>95% CI Upper</th><th>F Value</th><th>p Value</th></tr>
-")
-
-  for(i in 1:nrow(icc_data)) {
-    row <- icc_data[i,]
-
-    # Format p-value appropriately
-    p_val <- row$p_Value
-
-    # Debug: print the actual p-value
-    cat("Debug - p-value for", row$Construct, ":", p_val, "\n")
-
-    if(p_val < 1e-10) {
-      p_display <- "< 1e-10"  # For extremely small values
-    } else if(p_val < 0.001) {
-      p_display <- sprintf("%.2e", p_val)  # Scientific notation
-    } else {
-      p_display <- sprintf("%.4f", p_val)  # 4 decimal places
-    }
-
-    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%s</td></tr>",
-      row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, p_display))
-  }
-
-  html_content <- paste0(html_content, "</table>
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability analysis_20251028171635.r b/.history/eohi2/reliability analysis_20251028171635.r deleted file mode 100644 index f62ee62..0000000 --- a/.history/eohi2/reliability analysis_20251028171635.r +++ /dev/null @@ -1,289 +0,0 @@ -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") - -options(scipen = 999) - -df <- read.csv("eohi2.csv") -library(psych) -library(dplyr) -library(knitr) -library(irr) - -# Your named variable sets (replace df with your actual dataframe name) -present_pref_vars <- c("present_pref_read", "present_pref_music", "present_pref_tv", "present_pref_nap", "present_pref_travel") -past_5_pref_vars <- c("past_5_pref_read", "past_5_pref_music", "past_5_pref_TV", "past_5_pref_nap", "past_5_pref_travel") -past_10_pref_vars <- c("past_10_pref_read", "past_10_pref_music", "past_10_pref_TV", "past_10_pref_nap", "past_10_pref_travel") -fut_5_pref_vars <- c("fut_5_pref_read", "fut_5_pref_music", "fut_5_pref_TV", "fut_5_pref_nap", "fut_5_pref_travel") -fut_10_pref_vars <- c("fut_10_pref_read", "fut_10_pref_music", "fut_10_pref_TV", "fut_10_pref_nap", "fut_10_pref_travel") - -present_pers_vars <- c("present_pers_extravert", "present_pers_critical", "present_pers_dependable", "present_pers_anxious", "present_pers_complex") -past_5_pers_vars <- c("past_5_pers_extravert", "past_5_pers_critical", "past_5_pers_dependable", "past_5_pers_anxious", "past_5_pers_complex") -past_10_pers_vars <- c("past_10_pers_extravert", "past_10_pers_critical", "past_10_pers_dependable", "past_10_pers_anxious", "past_10_pers_complex") -fut_5_pers_vars <- c("fut_5_pers_extravert", "fut_5_pers_critical", "fut_5_pers_dependable", "fut_5_pers_anxious", "fut_5_pers_complex") -fut_10_pers_vars <- c("fut_10_pers_extravert", "fut_10_pers_critical", "fut_10_pers_dependable", "fut_10_pers_anxious", "fut_10_pers_complex") - -present_val_vars <- c("present_val_obey", "present_val_trad", "present_val_opinion", "present_val_performance", "present_val_justice") -past_5_val_vars <- c("past_5_val_obey", "past_5_val_trad", "past_5_val_opinion", "past_5_val_performance", "past_5_val_justice") -past_10_val_vars <- c("past_10_val_obey", "past_10_val_trad", "past_10_val_opinion", "past_10_val_performance", "past_10_val_justice") -fut_5_val_vars <- c("fut_5_val_obey", "fut_5_val_trad", "fut_5_val_opinion", "fut_5_val_performance", "fut_5_val_justice") -fut_10_val_vars <- c("fut_10_val_obey", "fut_10_val_trad", "fut_10_val_opinion", "fut_10_val_performance", "fut_10_val_justice") - -all_scales <- list( - "Present_Preferences" = present_pref_vars, - "Past5_Preferences" = past_5_pref_vars, - "Past10_Preferences" = past_10_pref_vars, - "Fut5_Preferences" = fut_5_pref_vars, - 
"Fut10_Preferences" = fut_10_pref_vars, - "Present_Personality" = present_pers_vars, - "Past5_Personality" = past_5_pers_vars, - "Past10_Personality" = past_10_pers_vars, - "Fut5_Personality" = fut_5_pers_vars, - "Fut10_Personality" = fut_10_pers_vars, - "Present_Values" = present_val_vars, - "Past5_Values" = past_5_val_vars, - "Past10_Values" = past_10_val_vars, - "Fut5_Values" = fut_5_val_vars, - "Fut10_Values" = fut_10_val_vars -) - -# Reliability analysis loop -alpha_results <- list() -summary_data <- data.frame() -item_data <- data.frame() - -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - # Only run if there is more than one column present - if(ncol(scale_data) > 1) { - alpha_out <- psych::alpha(scale_data, check.keys = TRUE) - alpha_results[[scale_name]] <- alpha_out - - # Print full output for diagnostics - cat("\n----------", scale_name, "----------\n") - print(alpha_out$total) - print(alpha_out$item.stats) - print(alpha_out$alpha.drop) - - # Collect summary data for HTML - fix the data collection - total_row <- as.data.frame(alpha_out$total) - total_row$Scale <- scale_name - - # Debug: print what we're collecting - cat("Collecting data for", scale_name, "- columns:", paste(colnames(total_row), collapse = ", "), "\n") - - if(nrow(summary_data) == 0) { - summary_data <- total_row - } else { - summary_data <- rbind(summary_data, total_row) - } - - # Collect item data for HTML - item_row <- alpha_out$item.stats - alpha_drop_row <- alpha_out$alpha.drop - item_row$Scale <- scale_name - item_row$AlphaIfDropped <- alpha_drop_row$raw_alpha - item_data <- rbind(item_data, item_row) - } -} - -# Calculate ICC(2,1) across time points for each construct -icc_data <- data.frame() - -# Define construct groups -constructs <- list( - "Preferences" = c("Present_Preferences", "Past5_Preferences", "Past10_Preferences", "Fut5_Preferences", "Fut10_Preferences"), - "Personality" = c("Present_Personality", "Past5_Personality", "Past10_Personality", "Fut5_Personality", "Fut10_Personality"), - "Values" = c("Present_Values", "Past5_Values", "Past10_Values", "Fut5_Values", "Fut10_Values") -) - -for(construct_name in names(constructs)) { - construct_scales <- constructs[[construct_name]] - - # Get data for all time points of this construct - construct_data <- data.frame() - for(scale_name in construct_scales) { - if(scale_name %in% names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - # Calculate mean score for this time point - scale_mean <- rowMeans(scale_data, na.rm = TRUE) - if(ncol(construct_data) == 0) { - construct_data <- data.frame(scale_mean) - colnames(construct_data)[1] <- scale_name - } else { - construct_data <- cbind(construct_data, scale_mean) - colnames(construct_data)[ncol(construct_data)] <- scale_name - } - } - } - } - - # Calculate ICC(2,1) across time points - if(ncol(construct_data) > 1) { - tryCatch({ - icc_result <- icc(construct_data, model = "twoway", type = "consistency", unit = "single") - cat("ICC(2,1) for", construct_name, "across time points:", round(icc_result$value, 4), "\n") - - # Collect ICC data for HTML - icc_row <- data.frame( - Construct = construct_name, - ICC_2_1 = icc_result$value, - ICC_CI_Lower = icc_result$lbound, - ICC_CI_Upper = icc_result$ubound, - F_Value = icc_result$Fvalue, - p_Value = icc_result$p.value, - stringsAsFactors = FALSE - ) - - # Debug: print actual p-value and all ICC results - cat("ICC results 
for", construct_name, ":\n") - cat(" ICC value:", icc_result$value, "\n") - cat(" F value:", icc_result$Fvalue, "\n") - cat(" p-value:", icc_result$p.value, "\n") - cat(" p-value class:", class(icc_result$p.value), "\n") - cat(" p-value length:", length(icc_result$p.value), "\n") - cat(" p-value is.na:", is.na(icc_result$p.value), "\n") - cat(" p-value is.null:", is.null(icc_result$p.value), "\n") - cat(" p-value == 0:", icc_result$p.value == 0, "\n") - cat("\n") - if(nrow(icc_data) == 0) { - icc_data <- icc_row - } else { - icc_data <- rbind(icc_data, icc_row) - } - }, error = function(e) { - cat("ICC calculation failed for", construct_name, ":", e$message, "\n") - }) - } -} - -# Debug: check summary_data -cat("Final summary_data has", nrow(summary_data), "rows\n") -if(nrow(summary_data) > 0) { - cat("Column names:", paste(colnames(summary_data), collapse = ", "), "\n") - cat("First few rows:\n") - print(head(summary_data)) -} else { - cat("ERROR: summary_data is empty!\n") -} - -# Create simple HTML report -html_content <- paste0(" - - - Reliability Analysis Results - - - -

Reliability Analysis Results

-

Generated on: ", Sys.time(), "

-

Overall Scale Statistics

-
- - ") - -if(nrow(summary_data) > 0) { - for(i in 1:nrow(summary_data)) { - row <- summary_data[i,] - - # Debug: print what we're trying to access - cat("Row", i, "- trying to access columns:", paste(colnames(row), collapse = ", "), "\n") - cat("Raw alpha value:", row$raw_alpha, "\n") - - html_content <- paste0(html_content, sprintf("", - row[["Scale"]], row[["raw_alpha"]], row[["std.alpha"]], row[["G6(smc)"]], row[["average_r"]], row[["S/N"]], row[["ase"]], row[["mean"]], row[["sd"]], row[["median_r"]])) - } -} else { - cat("ERROR: No summary data to display!\n") - html_content <- paste0(html_content, "") -} - -html_content <- paste0(html_content, "
ScaleRaw AlphaStd AlphaG6(SMC)Average rS/NASEMeanSDMedian r
%s%.4f%.4f%.4f%.4f%.4f%.4f%.4f%.4f%.4f
No data available
") - -# Add item statistics with alpha if dropped -html_content <- paste0(html_content, "

-# Add item statistics with alpha if dropped
-html_content <- paste0(html_content, "
-<h2>Item Statistics</h2>
-<table border='1'>
-<tr><th>Scale</th><th>Item</th><th>n</th><th>Raw r</th><th>Std r</th><th>r.cor</th><th>r.drop</th><th>Mean</th><th>SD</th><th>Alpha if Dropped</th></tr>
-")
-
-for(i in 1:nrow(item_data)) {
-  row <- item_data[i,]
-  html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%s</td><td>%.0f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td></tr>",
-    row$Scale, rownames(row), row$n, row$raw.r, row$std.r, row$r.cor, row$r.drop, row$mean, row$sd, row$AlphaIfDropped))
-}
-
-html_content <- paste0(html_content, "</table>")

-# Add ICC(2,1) results across time points
-if(nrow(icc_data) > 0) {
-  html_content <- paste0(html_content, "
-<h2>ICC(2,1) Results (Temporal Stability)</h2>
-<table border='1'>
-<tr><th>Construct</th><th>ICC(2,1)</th><th>95% CI Lower</th><th>95% CI Upper</th><th>F Value</th><th>p Value</th></tr>
-")
-
-  for(i in 1:nrow(icc_data)) {
-    row <- icc_data[i,]
-
-    # Format p-value appropriately
-    p_val <- row$p_Value
-
-    # Debug: print the actual p-value
-    cat("Debug - p-value for", row$Construct, ":", p_val, "\n")
-
-    if(p_val < 1e-10) {
-      p_display <- "< 1e-10"  # For extremely small values
-    } else if(p_val < 0.001) {
-      p_display <- sprintf("%.2e", p_val)  # Scientific notation
-    } else {
-      p_display <- sprintf("%.4f", p_val)  # 4 decimal places
-    }
-
-    html_content <- paste0(html_content, sprintf("<tr><td>%s</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%.4f</td><td>%s</td></tr>",
-      row$Construct, row$ICC_2_1, row$ICC_CI_Lower, row$ICC_CI_Upper, row$F_Value, p_display))
-  }
-
-  html_content <- paste0(html_content, "</table>
") -} - -html_content <- paste0(html_content, " - -") - -# Write HTML file -writeLines(html_content, "reliability_analysis_report.html") - - -# Check for reversed items -cat("\n=== REVERSED ITEMS CHECK ===\n") -for(scale_name in names(all_scales)) { - vars <- all_scales[[scale_name]] - scale_data <- df %>% select(all_of(vars)) - - if(ncol(scale_data) > 1) { - alpha_out <- alpha_results[[scale_name]] - - # Check if any items were reversed by looking at the keys - if(!is.null(alpha_out$keys)) { - keys_df <- as.data.frame(alpha_out$keys) - reversed_items <- rownames(keys_df)[keys_df[,1] < 0] - if(length(reversed_items) > 0) { - cat(scale_name, "reversed items:", paste(reversed_items, collapse = ", "), "\n") - } else { - cat(scale_name, ": No items reversed\n") - } - } else { - cat(scale_name, ": No keys available\n") - } - } -} - diff --git a/.history/eohi2/reliability_summary_table_20251028144409.csv b/.history/eohi2/reliability_summary_table_20251028144409.csv deleted file mode 100644 index 7aaaece..0000000 --- a/.history/eohi2/reliability_summary_table_20251028144409.csv +++ /dev/null @@ -1,16 +0,0 @@ -Scale,Time_Frame,Domain,Raw_Alpha,Std_Alpha,G6_SMC,Average_r,S_N,ASE,Mean,SD,Median_r -Present_Preferences,Present,Preferences,0.5248372,0.5538292,0.5041749,0.1988841,1.241294,0.0336335,1.582004,0.8713115,0.1965662 -Past5_Preferences,Past5,Preferences,0.5649127,0.5893789,0.5467387,0.2230397,1.435335,0.03089417,1.410634,0.9559399,0.2104482 -Past10_Preferences,Past10,Preferences,0.488938,0.523742,0.4772804,0.1802878,1.099702,0.03649826,1.397546,0.9004519,0.1690398 -Fut5_Preferences,Fut5,Preferences,0.5692099,0.587553,0.5414701,0.2217358,1.424554,0.03082567,1.587321,0.9036826,0.2250345 -Fut10_Preferences,Fut10,Preferences,0.5698939,0.5847131,0.5479539,0.2197222,1.407974,0.03076925,1.58773,0.8931506,0.2489621 -Present_Personality,Present,Personality,0.5150363,0.5266182,0.5589141,0.1819987,1.11246,0.0351599,0.6861963,0.9615554,0.2041057 -Past5_Personality,Past5,Personality,0.6124713,0.619984,0.6719927,0.2460191,1.631468,0.0287073,0.5308793,1.114899,0.2033224 -Past10_Personality,Past10,Personality,0.5912415,0.6040201,0.6557158,0.2337612,1.525381,0.03034508,0.5083845,1.11603,0.2218243 -Fut5_Personality,Fut5,Personality,0.5110326,0.5338022,0.5987905,0.186332,1.145012,0.03652723,0.8159509,0.9646153,0.1559173 -Fut10_Personality,Fut10,Personality,0.6090355,0.6252252,0.685506,0.2501803,1.668269,0.02925517,0.7873211,1.045719,0.2084592 -Present_Values,Present,Values,0.5355957,0.5489562,0.5671587,0.1957639,1.217079,0.03206684,1.280982,0.8331172,0.2003739 -Past5_Values,Past5,Values,0.5392641,0.5606175,0.5585066,0.2033042,1.275921,0.03308046,1.248671,0.8889135,0.1781123 -Past10_Values,Past10,Values,0.598008,0.6176378,0.6277172,0.2441788,1.615321,0.028953,1.22863,0.957626,0.22824 -Fut5_Values,Fut5,Values,0.5616573,0.5904551,0.6070898,0.2238116,1.441735,0.03130214,1.234356,0.9067855,0.2086639 -Fut10_Values,Fut10,Values,0.5247822,0.5552694,0.5921832,0.1998146,1.248552,0.03421231,1.234356,0.8939492,0.1716286 diff --git a/.history/eohi2/reliability_summary_table_20251028144438.csv b/.history/eohi2/reliability_summary_table_20251028144438.csv deleted file mode 100644 index 7aaaece..0000000 --- a/.history/eohi2/reliability_summary_table_20251028144438.csv +++ /dev/null @@ -1,16 +0,0 @@ -Scale,Time_Frame,Domain,Raw_Alpha,Std_Alpha,G6_SMC,Average_r,S_N,ASE,Mean,SD,Median_r -Present_Preferences,Present,Preferences,0.5248372,0.5538292,0.5041749,0.1988841,1.241294,0.0336335,1.582004,0.8713115,0.1965662 
-Past5_Preferences,Past5,Preferences,0.5649127,0.5893789,0.5467387,0.2230397,1.435335,0.03089417,1.410634,0.9559399,0.2104482 -Past10_Preferences,Past10,Preferences,0.488938,0.523742,0.4772804,0.1802878,1.099702,0.03649826,1.397546,0.9004519,0.1690398 -Fut5_Preferences,Fut5,Preferences,0.5692099,0.587553,0.5414701,0.2217358,1.424554,0.03082567,1.587321,0.9036826,0.2250345 -Fut10_Preferences,Fut10,Preferences,0.5698939,0.5847131,0.5479539,0.2197222,1.407974,0.03076925,1.58773,0.8931506,0.2489621 -Present_Personality,Present,Personality,0.5150363,0.5266182,0.5589141,0.1819987,1.11246,0.0351599,0.6861963,0.9615554,0.2041057 -Past5_Personality,Past5,Personality,0.6124713,0.619984,0.6719927,0.2460191,1.631468,0.0287073,0.5308793,1.114899,0.2033224 -Past10_Personality,Past10,Personality,0.5912415,0.6040201,0.6557158,0.2337612,1.525381,0.03034508,0.5083845,1.11603,0.2218243 -Fut5_Personality,Fut5,Personality,0.5110326,0.5338022,0.5987905,0.186332,1.145012,0.03652723,0.8159509,0.9646153,0.1559173 -Fut10_Personality,Fut10,Personality,0.6090355,0.6252252,0.685506,0.2501803,1.668269,0.02925517,0.7873211,1.045719,0.2084592 -Present_Values,Present,Values,0.5355957,0.5489562,0.5671587,0.1957639,1.217079,0.03206684,1.280982,0.8331172,0.2003739 -Past5_Values,Past5,Values,0.5392641,0.5606175,0.5585066,0.2033042,1.275921,0.03308046,1.248671,0.8889135,0.1781123 -Past10_Values,Past10,Values,0.598008,0.6176378,0.6277172,0.2441788,1.615321,0.028953,1.22863,0.957626,0.22824 -Fut5_Values,Fut5,Values,0.5616573,0.5904551,0.6070898,0.2238116,1.441735,0.03130214,1.234356,0.9067855,0.2086639 -Fut10_Values,Fut10,Values,0.5247822,0.5552694,0.5921832,0.1998146,1.248552,0.03421231,1.234356,0.8939492,0.1716286 diff --git a/.history/eohi2/reliability_summary_table_20251028173027.csv b/.history/eohi2/reliability_summary_table_20251028173027.csv deleted file mode 100644 index 7aaaece..0000000 --- a/.history/eohi2/reliability_summary_table_20251028173027.csv +++ /dev/null @@ -1,16 +0,0 @@ -Scale,Time_Frame,Domain,Raw_Alpha,Std_Alpha,G6_SMC,Average_r,S_N,ASE,Mean,SD,Median_r -Present_Preferences,Present,Preferences,0.5248372,0.5538292,0.5041749,0.1988841,1.241294,0.0336335,1.582004,0.8713115,0.1965662 -Past5_Preferences,Past5,Preferences,0.5649127,0.5893789,0.5467387,0.2230397,1.435335,0.03089417,1.410634,0.9559399,0.2104482 -Past10_Preferences,Past10,Preferences,0.488938,0.523742,0.4772804,0.1802878,1.099702,0.03649826,1.397546,0.9004519,0.1690398 -Fut5_Preferences,Fut5,Preferences,0.5692099,0.587553,0.5414701,0.2217358,1.424554,0.03082567,1.587321,0.9036826,0.2250345 -Fut10_Preferences,Fut10,Preferences,0.5698939,0.5847131,0.5479539,0.2197222,1.407974,0.03076925,1.58773,0.8931506,0.2489621 -Present_Personality,Present,Personality,0.5150363,0.5266182,0.5589141,0.1819987,1.11246,0.0351599,0.6861963,0.9615554,0.2041057 -Past5_Personality,Past5,Personality,0.6124713,0.619984,0.6719927,0.2460191,1.631468,0.0287073,0.5308793,1.114899,0.2033224 -Past10_Personality,Past10,Personality,0.5912415,0.6040201,0.6557158,0.2337612,1.525381,0.03034508,0.5083845,1.11603,0.2218243 -Fut5_Personality,Fut5,Personality,0.5110326,0.5338022,0.5987905,0.186332,1.145012,0.03652723,0.8159509,0.9646153,0.1559173 -Fut10_Personality,Fut10,Personality,0.6090355,0.6252252,0.685506,0.2501803,1.668269,0.02925517,0.7873211,1.045719,0.2084592 -Present_Values,Present,Values,0.5355957,0.5489562,0.5671587,0.1957639,1.217079,0.03206684,1.280982,0.8331172,0.2003739 
-Past5_Values,Past5,Values,0.5392641,0.5606175,0.5585066,0.2033042,1.275921,0.03308046,1.248671,0.8889135,0.1781123 -Past10_Values,Past10,Values,0.598008,0.6176378,0.6277172,0.2441788,1.615321,0.028953,1.22863,0.957626,0.22824 -Fut5_Values,Fut5,Values,0.5616573,0.5904551,0.6070898,0.2238116,1.441735,0.03130214,1.234356,0.9067855,0.2086639 -Fut10_Values,Fut10,Values,0.5247822,0.5552694,0.5921832,0.1998146,1.248552,0.03421231,1.234356,0.8939492,0.1716286 diff --git a/.history/eohi2/verify_means_20251008115109.R b/.history/eohi2/verify_means_20251008115109.R deleted file mode 100644 index 32f8e0d..0000000 --- a/.history/eohi2/verify_means_20251008115109.R +++ /dev/null @@ -1,68 +0,0 @@ -options(scipen = 999) -setwd("C:/Users/irina/Documents/DND/EOHI/eohi2") -data <- read.csv("eohi2.csv") - -cat("\n================================================================================\n") -cat("VERIFICATION SUMMARY: ALL 11 MEAN VARIABLES (Rows 1-2)\n") -cat("================================================================================\n\n") - -for (i in 1:2) { - cat("-------------------------- ROW", i, "----------------------------\n\n") - - cat("NARROW-SCOPE MEANS (15 items each):\n") - cat("-----------------------------------\n") - - # Set 1: NPast_5_mean - v1 <- as.numeric(data[i, c('NPast_5_pref_read','NPast_5_pref_music','NPast_5_pref_TV','NPast_5_pref_nap','NPast_5_pref_travel','NPast_5_pers_extravert','NPast_5_pers_critical','NPast_5_pers_dependable','NPast_5_pers_anxious','NPast_5_pers_complex','NPast_5_val_obey','NPast_5_val_trad','NPast_5_val_opinion','NPast_5_val_performance','NPast_5_val_justice')]) - cat(sprintf("1. NPast_5_mean: Sum=%2d, Calc=%.10f, Stored=%.10f\n", sum(v1), mean(v1), data[i,'NPast_5_mean'])) - - # Set 2: NPast_10_mean - v2 <- as.numeric(data[i, c('NPast_10_pref_read','NPast_10_pref_music','NPast_10_pref_TV','NPast_10_pref_nap','NPast_10_pref_travel','NPast_10_pers_extravert','NPast_10_pers_critical','NPast_10_pers_dependable','NPast_10_pers_anxious','NPast_10_pers_complex','NPast_10_val_obey','NPast_10_val_trad','NPast_10_val_opinion','NPast_10_val_performance','NPast_10_val_justice')]) - cat(sprintf("2. NPast_10_mean: Sum=%2d, Calc=%.10f, Stored=%.10f\n", sum(v2), mean(v2), data[i,'NPast_10_mean'])) - - # Set 3: NFut_5_mean - v3 <- as.numeric(data[i, c('NFut_5_pref_read','NFut_5_pref_music','NFut_5_pref_TV','NFut_5_pref_nap','NFut_5_pref_travel','NFut_5_pers_extravert','NFut_5_pers_critical','NFut_5_pers_dependable','NFut_5_pers_anxious','NFut_5_pers_complex','NFut_5_val_obey','NFut_5_val_trad','NFut_5_val_opinion','NFut_5_val_performance','NFut_5_val_justice')]) - cat(sprintf("3. NFut_5_mean: Sum=%2d, Calc=%.10f, Stored=%.10f\n", sum(v3), mean(v3), data[i,'NFut_5_mean'])) - - # Set 4: NFut_10_mean - v4 <- as.numeric(data[i, c('NFut_10_pref_read','NFut_10_pref_music','NFut_10_pref_TV','NFut_10_pref_nap','NFut_10_pref_travel','NFut_10_pers_extravert','NFut_10_pers_critical','NFut_10_pers_dependable','NFut_10_pers_anxious','NFut_10_pers_complex','NFut_10_val_obey','NFut_10_val_trad','NFut_10_val_opinion','NFut_10_val_performance','NFut_10_val_justice')]) - cat(sprintf("4. 
NFut_10_mean: Sum=%2d, Calc=%.10f, Stored=%.10f\n", sum(v4), mean(v4), data[i,'NFut_10_mean'])) - - # Set 5: X5.10past_mean - v5 <- as.numeric(data[i, c('X5.10past_pref_read','X5.10past_pref_music','X5.10past_pref_TV','X5.10past_pref_nap','X5.10past_pref_travel','X5.10past_pers_extravert','X5.10past_pers_critical','X5.10past_pers_dependable','X5.10past_pers_anxious','X5.10past_pers_complex','X5.10past_val_obey','X5.10past_val_trad','X5.10past_val_opinion','X5.10past_val_performance','X5.10past_val_justice')]) - cat(sprintf("5. X5.10past_mean: Sum=%2d, Calc=%.10f, Stored=%.10f\n", sum(v5), mean(v5), data[i,'X5.10past_mean'])) - - # Set 6: X5.10fut_mean - v6 <- as.numeric(data[i, c('X5.10fut_pref_read','X5.10fut_pref_music','X5.10fut_pref_TV','X5.10fut_pref_nap','X5.10fut_pref_travel','X5.10fut_pers_extravert','X5.10fut_pers_critical','X5.10fut_pers_dependable','X5.10fut_pers_anxious','X5.10fut_pers_complex','X5.10fut_val_obey','X5.10fut_val_trad','X5.10fut_val_opinion','X5.10fut_val_performance','X5.10fut_val_justice')]) - cat(sprintf("6. X5.10fut_mean: Sum=%2d, Calc=%.10f, Stored=%.10f\n", sum(v6), mean(v6), data[i,'X5.10fut_mean'])) - - cat("\nGLOBAL-SCOPE MEANS (30 items each):\n") - cat("-----------------------------------\n") - - # Set 7: NPast_global_mean - v7 <- c(v1, v2) - cat(sprintf("7. NPast_global: Sum=%2d, Calc=%.10f, Stored=%.10f\n", sum(v7), mean(v7), data[i,'NPast_global_mean'])) - - # Set 8: NFut_global_mean - v8 <- c(v3, v4) - cat(sprintf("8. NFut_global: Sum=%2d, Calc=%.10f, Stored=%.10f\n", sum(v8), mean(v8), data[i,'NFut_global_mean'])) - - # Set 9: X5.10_global_mean - v9 <- c(v5, v6) - cat(sprintf("9. X5.10_global: Sum=%2d, Calc=%.10f, Stored=%.10f\n", sum(v9), mean(v9), data[i,'X5.10_global_mean'])) - - # Set 10: N5_global_mean - v10 <- c(v1, v3) - cat(sprintf("10. N5_global: Sum=%2d, Calc=%.10f, Stored=%.10f\n", sum(v10), mean(v10), data[i,'N5_global_mean'])) - - # Set 11: N10_global_mean - v11 <- c(v2, v4) - cat(sprintf("11. 
N10_global: Sum=%2d, Calc=%.10f, Stored=%.10f\n", sum(v11), mean(v11), data[i,'N10_global_mean'])) - - cat("\n") -} - -cat("================================================================================\n") -cat("ALL CALCULATIONS VERIFIED!\n") -cat("================================================================================\n") - diff --git a/.history/mixed anova - domain means_20250912124308.r b/.history/mixed anova - domain means_20250912124308.r deleted file mode 100644 index 141675c..0000000 --- a/.history/mixed anova - domain means_20250912124308.r +++ /dev/null @@ -1,572 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -cat("\nChecking available domain mean columns:\n") -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - cat("Warning: Missing variables:", paste(missing_vars, collapse = ", "), "\n") -} else { - cat("All required domain mean variables found!\n") -} - -# ============================================================================= -# STEP 1: DATA PIVOTING TO LONG FORMAT -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 1: DATA PIVOTING TO LONG FORMAT\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -cat("Domain mapping:\n") -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - cat("Warning: Variable", var_name, "not found in data\n") - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "GROUP", "TEMPORAL_DO", "ITEM_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # 
Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$GROUP <- as.factor(long_data$GROUP) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - long_data$ITEM_DO <- as.factor(long_data$ITEM_DO) - - return(long_data) -} - -# Pivot data to long format -cat("\nPivoting data to long format...\n") -long_data <- pivot_domain_means(data, domain_mapping) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique participants:", length(unique(long_data$pID)), "\n") -cat("TIME levels:", paste(levels(long_data$TIME), collapse = ", "), "\n") -cat("DOMAIN levels:", paste(levels(long_data$DOMAIN), collapse = ", "), "\n") - -# Display structure and sample -cat("\nLong data structure:\n") -str(long_data) - -cat("\nFirst 10 rows of long_data:\n") -print(head(long_data, 10)) - -# Show example data for one participant -cat("\nExample: Participant 1 across all domains and times:\n") -participant_1_data <- long_data[long_data$pID == 1, c("pID", "GROUP", "TEMPORAL_DO", "ITEM_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# STEP 2: ASSUMPTION CHECKING -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 2: ASSUMPTION CHECKING\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 2.1 Check for missing values -cat("\n2.1 Missing Values Check:\n") -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("Missing values by TIME and DOMAIN:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\n2.2 Outlier Detection:\n") -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\n2.3 Normality Tests:\n") -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\n2.4 
Homogeneity of Variance Tests:\n") - -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance across TIME within each DOMAIN:\n") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance across DOMAIN within each TIME:\n") -print(homogeneity_domain) - -# ============================================================================= -# STEP 3: DESCRIPTIVE STATISTICS -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# Overall descriptive statistics -desc_stats <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE), 5), - sd = round(sd(MEAN_DIFFERENCE), 5), - median = round(median(MEAN_DIFFERENCE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75), 5), - min = round(min(MEAN_DIFFERENCE), 5), - max = round(max(MEAN_DIFFERENCE), 5), - .groups = 'drop' - ) - -cat("Descriptive statistics by TIME and DOMAIN:\n") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_group <- long_data_clean %>% - group_by(GROUP, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE), 5), - sd = round(sd(MEAN_DIFFERENCE), 5), - .groups = 'drop' - ) - -cat("\nDescriptive statistics by GROUP, TIME, and DOMAIN:\n") -print(desc_stats_by_group) - -# ============================================================================= -# STEP 4: MIXED ANOVA ANALYSES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Main Mixed ANOVA -cat("\n4.1 Main Mixed ANOVA:\n") -cat("Within-subjects factors: TIME, DOMAIN\n") -cat("Between-subjects factors: GROUP, TEMPORAL_DO, ITEM_DO\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - main_anova <- ezANOVA( - data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = c(TIME, DOMAIN), - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Main ANOVA Results:\n") - print(main_anova) - - # Check sphericity - if (!is.null(main_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(main_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in main ANOVA:", e$message, "\n") - - # Try simpler model without all between-subjects factors - cat("Attempting simpler model with only GROUP as between-subjects factor...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = c(TIME, DOMAIN), - between = GROUP, - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Simplified ANOVA Results:\n") - print(simple_anova) - - main_anova <<- simple_anova - - }, error = function(e2) { - cat("Simplified ANOVA also failed:", 
e2$message, "\n") - }) -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -domain_results <- list() - -for (domain in levels(long_data_clean$DOMAIN)) { - cat("\nAnalyzing domain:", domain, "\n") - - domain_data <- long_data_clean[long_data_clean$DOMAIN == domain, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = TIME, - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain, ":\n") - print(domain_anova) - - domain_results[[domain]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = TIME, - between = GROUP, - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Time-specific analyses -cat("\n4.3 Time-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -time_results <- list() - -for (time in levels(long_data_clean$TIME)) { - cat("\nAnalyzing time:", time, "\n") - - time_data <- long_data_clean[long_data_clean$TIME == time, ] - - tryCatch({ - time_anova <- ezANOVA( - data = time_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = DOMAIN, - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", time, ":\n") - print(time_anova) - - time_results[[time]] <- time_anova - - }, error = function(e) { - cat("Error in ANOVA for", time, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = time_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = DOMAIN, - between = GROUP, - type = 3, - detailed = TRUE - ) - print(simple_anova) - time_results[[time]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# ============================================================================= -# STEP 5: POST-HOC ANALYSES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 5: POST-HOC ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 5.1 Pairwise comparisons for significant effects -if (exists("main_anova") && !is.null(main_anova)) { - cat("\n5.1 Post-hoc comparisons for main effects:\n") - - # Check for significant main effects and interactions - anova_table <- main_anova$ANOVA - - if ("TIME" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "TIME"] < 0.05) { - cat("Significant TIME main effect found. 
Computing pairwise comparisons...\n") - - # Simple paired t-tests for TIME effect - past_means <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - future_means <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - if (length(past_means) == length(future_means)) { - time_t_test <- t.test(past_means, future_means, paired = TRUE) - cat("Paired t-test for TIME effect:\n") - cat("t =", round(time_t_test$statistic, 5), - ", df =", time_t_test$parameter, - ", p =", round(time_t_test$p.value, 5), "\n") - cat("Mean difference (Past - Future):", round(time_t_test$estimate, 5), "\n") - } - } - - if ("DOMAIN" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "DOMAIN"] < 0.05) { - cat("Significant DOMAIN main effect found.\n") - - # Pairwise comparisons between domains - domain_means <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise(mean_diff = mean(MEAN_DIFFERENCE), .groups = 'drop') - - cat("Domain means:\n") - print(domain_means) - } - - if ("TIME:DOMAIN" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "TIME:DOMAIN"] < 0.05) { - cat("Significant TIME × DOMAIN interaction found.\n") - - # Simple effects analysis - interaction_means <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise(mean_diff = mean(MEAN_DIFFERENCE), .groups = 'drop') - - cat("TIME × DOMAIN interaction means:\n") - print(interaction_means) - } -} - -# ============================================================================= -# STEP 6: EFFECT SIZES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 6: EFFECT SIZES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -if (exists("main_anova") && !is.null(main_anova)) { - anova_table <- main_anova$ANOVA - - # Calculate partial eta squared for each effect - anova_table$partial_eta_squared <- round(anova_table$SSn / (anova_table$SSn + anova_table$SSd), 5) - - cat("Effect sizes (partial eta squared):\n") - effect_sizes <- anova_table[, c("Effect", "partial_eta_squared")] - print(effect_sizes) -} - -# ============================================================================= -# STEP 7: SUMMARY AND INTERPRETATION -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 7: SUMMARY AND INTERPRETATION\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -cat("Analysis Summary:\n") -cat("- Total participants:", length(unique(long_data_clean$pID)), "\n") -cat("- Total observations:", nrow(long_data_clean), "\n") -cat("- Within-subjects factors: TIME (Past vs Future), DOMAIN (Preferences, Personality, Values, Life)\n") -cat("- Between-subjects factors: GROUP, TEMPORAL_DO, ITEM_DO\n") -cat("- Dependent variable: Mean absolute differences in domain ratings\n") - -cat("\nResearch Question:\n") -cat("Do participants rate changes in domains differently from past to now vs past to future?\n") - -if (exists("main_anova") && !is.null(main_anova)) { - anova_table <- main_anova$ANOVA - - cat("\nKey Findings:\n") - - # Check for significant effects - significant_effects <- anova_table$Effect[anova_table$p < 0.05] - - if (length(significant_effects) > 0) { - cat("Significant effects found:\n") - for (effect in significant_effects) { - p_val <- anova_table$p[anova_table$Effect == effect] - cat("-", effect, "(p =", round(p_val, 5), ")\n") - } - } else { - cat("No significant effects found at α = 0.05\n") - } - - # Interpret TIME effect - 
if ("TIME" %in% anova_table$Effect) { - time_p <- anova_table$p[anova_table$Effect == "TIME"] - if (time_p < 0.05) { - cat("\nTIME Effect: Participants show different levels of change when comparing\n") - cat("past-to-now vs past-to-future perspectives (p =", round(time_p, 5), ")\n") - } else { - cat("\nTIME Effect: No significant difference between past-to-now and past-to-future\n") - cat("perspectives (p =", round(time_p, 5), ")\n") - } - } - - # Interpret DOMAIN effect - if ("DOMAIN" %in% anova_table$Effect) { - domain_p <- anova_table$p[anova_table$Effect == "DOMAIN"] - if (domain_p < 0.05) { - cat("\nDOMAIN Effect: Different domains show different levels of perceived change\n") - cat("(p =", round(domain_p, 5), ")\n") - } else { - cat("\nDOMAIN Effect: No significant differences between domains in perceived change\n") - cat("(p =", round(domain_p, 5), ")\n") - } - } - - # Interpret interaction - if ("TIME:DOMAIN" %in% anova_table$Effect) { - interaction_p <- anova_table$p[anova_table$Effect == "TIME:DOMAIN"] - if (interaction_p < 0.05) { - cat("\nTIME × DOMAIN Interaction: The effect of time perspective on perceived change\n") - cat("varies across domains (p =", round(interaction_p, 5), ")\n") - } else { - cat("\nTIME × DOMAIN Interaction: No significant interaction between time perspective\n") - cat("and domain (p =", round(interaction_p, 5), ")\n") - } - } -} - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("ANALYSIS COMPLETE!\n") -cat(paste(rep("=", 80), collapse = ""), "\n") diff --git a/.history/mixed anova - domain means_20250912124317.r b/.history/mixed anova - domain means_20250912124317.r deleted file mode 100644 index 141675c..0000000 --- a/.history/mixed anova - domain means_20250912124317.r +++ /dev/null @@ -1,572 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -cat("\nChecking available domain mean columns:\n") -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - cat("Warning: Missing variables:", paste(missing_vars, collapse = ", "), "\n") -} else { - cat("All required domain mean variables found!\n") -} - -# ============================================================================= -# STEP 1: DATA PIVOTING TO LONG FORMAT -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 1: DATA PIVOTING TO LONG FORMAT\n") 
-cat(paste(rep("=", 80), collapse = ""), "\n") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -cat("Domain mapping:\n") -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - cat("Warning: Variable", var_name, "not found in data\n") - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "GROUP", "TEMPORAL_DO", "ITEM_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$GROUP <- as.factor(long_data$GROUP) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - long_data$ITEM_DO <- as.factor(long_data$ITEM_DO) - - return(long_data) -} - -# Pivot data to long format -cat("\nPivoting data to long format...\n") -long_data <- pivot_domain_means(data, domain_mapping) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique participants:", length(unique(long_data$pID)), "\n") -cat("TIME levels:", paste(levels(long_data$TIME), collapse = ", "), "\n") -cat("DOMAIN levels:", paste(levels(long_data$DOMAIN), collapse = ", "), "\n") - -# Display structure and sample -cat("\nLong data structure:\n") -str(long_data) - -cat("\nFirst 10 rows of long_data:\n") -print(head(long_data, 10)) - -# Show example data for one participant -cat("\nExample: Participant 1 across all domains and times:\n") -participant_1_data <- long_data[long_data$pID == 1, c("pID", "GROUP", "TEMPORAL_DO", "ITEM_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# STEP 2: ASSUMPTION CHECKING -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 2: ASSUMPTION CHECKING\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 2.1 Check for missing values -cat("\n2.1 Missing Values Check:\n") -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("Missing values by TIME and DOMAIN:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\n2.2 Outlier Detection:\n") -outlier_summary <- long_data_clean %>% - group_by(TIME, 
DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\n2.3 Normality Tests:\n") -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\n2.4 Homogeneity of Variance Tests:\n") - -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance across TIME within each DOMAIN:\n") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance across DOMAIN within each TIME:\n") -print(homogeneity_domain) - -# ============================================================================= -# STEP 3: DESCRIPTIVE STATISTICS -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# Overall descriptive statistics -desc_stats <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE), 5), - sd = round(sd(MEAN_DIFFERENCE), 5), - median = round(median(MEAN_DIFFERENCE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75), 5), - min = round(min(MEAN_DIFFERENCE), 5), - max = round(max(MEAN_DIFFERENCE), 5), - .groups = 'drop' - ) - -cat("Descriptive statistics by TIME and DOMAIN:\n") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_group <- long_data_clean %>% - group_by(GROUP, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE), 5), - sd = round(sd(MEAN_DIFFERENCE), 5), - .groups = 'drop' - ) - -cat("\nDescriptive statistics by GROUP, TIME, and DOMAIN:\n") -print(desc_stats_by_group) - -# ============================================================================= -# STEP 4: MIXED ANOVA ANALYSES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Main Mixed ANOVA -cat("\n4.1 Main Mixed ANOVA:\n") -cat("Within-subjects 
factors: TIME, DOMAIN\n") -cat("Between-subjects factors: GROUP, TEMPORAL_DO, ITEM_DO\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - main_anova <- ezANOVA( - data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = c(TIME, DOMAIN), - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Main ANOVA Results:\n") - print(main_anova) - - # Check sphericity - if (!is.null(main_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(main_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in main ANOVA:", e$message, "\n") - - # Try simpler model without all between-subjects factors - cat("Attempting simpler model with only GROUP as between-subjects factor...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = c(TIME, DOMAIN), - between = GROUP, - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Simplified ANOVA Results:\n") - print(simple_anova) - - main_anova <<- simple_anova - - }, error = function(e2) { - cat("Simplified ANOVA also failed:", e2$message, "\n") - }) -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -domain_results <- list() - -for (domain in levels(long_data_clean$DOMAIN)) { - cat("\nAnalyzing domain:", domain, "\n") - - domain_data <- long_data_clean[long_data_clean$DOMAIN == domain, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = TIME, - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain, ":\n") - print(domain_anova) - - domain_results[[domain]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = TIME, - between = GROUP, - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Time-specific analyses -cat("\n4.3 Time-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -time_results <- list() - -for (time in levels(long_data_clean$TIME)) { - cat("\nAnalyzing time:", time, "\n") - - time_data <- long_data_clean[long_data_clean$TIME == time, ] - - tryCatch({ - time_anova <- ezANOVA( - data = time_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = DOMAIN, - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", time, ":\n") - print(time_anova) - - time_results[[time]] <- time_anova - - }, error = function(e) { - cat("Error in ANOVA for", time, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = time_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = DOMAIN, - between = GROUP, - type = 3, - detailed = TRUE - ) - print(simple_anova) - time_results[[time]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# ============================================================================= -# STEP 5: POST-HOC ANALYSES 
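# [Editor's sketch] The post-hoc step that follows tests TIME by pairing two
# raw vectors positionally, which is only valid if both happen to be sorted by
# participant. A model-based alternative uses emmeans, which is loaded above
# but never called. Sketch only; assumes main_anova was created in step 4.1
# with return_aov = TRUE, so main_anova$aov carries the underlying aov fit.
if (exists("main_anova") && !is.null(main_anova$aov)) {
  emm_time <- emmeans(main_anova$aov, ~ TIME)
  print(pairs(emm_time))                                 # Past vs Future
  emm_domain <- emmeans(main_anova$aov, ~ DOMAIN)
  print(pairs(emm_domain, adjust = "tukey"))             # all domain pairs
  print(pairs(emmeans(main_anova$aov, ~ DOMAIN | TIME),  # simple effects
              adjust = "tukey"))
}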
-# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 5: POST-HOC ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 5.1 Pairwise comparisons for significant effects -if (exists("main_anova") && !is.null(main_anova)) { - cat("\n5.1 Post-hoc comparisons for main effects:\n") - - # Check for significant main effects and interactions - anova_table <- main_anova$ANOVA - - if ("TIME" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "TIME"] < 0.05) { - cat("Significant TIME main effect found. Computing pairwise comparisons...\n") - - # Simple paired t-tests for TIME effect - past_means <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - future_means <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - if (length(past_means) == length(future_means)) { - time_t_test <- t.test(past_means, future_means, paired = TRUE) - cat("Paired t-test for TIME effect:\n") - cat("t =", round(time_t_test$statistic, 5), - ", df =", time_t_test$parameter, - ", p =", round(time_t_test$p.value, 5), "\n") - cat("Mean difference (Past - Future):", round(time_t_test$estimate, 5), "\n") - } - } - - if ("DOMAIN" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "DOMAIN"] < 0.05) { - cat("Significant DOMAIN main effect found.\n") - - # Pairwise comparisons between domains - domain_means <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise(mean_diff = mean(MEAN_DIFFERENCE), .groups = 'drop') - - cat("Domain means:\n") - print(domain_means) - } - - if ("TIME:DOMAIN" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "TIME:DOMAIN"] < 0.05) { - cat("Significant TIME × DOMAIN interaction found.\n") - - # Simple effects analysis - interaction_means <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise(mean_diff = mean(MEAN_DIFFERENCE), .groups = 'drop') - - cat("TIME × DOMAIN interaction means:\n") - print(interaction_means) - } -} - -# ============================================================================= -# STEP 6: EFFECT SIZES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 6: EFFECT SIZES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -if (exists("main_anova") && !is.null(main_anova)) { - anova_table <- main_anova$ANOVA - - # Calculate partial eta squared for each effect - anova_table$partial_eta_squared <- round(anova_table$SSn / (anova_table$SSn + anova_table$SSd), 5) - - cat("Effect sizes (partial eta squared):\n") - effect_sizes <- anova_table[, c("Effect", "partial_eta_squared")] - print(effect_sizes) -} - -# ============================================================================= -# STEP 7: SUMMARY AND INTERPRETATION -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 7: SUMMARY AND INTERPRETATION\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -cat("Analysis Summary:\n") -cat("- Total participants:", length(unique(long_data_clean$pID)), "\n") -cat("- Total observations:", nrow(long_data_clean), "\n") -cat("- Within-subjects factors: TIME (Past vs Future), DOMAIN (Preferences, Personality, Values, Life)\n") -cat("- Between-subjects factors: GROUP, TEMPORAL_DO, ITEM_DO\n") -cat("- Dependent variable: Mean absolute differences in domain ratings\n") - -cat("\nResearch Question:\n") -cat("Do participants rate changes 
in domains differently from past to now vs past to future?\n") - -if (exists("main_anova") && !is.null(main_anova)) { - anova_table <- main_anova$ANOVA - - cat("\nKey Findings:\n") - - # Check for significant effects - significant_effects <- anova_table$Effect[anova_table$p < 0.05] - - if (length(significant_effects) > 0) { - cat("Significant effects found:\n") - for (effect in significant_effects) { - p_val <- anova_table$p[anova_table$Effect == effect] - cat("-", effect, "(p =", round(p_val, 5), ")\n") - } - } else { - cat("No significant effects found at α = 0.05\n") - } - - # Interpret TIME effect - if ("TIME" %in% anova_table$Effect) { - time_p <- anova_table$p[anova_table$Effect == "TIME"] - if (time_p < 0.05) { - cat("\nTIME Effect: Participants show different levels of change when comparing\n") - cat("past-to-now vs past-to-future perspectives (p =", round(time_p, 5), ")\n") - } else { - cat("\nTIME Effect: No significant difference between past-to-now and past-to-future\n") - cat("perspectives (p =", round(time_p, 5), ")\n") - } - } - - # Interpret DOMAIN effect - if ("DOMAIN" %in% anova_table$Effect) { - domain_p <- anova_table$p[anova_table$Effect == "DOMAIN"] - if (domain_p < 0.05) { - cat("\nDOMAIN Effect: Different domains show different levels of perceived change\n") - cat("(p =", round(domain_p, 5), ")\n") - } else { - cat("\nDOMAIN Effect: No significant differences between domains in perceived change\n") - cat("(p =", round(domain_p, 5), ")\n") - } - } - - # Interpret interaction - if ("TIME:DOMAIN" %in% anova_table$Effect) { - interaction_p <- anova_table$p[anova_table$Effect == "TIME:DOMAIN"] - if (interaction_p < 0.05) { - cat("\nTIME × DOMAIN Interaction: The effect of time perspective on perceived change\n") - cat("varies across domains (p =", round(interaction_p, 5), ")\n") - } else { - cat("\nTIME × DOMAIN Interaction: No significant interaction between time perspective\n") - cat("and domain (p =", round(interaction_p, 5), ")\n") - } - } -} - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("ANALYSIS COMPLETE!\n") -cat(paste(rep("=", 80), collapse = ""), "\n") diff --git a/.history/mixed anova - domain means_20250912124407.r b/.history/mixed anova - domain means_20250912124407.r deleted file mode 100644 index 141675c..0000000 --- a/.history/mixed anova - domain means_20250912124407.r +++ /dev/null @@ -1,572 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -cat("\nChecking available domain mean columns:\n") -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", 
"NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - cat("Warning: Missing variables:", paste(missing_vars, collapse = ", "), "\n") -} else { - cat("All required domain mean variables found!\n") -} - -# ============================================================================= -# STEP 1: DATA PIVOTING TO LONG FORMAT -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 1: DATA PIVOTING TO LONG FORMAT\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -cat("Domain mapping:\n") -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - cat("Warning: Variable", var_name, "not found in data\n") - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "GROUP", "TEMPORAL_DO", "ITEM_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$GROUP <- as.factor(long_data$GROUP) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - long_data$ITEM_DO <- as.factor(long_data$ITEM_DO) - - return(long_data) -} - -# Pivot data to long format -cat("\nPivoting data to long format...\n") -long_data <- pivot_domain_means(data, domain_mapping) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique participants:", length(unique(long_data$pID)), "\n") -cat("TIME levels:", paste(levels(long_data$TIME), collapse = ", "), "\n") -cat("DOMAIN levels:", paste(levels(long_data$DOMAIN), collapse = ", "), "\n") - -# Display structure and sample -cat("\nLong data structure:\n") -str(long_data) - -cat("\nFirst 10 rows of long_data:\n") -print(head(long_data, 10)) - -# Show example data for one participant -cat("\nExample: Participant 1 across all domains and times:\n") -participant_1_data <- long_data[long_data$pID == 1, c("pID", "GROUP", "TEMPORAL_DO", "ITEM_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# STEP 2: ASSUMPTION CHECKING -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 2: ASSUMPTION CHECKING\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 2.1 Check for missing values 
-cat("\n2.1 Missing Values Check:\n") -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("Missing values by TIME and DOMAIN:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\n2.2 Outlier Detection:\n") -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\n2.3 Normality Tests:\n") -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\n2.4 Homogeneity of Variance Tests:\n") - -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance across TIME within each DOMAIN:\n") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance across DOMAIN within each TIME:\n") -print(homogeneity_domain) - -# ============================================================================= -# STEP 3: DESCRIPTIVE STATISTICS -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# Overall descriptive statistics -desc_stats <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE), 5), - sd = round(sd(MEAN_DIFFERENCE), 5), - median = round(median(MEAN_DIFFERENCE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75), 5), - min = round(min(MEAN_DIFFERENCE), 5), - max = round(max(MEAN_DIFFERENCE), 5), - .groups = 'drop' - ) - -cat("Descriptive statistics by TIME and DOMAIN:\n") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_group <- long_data_clean %>% - group_by(GROUP, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = 
round(mean(MEAN_DIFFERENCE), 5), - sd = round(sd(MEAN_DIFFERENCE), 5), - .groups = 'drop' - ) - -cat("\nDescriptive statistics by GROUP, TIME, and DOMAIN:\n") -print(desc_stats_by_group) - -# ============================================================================= -# STEP 4: MIXED ANOVA ANALYSES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Main Mixed ANOVA -cat("\n4.1 Main Mixed ANOVA:\n") -cat("Within-subjects factors: TIME, DOMAIN\n") -cat("Between-subjects factors: GROUP, TEMPORAL_DO, ITEM_DO\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - main_anova <- ezANOVA( - data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = c(TIME, DOMAIN), - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Main ANOVA Results:\n") - print(main_anova) - - # Check sphericity - if (!is.null(main_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(main_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in main ANOVA:", e$message, "\n") - - # Try simpler model without all between-subjects factors - cat("Attempting simpler model with only GROUP as between-subjects factor...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = c(TIME, DOMAIN), - between = GROUP, - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Simplified ANOVA Results:\n") - print(simple_anova) - - main_anova <<- simple_anova - - }, error = function(e2) { - cat("Simplified ANOVA also failed:", e2$message, "\n") - }) -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -domain_results <- list() - -for (domain in levels(long_data_clean$DOMAIN)) { - cat("\nAnalyzing domain:", domain, "\n") - - domain_data <- long_data_clean[long_data_clean$DOMAIN == domain, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = TIME, - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain, ":\n") - print(domain_anova) - - domain_results[[domain]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = TIME, - between = GROUP, - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Time-specific analyses -cat("\n4.3 Time-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -time_results <- list() - -for (time in levels(long_data_clean$TIME)) { - cat("\nAnalyzing time:", time, "\n") - - time_data <- long_data_clean[long_data_clean$TIME == time, ] - - tryCatch({ - time_anova <- ezANOVA( - data = time_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = DOMAIN, - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", time, ":\n") - print(time_anova) - - time_results[[time]] <- time_anova - - }, 
error = function(e) { - cat("Error in ANOVA for", time, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = time_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = DOMAIN, - between = GROUP, - type = 3, - detailed = TRUE - ) - print(simple_anova) - time_results[[time]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# ============================================================================= -# STEP 5: POST-HOC ANALYSES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 5: POST-HOC ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 5.1 Pairwise comparisons for significant effects -if (exists("main_anova") && !is.null(main_anova)) { - cat("\n5.1 Post-hoc comparisons for main effects:\n") - - # Check for significant main effects and interactions - anova_table <- main_anova$ANOVA - - if ("TIME" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "TIME"] < 0.05) { - cat("Significant TIME main effect found. Computing pairwise comparisons...\n") - - # Simple paired t-tests for TIME effect - past_means <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - future_means <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - if (length(past_means) == length(future_means)) { - time_t_test <- t.test(past_means, future_means, paired = TRUE) - cat("Paired t-test for TIME effect:\n") - cat("t =", round(time_t_test$statistic, 5), - ", df =", time_t_test$parameter, - ", p =", round(time_t_test$p.value, 5), "\n") - cat("Mean difference (Past - Future):", round(time_t_test$estimate, 5), "\n") - } - } - - if ("DOMAIN" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "DOMAIN"] < 0.05) { - cat("Significant DOMAIN main effect found.\n") - - # Pairwise comparisons between domains - domain_means <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise(mean_diff = mean(MEAN_DIFFERENCE), .groups = 'drop') - - cat("Domain means:\n") - print(domain_means) - } - - if ("TIME:DOMAIN" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "TIME:DOMAIN"] < 0.05) { - cat("Significant TIME × DOMAIN interaction found.\n") - - # Simple effects analysis - interaction_means <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise(mean_diff = mean(MEAN_DIFFERENCE), .groups = 'drop') - - cat("TIME × DOMAIN interaction means:\n") - print(interaction_means) - } -} - -# ============================================================================= -# STEP 6: EFFECT SIZES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 6: EFFECT SIZES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -if (exists("main_anova") && !is.null(main_anova)) { - anova_table <- main_anova$ANOVA - - # Calculate partial eta squared for each effect - anova_table$partial_eta_squared <- round(anova_table$SSn / (anova_table$SSn + anova_table$SSd), 5) - - cat("Effect sizes (partial eta squared):\n") - effect_sizes <- anova_table[, c("Effect", "partial_eta_squared")] - print(effect_sizes) -} - -# ============================================================================= -# STEP 7: SUMMARY AND INTERPRETATION -# ============================================================================= - -cat("\n", 
paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 7: SUMMARY AND INTERPRETATION\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -cat("Analysis Summary:\n") -cat("- Total participants:", length(unique(long_data_clean$pID)), "\n") -cat("- Total observations:", nrow(long_data_clean), "\n") -cat("- Within-subjects factors: TIME (Past vs Future), DOMAIN (Preferences, Personality, Values, Life)\n") -cat("- Between-subjects factors: GROUP, TEMPORAL_DO, ITEM_DO\n") -cat("- Dependent variable: Mean absolute differences in domain ratings\n") - -cat("\nResearch Question:\n") -cat("Do participants rate changes in domains differently from past to now vs past to future?\n") - -if (exists("main_anova") && !is.null(main_anova)) { - anova_table <- main_anova$ANOVA - - cat("\nKey Findings:\n") - - # Check for significant effects - significant_effects <- anova_table$Effect[anova_table$p < 0.05] - - if (length(significant_effects) > 0) { - cat("Significant effects found:\n") - for (effect in significant_effects) { - p_val <- anova_table$p[anova_table$Effect == effect] - cat("-", effect, "(p =", round(p_val, 5), ")\n") - } - } else { - cat("No significant effects found at α = 0.05\n") - } - - # Interpret TIME effect - if ("TIME" %in% anova_table$Effect) { - time_p <- anova_table$p[anova_table$Effect == "TIME"] - if (time_p < 0.05) { - cat("\nTIME Effect: Participants show different levels of change when comparing\n") - cat("past-to-now vs past-to-future perspectives (p =", round(time_p, 5), ")\n") - } else { - cat("\nTIME Effect: No significant difference between past-to-now and past-to-future\n") - cat("perspectives (p =", round(time_p, 5), ")\n") - } - } - - # Interpret DOMAIN effect - if ("DOMAIN" %in% anova_table$Effect) { - domain_p <- anova_table$p[anova_table$Effect == "DOMAIN"] - if (domain_p < 0.05) { - cat("\nDOMAIN Effect: Different domains show different levels of perceived change\n") - cat("(p =", round(domain_p, 5), ")\n") - } else { - cat("\nDOMAIN Effect: No significant differences between domains in perceived change\n") - cat("(p =", round(domain_p, 5), ")\n") - } - } - - # Interpret interaction - if ("TIME:DOMAIN" %in% anova_table$Effect) { - interaction_p <- anova_table$p[anova_table$Effect == "TIME:DOMAIN"] - if (interaction_p < 0.05) { - cat("\nTIME × DOMAIN Interaction: The effect of time perspective on perceived change\n") - cat("varies across domains (p =", round(interaction_p, 5), ")\n") - } else { - cat("\nTIME × DOMAIN Interaction: No significant interaction between time perspective\n") - cat("and domain (p =", round(interaction_p, 5), ")\n") - } - } -} - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("ANALYSIS COMPLETE!\n") -cat(paste(rep("=", 80), collapse = ""), "\n") diff --git a/.history/mixed anova - domain means_20250912124620.r b/.history/mixed anova - domain means_20250912124620.r deleted file mode 100644 index 46ff722..0000000 --- a/.history/mixed anova - domain means_20250912124620.r +++ /dev/null @@ -1,571 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset 
-cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -cat("\nChecking available domain mean columns:\n") -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - cat("Warning: Missing variables:", paste(missing_vars, collapse = ", "), "\n") -} else { - cat("All required domain mean variables found!\n") -} - -# ============================================================================= -# STEP 1: DATA PIVOTING TO LONG FORMAT -# ============================================================================= - -cat("STEP 1: DATA PIVOTING TO LONG FORMAT\n") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -cat("Domain mapping:\n") -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - cat("Warning: Variable", var_name, "not found in data\n") - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "GROUP", "TEMPORAL_DO", "ITEM_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$GROUP <- as.factor(long_data$GROUP) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - long_data$ITEM_DO <- as.factor(long_data$ITEM_DO) - - return(long_data) -} - -# Pivot data to long format -cat("\nPivoting data to long format...\n") -long_data <- pivot_domain_means(data, domain_mapping) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique participants:", length(unique(long_data$pID)), "\n") -cat("TIME levels:", paste(levels(long_data$TIME), collapse = ", "), "\n") -cat("DOMAIN levels:", paste(levels(long_data$DOMAIN), collapse = ", "), "\n") -head(long_data) - -# Display structure and sample -cat("\nLong data structure:\n") -str(long_data) - -cat("\nFirst 10 rows of long_data:\n") -print(head(long_data, 10)) - -# Show example data for one participant -cat("\nExample: Participant 1 across all domains and times:\n") 
-participant_1_data <- long_data[long_data$pID == 1, c("pID", "GROUP", "TEMPORAL_DO", "ITEM_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# STEP 2: ASSUMPTION CHECKING -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 2: ASSUMPTION CHECKING\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 2.1 Check for missing values -cat("\n2.1 Missing Values Check:\n") -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("Missing values by TIME and DOMAIN:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\n2.2 Outlier Detection:\n") -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\n2.3 Normality Tests:\n") -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\n2.4 Homogeneity of Variance Tests:\n") - -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance across TIME within each DOMAIN:\n") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance across DOMAIN within each TIME:\n") -print(homogeneity_domain) - -# ============================================================================= -# STEP 3: DESCRIPTIVE STATISTICS -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# Overall descriptive statistics -desc_stats <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE), 5), - sd 
= round(sd(MEAN_DIFFERENCE), 5), - median = round(median(MEAN_DIFFERENCE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75), 5), - min = round(min(MEAN_DIFFERENCE), 5), - max = round(max(MEAN_DIFFERENCE), 5), - .groups = 'drop' - ) - -cat("Descriptive statistics by TIME and DOMAIN:\n") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_group <- long_data_clean %>% - group_by(GROUP, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE), 5), - sd = round(sd(MEAN_DIFFERENCE), 5), - .groups = 'drop' - ) - -cat("\nDescriptive statistics by GROUP, TIME, and DOMAIN:\n") -print(desc_stats_by_group) - -# ============================================================================= -# STEP 4: MIXED ANOVA ANALYSES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Main Mixed ANOVA -cat("\n4.1 Main Mixed ANOVA:\n") -cat("Within-subjects factors: TIME, DOMAIN\n") -cat("Between-subjects factors: GROUP, TEMPORAL_DO, ITEM_DO\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - main_anova <- ezANOVA( - data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = c(TIME, DOMAIN), - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Main ANOVA Results:\n") - print(main_anova) - - # Check sphericity - if (!is.null(main_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(main_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in main ANOVA:", e$message, "\n") - - # Try simpler model without all between-subjects factors - cat("Attempting simpler model with only GROUP as between-subjects factor...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = c(TIME, DOMAIN), - between = GROUP, - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Simplified ANOVA Results:\n") - print(simple_anova) - - main_anova <<- simple_anova - - }, error = function(e2) { - cat("Simplified ANOVA also failed:", e2$message, "\n") - }) -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -domain_results <- list() - -for (domain in levels(long_data_clean$DOMAIN)) { - cat("\nAnalyzing domain:", domain, "\n") - - domain_data <- long_data_clean[long_data_clean$DOMAIN == domain, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = TIME, - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain, ":\n") - print(domain_anova) - - domain_results[[domain]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = TIME, - between = GROUP, - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Time-specific analyses -cat("\n4.3 Time-specific Mixed ANOVAs:\n") 
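# [Editor's sketch] DOMAIN has four levels, so sphericity matters for the
# analyses in this step; with detailed = TRUE, ezANOVA also returns a
# `Sphericity Corrections` table (Greenhouse-Geisser and Huynh-Feldt) next to
# Mauchly's test. A minimal reporting sketch, assuming main_anova exists from
# step 4.1:
if (exists("main_anova") && !is.null(main_anova$`Sphericity Corrections`)) {
  cat("\nSphericity corrections (GG / HF):\n")
  print(main_anova$`Sphericity Corrections`)
}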
-cat(paste(rep("-", 50), collapse = ""), "\n") - -time_results <- list() - -for (time in levels(long_data_clean$TIME)) { - cat("\nAnalyzing time:", time, "\n") - - time_data <- long_data_clean[long_data_clean$TIME == time, ] - - tryCatch({ - time_anova <- ezANOVA( - data = time_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = DOMAIN, - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", time, ":\n") - print(time_anova) - - time_results[[time]] <- time_anova - - }, error = function(e) { - cat("Error in ANOVA for", time, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = time_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = DOMAIN, - between = GROUP, - type = 3, - detailed = TRUE - ) - print(simple_anova) - time_results[[time]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# ============================================================================= -# STEP 5: POST-HOC ANALYSES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 5: POST-HOC ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 5.1 Pairwise comparisons for significant effects -if (exists("main_anova") && !is.null(main_anova)) { - cat("\n5.1 Post-hoc comparisons for main effects:\n") - - # Check for significant main effects and interactions - anova_table <- main_anova$ANOVA - - if ("TIME" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "TIME"] < 0.05) { - cat("Significant TIME main effect found. Computing pairwise comparisons...\n") - - # Simple paired t-tests for TIME effect - past_means <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - future_means <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - if (length(past_means) == length(future_means)) { - time_t_test <- t.test(past_means, future_means, paired = TRUE) - cat("Paired t-test for TIME effect:\n") - cat("t =", round(time_t_test$statistic, 5), - ", df =", time_t_test$parameter, - ", p =", round(time_t_test$p.value, 5), "\n") - cat("Mean difference (Past - Future):", round(time_t_test$estimate, 5), "\n") - } - } - - if ("DOMAIN" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "DOMAIN"] < 0.05) { - cat("Significant DOMAIN main effect found.\n") - - # Pairwise comparisons between domains - domain_means <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise(mean_diff = mean(MEAN_DIFFERENCE), .groups = 'drop') - - cat("Domain means:\n") - print(domain_means) - } - - if ("TIME:DOMAIN" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "TIME:DOMAIN"] < 0.05) { - cat("Significant TIME × DOMAIN interaction found.\n") - - # Simple effects analysis - interaction_means <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise(mean_diff = mean(MEAN_DIFFERENCE), .groups = 'drop') - - cat("TIME × DOMAIN interaction means:\n") - print(interaction_means) - } -} - -# ============================================================================= -# STEP 6: EFFECT SIZES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 6: EFFECT SIZES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -if (exists("main_anova") && !is.null(main_anova)) { - anova_table <- 
main_anova$ANOVA - - # Calculate partial eta squared for each effect - anova_table$partial_eta_squared <- round(anova_table$SSn / (anova_table$SSn + anova_table$SSd), 5) - - cat("Effect sizes (partial eta squared):\n") - effect_sizes <- anova_table[, c("Effect", "partial_eta_squared")] - print(effect_sizes) -} - -# ============================================================================= -# STEP 7: SUMMARY AND INTERPRETATION -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 7: SUMMARY AND INTERPRETATION\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -cat("Analysis Summary:\n") -cat("- Total participants:", length(unique(long_data_clean$pID)), "\n") -cat("- Total observations:", nrow(long_data_clean), "\n") -cat("- Within-subjects factors: TIME (Past vs Future), DOMAIN (Preferences, Personality, Values, Life)\n") -cat("- Between-subjects factors: GROUP, TEMPORAL_DO, ITEM_DO\n") -cat("- Dependent variable: Mean absolute differences in domain ratings\n") - -cat("\nResearch Question:\n") -cat("Do participants rate changes in domains differently from past to now vs past to future?\n") - -if (exists("main_anova") && !is.null(main_anova)) { - anova_table <- main_anova$ANOVA - - cat("\nKey Findings:\n") - - # Check for significant effects - significant_effects <- anova_table$Effect[anova_table$p < 0.05] - - if (length(significant_effects) > 0) { - cat("Significant effects found:\n") - for (effect in significant_effects) { - p_val <- anova_table$p[anova_table$Effect == effect] - cat("-", effect, "(p =", round(p_val, 5), ")\n") - } - } else { - cat("No significant effects found at α = 0.05\n") - } - - # Interpret TIME effect - if ("TIME" %in% anova_table$Effect) { - time_p <- anova_table$p[anova_table$Effect == "TIME"] - if (time_p < 0.05) { - cat("\nTIME Effect: Participants show different levels of change when comparing\n") - cat("past-to-now vs past-to-future perspectives (p =", round(time_p, 5), ")\n") - } else { - cat("\nTIME Effect: No significant difference between past-to-now and past-to-future\n") - cat("perspectives (p =", round(time_p, 5), ")\n") - } - } - - # Interpret DOMAIN effect - if ("DOMAIN" %in% anova_table$Effect) { - domain_p <- anova_table$p[anova_table$Effect == "DOMAIN"] - if (domain_p < 0.05) { - cat("\nDOMAIN Effect: Different domains show different levels of perceived change\n") - cat("(p =", round(domain_p, 5), ")\n") - } else { - cat("\nDOMAIN Effect: No significant differences between domains in perceived change\n") - cat("(p =", round(domain_p, 5), ")\n") - } - } - - # Interpret interaction - if ("TIME:DOMAIN" %in% anova_table$Effect) { - interaction_p <- anova_table$p[anova_table$Effect == "TIME:DOMAIN"] - if (interaction_p < 0.05) { - cat("\nTIME × DOMAIN Interaction: The effect of time perspective on perceived change\n") - cat("varies across domains (p =", round(interaction_p, 5), ")\n") - } else { - cat("\nTIME × DOMAIN Interaction: No significant interaction between time perspective\n") - cat("and domain (p =", round(interaction_p, 5), ")\n") - } - } -} - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("ANALYSIS COMPLETE!\n") -cat(paste(rep("=", 80), collapse = ""), "\n") diff --git a/.history/mixed anova - domain means_20250912125000.r b/.history/mixed anova - domain means_20250912125000.r deleted file mode 100644 index fd6867b..0000000 --- a/.history/mixed anova - domain means_20250912125000.r +++ /dev/null @@ -1,567 +0,0 @@ -# Mixed ANOVA 
Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -cat("\nChecking available domain mean columns:\n") -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - cat("Warning: Missing variables:", paste(missing_vars, collapse = ", "), "\n") -} else { - cat("All required domain mean variables found!\n") -} - -# ============================================================================= -# STEP 1: DATA PIVOTING TO LONG FORMAT -# ============================================================================= - -cat("STEP 1: DATA PIVOTING TO LONG FORMAT\n") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -cat("Domain mapping:\n") -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - cat("Warning: Variable", var_name, "not found in data\n") - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "GROUP", "TEMPORAL_DO", "ITEM_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$GROUP <- as.factor(long_data$GROUP) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - long_data$ITEM_DO <- as.factor(long_data$ITEM_DO) - - return(long_data) -} - -# Pivot data to long format -cat("\nPivoting data to long format...\n") -long_data <- pivot_domain_means(data, domain_mapping) - -cat("Long format data dimensions:", 
dim(long_data), "\n") -cat("Unique participants:", length(unique(long_data$pID)), "\n") -cat("TIME levels:", paste(levels(long_data$TIME), collapse = ", "), "\n") -cat("DOMAIN levels:", paste(levels(long_data$DOMAIN), collapse = ", "), "\n") - -# Display structure and sample -cat("\nLong data structure:\n") -str(long_data) - -cat("\nFirst 10 rows of long_data:\n") -print(head(long_data, 10)) - -# Show example data for one participant -cat("\nExample: Participant 1 across all domains and times:\n") -participant_1_data <- long_data[long_data$pID == 1, c("pID", "GROUP", "TEMPORAL_DO", "ITEM_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# STEP 2: ASSUMPTION CHECKING -# ============================================================================= - - -# 2.1 Check for missing values -cat("\n2.1 Missing Values Check:\n") -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("Missing values by TIME and DOMAIN:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\n2.2 Outlier Detection:\n") -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\n2.3 Normality Tests:\n") -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\n2.4 Homogeneity of Variance Tests:\n") - -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance across TIME within each DOMAIN:\n") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance across DOMAIN within each TIME:\n") -print(homogeneity_domain) - -# ============================================================================= -# STEP 3: DESCRIPTIVE STATISTICS -# 
============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# Overall descriptive statistics -desc_stats <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE), 5), - sd = round(sd(MEAN_DIFFERENCE), 5), - median = round(median(MEAN_DIFFERENCE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75), 5), - min = round(min(MEAN_DIFFERENCE), 5), - max = round(max(MEAN_DIFFERENCE), 5), - .groups = 'drop' - ) - -cat("Descriptive statistics by TIME and DOMAIN:\n") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_group <- long_data_clean %>% - group_by(GROUP, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE), 5), - sd = round(sd(MEAN_DIFFERENCE), 5), - .groups = 'drop' - ) - -cat("\nDescriptive statistics by GROUP, TIME, and DOMAIN:\n") -print(desc_stats_by_group) - -# ============================================================================= -# STEP 4: MIXED ANOVA ANALYSES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Main Mixed ANOVA -cat("\n4.1 Main Mixed ANOVA:\n") -cat("Within-subjects factors: TIME, DOMAIN\n") -cat("Between-subjects factors: GROUP, TEMPORAL_DO, ITEM_DO\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - main_anova <- ezANOVA( - data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = c(TIME, DOMAIN), - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Main ANOVA Results:\n") - print(main_anova) - - # Check sphericity - if (!is.null(main_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(main_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in main ANOVA:", e$message, "\n") - - # Try simpler model without all between-subjects factors - cat("Attempting simpler model with only GROUP as between-subjects factor...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = c(TIME, DOMAIN), - between = GROUP, - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Simplified ANOVA Results:\n") - print(simple_anova) - - main_anova <<- simple_anova - - }, error = function(e2) { - cat("Simplified ANOVA also failed:", e2$message, "\n") - }) -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -domain_results <- list() - -for (domain in levels(long_data_clean$DOMAIN)) { - cat("\nAnalyzing domain:", domain, "\n") - - domain_data <- long_data_clean[long_data_clean$DOMAIN == domain, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = TIME, - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain, ":\n") - print(domain_anova) - - domain_results[[domain]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - 
simple_anova <- ezANOVA( - data = domain_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = TIME, - between = GROUP, - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Time-specific analyses -cat("\n4.3 Time-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -time_results <- list() - -for (time in levels(long_data_clean$TIME)) { - cat("\nAnalyzing time:", time, "\n") - - time_data <- long_data_clean[long_data_clean$TIME == time, ] - - tryCatch({ - time_anova <- ezANOVA( - data = time_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = DOMAIN, - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", time, ":\n") - print(time_anova) - - time_results[[time]] <- time_anova - - }, error = function(e) { - cat("Error in ANOVA for", time, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = time_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = DOMAIN, - between = GROUP, - type = 3, - detailed = TRUE - ) - print(simple_anova) - time_results[[time]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# ============================================================================= -# STEP 5: POST-HOC ANALYSES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 5: POST-HOC ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 5.1 Pairwise comparisons for significant effects -if (exists("main_anova") && !is.null(main_anova)) { - cat("\n5.1 Post-hoc comparisons for main effects:\n") - - # Check for significant main effects and interactions - anova_table <- main_anova$ANOVA - - if ("TIME" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "TIME"] < 0.05) { - cat("Significant TIME main effect found. 
Computing pairwise comparisons...\n") - - # Simple paired t-tests for TIME effect - past_means <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - future_means <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - if (length(past_means) == length(future_means)) { - time_t_test <- t.test(past_means, future_means, paired = TRUE) - cat("Paired t-test for TIME effect:\n") - cat("t =", round(time_t_test$statistic, 5), - ", df =", time_t_test$parameter, - ", p =", round(time_t_test$p.value, 5), "\n") - cat("Mean difference (Past - Future):", round(time_t_test$estimate, 5), "\n") - } - } - - if ("DOMAIN" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "DOMAIN"] < 0.05) { - cat("Significant DOMAIN main effect found.\n") - - # Pairwise comparisons between domains - domain_means <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise(mean_diff = mean(MEAN_DIFFERENCE), .groups = 'drop') - - cat("Domain means:\n") - print(domain_means) - } - - if ("TIME:DOMAIN" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "TIME:DOMAIN"] < 0.05) { - cat("Significant TIME × DOMAIN interaction found.\n") - - # Simple effects analysis - interaction_means <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise(mean_diff = mean(MEAN_DIFFERENCE), .groups = 'drop') - - cat("TIME × DOMAIN interaction means:\n") - print(interaction_means) - } -} - -# ============================================================================= -# STEP 6: EFFECT SIZES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 6: EFFECT SIZES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -if (exists("main_anova") && !is.null(main_anova)) { - anova_table <- main_anova$ANOVA - - # Calculate partial eta squared for each effect - anova_table$partial_eta_squared <- round(anova_table$SSn / (anova_table$SSn + anova_table$SSd), 5) - - cat("Effect sizes (partial eta squared):\n") - effect_sizes <- anova_table[, c("Effect", "partial_eta_squared")] - print(effect_sizes) -} - -# ============================================================================= -# STEP 7: SUMMARY AND INTERPRETATION -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 7: SUMMARY AND INTERPRETATION\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -cat("Analysis Summary:\n") -cat("- Total participants:", length(unique(long_data_clean$pID)), "\n") -cat("- Total observations:", nrow(long_data_clean), "\n") -cat("- Within-subjects factors: TIME (Past vs Future), DOMAIN (Preferences, Personality, Values, Life)\n") -cat("- Between-subjects factors: GROUP, TEMPORAL_DO, ITEM_DO\n") -cat("- Dependent variable: Mean absolute differences in domain ratings\n") - -cat("\nResearch Question:\n") -cat("Do participants rate changes in domains differently from past to now vs past to future?\n") - -if (exists("main_anova") && !is.null(main_anova)) { - anova_table <- main_anova$ANOVA - - cat("\nKey Findings:\n") - - # Check for significant effects - significant_effects <- anova_table$Effect[anova_table$p < 0.05] - - if (length(significant_effects) > 0) { - cat("Significant effects found:\n") - for (effect in significant_effects) { - p_val <- anova_table$p[anova_table$Effect == effect] - cat("-", effect, "(p =", round(p_val, 5), ")\n") - } - } else { - cat("No significant effects found at α = 0.05\n") - } - - # Interpret TIME effect - 
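# Hedged caveat on the paired t-test in Step 5 above: it pairs the Past and
# Future vectors by row position, which is only valid when both subsets are
# sorted identically by participant and domain. A minimal sketch (new name
# pid_means; assumes tidyr is available via tidyverse) that instead pairs
# per-participant means explicitly before testing:
pid_means <- long_data_clean %>%
  group_by(pID, TIME) %>%
  summarise(m = mean(MEAN_DIFFERENCE), .groups = "drop") %>%
  tidyr::pivot_wider(names_from = TIME, values_from = m)
if (all(c("Past", "Future") %in% colnames(pid_means))) {
  print(t.test(pid_means$Past, pid_means$Future, paired = TRUE))
}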
if ("TIME" %in% anova_table$Effect) { - time_p <- anova_table$p[anova_table$Effect == "TIME"] - if (time_p < 0.05) { - cat("\nTIME Effect: Participants show different levels of change when comparing\n") - cat("past-to-now vs past-to-future perspectives (p =", round(time_p, 5), ")\n") - } else { - cat("\nTIME Effect: No significant difference between past-to-now and past-to-future\n") - cat("perspectives (p =", round(time_p, 5), ")\n") - } - } - - # Interpret DOMAIN effect - if ("DOMAIN" %in% anova_table$Effect) { - domain_p <- anova_table$p[anova_table$Effect == "DOMAIN"] - if (domain_p < 0.05) { - cat("\nDOMAIN Effect: Different domains show different levels of perceived change\n") - cat("(p =", round(domain_p, 5), ")\n") - } else { - cat("\nDOMAIN Effect: No significant differences between domains in perceived change\n") - cat("(p =", round(domain_p, 5), ")\n") - } - } - - # Interpret interaction - if ("TIME:DOMAIN" %in% anova_table$Effect) { - interaction_p <- anova_table$p[anova_table$Effect == "TIME:DOMAIN"] - if (interaction_p < 0.05) { - cat("\nTIME × DOMAIN Interaction: The effect of time perspective on perceived change\n") - cat("varies across domains (p =", round(interaction_p, 5), ")\n") - } else { - cat("\nTIME × DOMAIN Interaction: No significant interaction between time perspective\n") - cat("and domain (p =", round(interaction_p, 5), ")\n") - } - } -} - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("ANALYSIS COMPLETE!\n") -cat(paste(rep("=", 80), collapse = ""), "\n") diff --git a/.history/mixed anova - domain means_20250912125003.r b/.history/mixed anova - domain means_20250912125003.r deleted file mode 100644 index 6954522..0000000 --- a/.history/mixed anova - domain means_20250912125003.r +++ /dev/null @@ -1,567 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -cat("\nChecking available domain mean columns:\n") -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - cat("Warning: Missing variables:", paste(missing_vars, collapse = ", "), "\n") -} else { - cat("All required domain mean variables found!\n") -} - -# ============================================================================= -# STEP 1: DATA PIVOTING TO LONG FORMAT -# ============================================================================= - -cat("STEP 1: DATA PIVOTING TO LONG FORMAT\n") - -# Define domain mapping -domain_mapping <- data.frame( - 
variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -cat("Domain mapping:\n") -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - cat("Warning: Variable", var_name, "not found in data\n") - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "GROUP", "TEMPORAL_DO", "ITEM_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$GROUP <- as.factor(long_data$GROUP) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - long_data$ITEM_DO <- as.factor(long_data$ITEM_DO) - - return(long_data) -} - -# Pivot data to long format -cat("\nPivoting data to long format...\n") -long_data <- pivot_domain_means(data, domain_mapping) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique participants:", length(unique(long_data$pID)), "\n") -cat("TIME levels:", paste(levels(long_data$TIME), collapse = ", "), "\n") -cat("DOMAIN levels:", paste(levels(long_data$DOMAIN), collapse = ", "), "\n") - -# Display structure and sample -cat("\nLong data structure:\n") -str(long_data) - -cat("\nFirst 10 rows of long_data:\n") -print(utils::head(long_data, 10)) - -# Show example data for one participant -cat("\nExample: Participant 1 across all domains and times:\n") -participant_1_data <- long_data[long_data$pID == 1, c("pID", "GROUP", "TEMPORAL_DO", "ITEM_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# STEP 2: ASSUMPTION CHECKING -# ============================================================================= - - -# 2.1 Check for missing values -cat("\n2.1 Missing Values Check:\n") -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("Missing values by TIME and DOMAIN:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\n2.2 Outlier Detection:\n") -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = 
q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\n2.3 Normality Tests:\n") -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\n2.4 Homogeneity of Variance Tests:\n") - -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance across TIME within each DOMAIN:\n") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance across DOMAIN within each TIME:\n") -print(homogeneity_domain) - -# ============================================================================= -# STEP 3: DESCRIPTIVE STATISTICS -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# Overall descriptive statistics -desc_stats <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE), 5), - sd = round(sd(MEAN_DIFFERENCE), 5), - median = round(median(MEAN_DIFFERENCE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75), 5), - min = round(min(MEAN_DIFFERENCE), 5), - max = round(max(MEAN_DIFFERENCE), 5), - .groups = 'drop' - ) - -cat("Descriptive statistics by TIME and DOMAIN:\n") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_group <- long_data_clean %>% - group_by(GROUP, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE), 5), - sd = round(sd(MEAN_DIFFERENCE), 5), - .groups = 'drop' - ) - -cat("\nDescriptive statistics by GROUP, TIME, and DOMAIN:\n") -print(desc_stats_by_group) - -# ============================================================================= -# STEP 4: MIXED ANOVA ANALYSES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Main Mixed ANOVA -cat("\n4.1 Main Mixed ANOVA:\n") -cat("Within-subjects factors: TIME, DOMAIN\n") -cat("Between-subjects factors: GROUP, TEMPORAL_DO, ITEM_DO\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - main_anova <- ezANOVA( - data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within 
= c(TIME, DOMAIN), - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Main ANOVA Results:\n") - print(main_anova) - - # Check sphericity - if (!is.null(main_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(main_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in main ANOVA:", e$message, "\n") - - # Try simpler model without all between-subjects factors - cat("Attempting simpler model with only GROUP as between-subjects factor...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = c(TIME, DOMAIN), - between = GROUP, - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Simplified ANOVA Results:\n") - print(simple_anova) - - main_anova <<- simple_anova - - }, error = function(e2) { - cat("Simplified ANOVA also failed:", e2$message, "\n") - }) -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -domain_results <- list() - -for (domain in levels(long_data_clean$DOMAIN)) { - cat("\nAnalyzing domain:", domain, "\n") - - domain_data <- long_data_clean[long_data_clean$DOMAIN == domain, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = TIME, - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain, ":\n") - print(domain_anova) - - domain_results[[domain]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = TIME, - between = GROUP, - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Time-specific analyses -cat("\n4.3 Time-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -time_results <- list() - -for (time in levels(long_data_clean$TIME)) { - cat("\nAnalyzing time:", time, "\n") - - time_data <- long_data_clean[long_data_clean$TIME == time, ] - - tryCatch({ - time_anova <- ezANOVA( - data = time_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = DOMAIN, - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", time, ":\n") - print(time_anova) - - time_results[[time]] <- time_anova - - }, error = function(e) { - cat("Error in ANOVA for", time, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = time_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = DOMAIN, - between = GROUP, - type = 3, - detailed = TRUE - ) - print(simple_anova) - time_results[[time]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# ============================================================================= -# STEP 5: POST-HOC ANALYSES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 5: POST-HOC ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 5.1 Pairwise comparisons 
for significant effects -if (exists("main_anova") && !is.null(main_anova)) { - cat("\n5.1 Post-hoc comparisons for main effects:\n") - - # Check for significant main effects and interactions - anova_table <- main_anova$ANOVA - - if ("TIME" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "TIME"] < 0.05) { - cat("Significant TIME main effect found. Computing pairwise comparisons...\n") - - # Simple paired t-tests for TIME effect - past_means <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - future_means <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - if (length(past_means) == length(future_means)) { - time_t_test <- t.test(past_means, future_means, paired = TRUE) - cat("Paired t-test for TIME effect:\n") - cat("t =", round(time_t_test$statistic, 5), - ", df =", time_t_test$parameter, - ", p =", round(time_t_test$p.value, 5), "\n") - cat("Mean difference (Past - Future):", round(time_t_test$estimate, 5), "\n") - } - } - - if ("DOMAIN" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "DOMAIN"] < 0.05) { - cat("Significant DOMAIN main effect found.\n") - - # Pairwise comparisons between domains - domain_means <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise(mean_diff = mean(MEAN_DIFFERENCE), .groups = 'drop') - - cat("Domain means:\n") - print(domain_means) - } - - if ("TIME:DOMAIN" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "TIME:DOMAIN"] < 0.05) { - cat("Significant TIME × DOMAIN interaction found.\n") - - # Simple effects analysis - interaction_means <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise(mean_diff = mean(MEAN_DIFFERENCE), .groups = 'drop') - - cat("TIME × DOMAIN interaction means:\n") - print(interaction_means) - } -} - -# ============================================================================= -# STEP 6: EFFECT SIZES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 6: EFFECT SIZES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -if (exists("main_anova") && !is.null(main_anova)) { - anova_table <- main_anova$ANOVA - - # Calculate partial eta squared for each effect - anova_table$partial_eta_squared <- round(anova_table$SSn / (anova_table$SSn + anova_table$SSd), 5) - - cat("Effect sizes (partial eta squared):\n") - effect_sizes <- anova_table[, c("Effect", "partial_eta_squared")] - print(effect_sizes) -} - -# ============================================================================= -# STEP 7: SUMMARY AND INTERPRETATION -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 7: SUMMARY AND INTERPRETATION\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -cat("Analysis Summary:\n") -cat("- Total participants:", length(unique(long_data_clean$pID)), "\n") -cat("- Total observations:", nrow(long_data_clean), "\n") -cat("- Within-subjects factors: TIME (Past vs Future), DOMAIN (Preferences, Personality, Values, Life)\n") -cat("- Between-subjects factors: GROUP, TEMPORAL_DO, ITEM_DO\n") -cat("- Dependent variable: Mean absolute differences in domain ratings\n") - -cat("\nResearch Question:\n") -cat("Do participants rate changes in domains differently from past to now vs past to future?\n") - -if (exists("main_anova") && !is.null(main_anova)) { - anova_table <- main_anova$ANOVA - - cat("\nKey Findings:\n") - - # Check for significant effects - significant_effects <- 
anova_table$Effect[anova_table$p < 0.05] - - if (length(significant_effects) > 0) { - cat("Significant effects found:\n") - for (effect in significant_effects) { - p_val <- anova_table$p[anova_table$Effect == effect] - cat("-", effect, "(p =", round(p_val, 5), ")\n") - } - } else { - cat("No significant effects found at α = 0.05\n") - } - - # Interpret TIME effect - if ("TIME" %in% anova_table$Effect) { - time_p <- anova_table$p[anova_table$Effect == "TIME"] - if (time_p < 0.05) { - cat("\nTIME Effect: Participants show different levels of change when comparing\n") - cat("past-to-now vs past-to-future perspectives (p =", round(time_p, 5), ")\n") - } else { - cat("\nTIME Effect: No significant difference between past-to-now and past-to-future\n") - cat("perspectives (p =", round(time_p, 5), ")\n") - } - } - - # Interpret DOMAIN effect - if ("DOMAIN" %in% anova_table$Effect) { - domain_p <- anova_table$p[anova_table$Effect == "DOMAIN"] - if (domain_p < 0.05) { - cat("\nDOMAIN Effect: Different domains show different levels of perceived change\n") - cat("(p =", round(domain_p, 5), ")\n") - } else { - cat("\nDOMAIN Effect: No significant differences between domains in perceived change\n") - cat("(p =", round(domain_p, 5), ")\n") - } - } - - # Interpret interaction - if ("TIME:DOMAIN" %in% anova_table$Effect) { - interaction_p <- anova_table$p[anova_table$Effect == "TIME:DOMAIN"] - if (interaction_p < 0.05) { - cat("\nTIME × DOMAIN Interaction: The effect of time perspective on perceived change\n") - cat("varies across domains (p =", round(interaction_p, 5), ")\n") - } else { - cat("\nTIME × DOMAIN Interaction: No significant interaction between time perspective\n") - cat("and domain (p =", round(interaction_p, 5), ")\n") - } - } -} - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("ANALYSIS COMPLETE!\n") -cat(paste(rep("=", 80), collapse = ""), "\n") diff --git a/.history/mixed anova - domain means_20250912125007.r b/.history/mixed anova - domain means_20250912125007.r deleted file mode 100644 index d51ac62..0000000 --- a/.history/mixed anova - domain means_20250912125007.r +++ /dev/null @@ -1,573 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -cat("\nChecking available domain mean columns:\n") -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - cat("Warning: Missing variables:", paste(missing_vars, collapse = ", "), "\n") -} else { - 
cat("All required domain mean variables found!\n") -} - -# ============================================================================= -# STEP 1: DATA PIVOTING TO LONG FORMAT -# ============================================================================= - -cat("STEP 1: DATA PIVOTING TO LONG FORMAT\n") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -cat("Domain mapping:\n") -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - cat("Warning: Variable", var_name, "not found in data\n") - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "GROUP", "TEMPORAL_DO", "ITEM_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$GROUP <- as.factor(long_data$GROUP) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - long_data$ITEM_DO <- as.factor(long_data$ITEM_DO) - - return(long_data) -} - -# Pivot data to long format -cat("\nPivoting data to long format...\n") -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) - cat("Data pivoting completed successfully.\n") -}, error = function(e) { - cat("Error in data pivoting:", e$message, "\n") - stop("Cannot proceed without proper data structure") -}) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique participants:", length(unique(long_data$pID)), "\n") -cat("TIME levels:", paste(levels(long_data$TIME), collapse = ", "), "\n") -cat("DOMAIN levels:", paste(levels(long_data$DOMAIN), collapse = ", "), "\n") - -# Display structure and sample -cat("\nLong data structure:\n") -str(long_data) - -cat("\nFirst 10 rows of long_data:\n") -print(utils::head(long_data, 10)) - -# Show example data for one participant -cat("\nExample: Participant 1 across all domains and times:\n") -participant_1_data <- long_data[long_data$pID == 1, c("pID", "GROUP", "TEMPORAL_DO", "ITEM_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# STEP 2: ASSUMPTION CHECKING -# ============================================================================= - - -# 2.1 Check for missing values -cat("\n2.1 Missing Values Check:\n") -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("Missing values by TIME and 
DOMAIN:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\n2.2 Outlier Detection:\n") -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\n2.3 Normality Tests:\n") -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\n2.4 Homogeneity of Variance Tests:\n") - -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance across TIME within each DOMAIN:\n") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance across DOMAIN within each TIME:\n") -print(homogeneity_domain) - -# ============================================================================= -# STEP 3: DESCRIPTIVE STATISTICS -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# Overall descriptive statistics -desc_stats <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE), 5), - sd = round(sd(MEAN_DIFFERENCE), 5), - median = round(median(MEAN_DIFFERENCE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75), 5), - min = round(min(MEAN_DIFFERENCE), 5), - max = round(max(MEAN_DIFFERENCE), 5), - .groups = 'drop' - ) - -cat("Descriptive statistics by TIME and DOMAIN:\n") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_group <- long_data_clean %>% - group_by(GROUP, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE), 5), - sd = round(sd(MEAN_DIFFERENCE), 5), - .groups = 'drop' - ) - -cat("\nDescriptive statistics by GROUP, TIME, and DOMAIN:\n") -print(desc_stats_by_group) - -# ============================================================================= -# STEP 4: MIXED 
ANOVA ANALYSES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Main Mixed ANOVA -cat("\n4.1 Main Mixed ANOVA:\n") -cat("Within-subjects factors: TIME, DOMAIN\n") -cat("Between-subjects factors: GROUP, TEMPORAL_DO, ITEM_DO\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - main_anova <- ezANOVA( - data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = c(TIME, DOMAIN), - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Main ANOVA Results:\n") - print(main_anova) - - # Check sphericity - if (!is.null(main_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(main_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in main ANOVA:", e$message, "\n") - - # Try simpler model without all between-subjects factors - cat("Attempting simpler model with only GROUP as between-subjects factor...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = c(TIME, DOMAIN), - between = GROUP, - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Simplified ANOVA Results:\n") - print(simple_anova) - - main_anova <<- simple_anova - - }, error = function(e2) { - cat("Simplified ANOVA also failed:", e2$message, "\n") - }) -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -domain_results <- list() - -for (domain in levels(long_data_clean$DOMAIN)) { - cat("\nAnalyzing domain:", domain, "\n") - - domain_data <- long_data_clean[long_data_clean$DOMAIN == domain, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = TIME, - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain, ":\n") - print(domain_anova) - - domain_results[[domain]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = TIME, - between = GROUP, - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Time-specific analyses -cat("\n4.3 Time-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -time_results <- list() - -for (time in levels(long_data_clean$TIME)) { - cat("\nAnalyzing time:", time, "\n") - - time_data <- long_data_clean[long_data_clean$TIME == time, ] - - tryCatch({ - time_anova <- ezANOVA( - data = time_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = DOMAIN, - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", time, ":\n") - print(time_anova) - - time_results[[time]] <- time_anova - - }, error = function(e) { - cat("Error in ANOVA for", time, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = time_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = DOMAIN, - 
between = GROUP, - type = 3, - detailed = TRUE - ) - print(simple_anova) - time_results[[time]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# ============================================================================= -# STEP 5: POST-HOC ANALYSES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 5: POST-HOC ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 5.1 Pairwise comparisons for significant effects -if (exists("main_anova") && !is.null(main_anova)) { - cat("\n5.1 Post-hoc comparisons for main effects:\n") - - # Check for significant main effects and interactions - anova_table <- main_anova$ANOVA - - if ("TIME" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "TIME"] < 0.05) { - cat("Significant TIME main effect found. Computing pairwise comparisons...\n") - - # Simple paired t-tests for TIME effect - past_means <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - future_means <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - if (length(past_means) == length(future_means)) { - time_t_test <- t.test(past_means, future_means, paired = TRUE) - cat("Paired t-test for TIME effect:\n") - cat("t =", round(time_t_test$statistic, 5), - ", df =", time_t_test$parameter, - ", p =", round(time_t_test$p.value, 5), "\n") - cat("Mean difference (Past - Future):", round(time_t_test$estimate, 5), "\n") - } - } - - if ("DOMAIN" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "DOMAIN"] < 0.05) { - cat("Significant DOMAIN main effect found.\n") - - # Pairwise comparisons between domains - domain_means <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise(mean_diff = mean(MEAN_DIFFERENCE), .groups = 'drop') - - cat("Domain means:\n") - print(domain_means) - } - - if ("TIME:DOMAIN" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "TIME:DOMAIN"] < 0.05) { - cat("Significant TIME × DOMAIN interaction found.\n") - - # Simple effects analysis - interaction_means <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise(mean_diff = mean(MEAN_DIFFERENCE), .groups = 'drop') - - cat("TIME × DOMAIN interaction means:\n") - print(interaction_means) - } -} - -# ============================================================================= -# STEP 6: EFFECT SIZES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 6: EFFECT SIZES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -if (exists("main_anova") && !is.null(main_anova)) { - anova_table <- main_anova$ANOVA - - # Calculate partial eta squared for each effect - anova_table$partial_eta_squared <- round(anova_table$SSn / (anova_table$SSn + anova_table$SSd), 5) - - cat("Effect sizes (partial eta squared):\n") - effect_sizes <- anova_table[, c("Effect", "partial_eta_squared")] - print(effect_sizes) -} - -# ============================================================================= -# STEP 7: SUMMARY AND INTERPRETATION -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 7: SUMMARY AND INTERPRETATION\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -cat("Analysis Summary:\n") -cat("- Total participants:", length(unique(long_data_clean$pID)), "\n") -cat("- Total observations:", nrow(long_data_clean), "\n") 
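# Hedged aside on the Step 6 effect sizes: with detailed = TRUE, ezANOVA
# already reports generalized eta squared in the `ges` column of its ANOVA
# table, often preferred over hand-computed partial eta squared for mixed
# designs. A minimal sketch, assuming main_anova succeeded:
if (exists("main_anova") && "ges" %in% colnames(main_anova$ANOVA)) {
  cat("\nGeneralized eta squared (ges) from ezANOVA:\n")
  print(main_anova$ANOVA[, c("Effect", "ges")])
}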
-cat("- Within-subjects factors: TIME (Past vs Future), DOMAIN (Preferences, Personality, Values, Life)\n") -cat("- Between-subjects factors: GROUP, TEMPORAL_DO, ITEM_DO\n") -cat("- Dependent variable: Mean absolute differences in domain ratings\n") - -cat("\nResearch Question:\n") -cat("Do participants rate changes in domains differently from past to now vs past to future?\n") - -if (exists("main_anova") && !is.null(main_anova)) { - anova_table <- main_anova$ANOVA - - cat("\nKey Findings:\n") - - # Check for significant effects - significant_effects <- anova_table$Effect[anova_table$p < 0.05] - - if (length(significant_effects) > 0) { - cat("Significant effects found:\n") - for (effect in significant_effects) { - p_val <- anova_table$p[anova_table$Effect == effect] - cat("-", effect, "(p =", round(p_val, 5), ")\n") - } - } else { - cat("No significant effects found at α = 0.05\n") - } - - # Interpret TIME effect - if ("TIME" %in% anova_table$Effect) { - time_p <- anova_table$p[anova_table$Effect == "TIME"] - if (time_p < 0.05) { - cat("\nTIME Effect: Participants show different levels of change when comparing\n") - cat("past-to-now vs past-to-future perspectives (p =", round(time_p, 5), ")\n") - } else { - cat("\nTIME Effect: No significant difference between past-to-now and past-to-future\n") - cat("perspectives (p =", round(time_p, 5), ")\n") - } - } - - # Interpret DOMAIN effect - if ("DOMAIN" %in% anova_table$Effect) { - domain_p <- anova_table$p[anova_table$Effect == "DOMAIN"] - if (domain_p < 0.05) { - cat("\nDOMAIN Effect: Different domains show different levels of perceived change\n") - cat("(p =", round(domain_p, 5), ")\n") - } else { - cat("\nDOMAIN Effect: No significant differences between domains in perceived change\n") - cat("(p =", round(domain_p, 5), ")\n") - } - } - - # Interpret interaction - if ("TIME:DOMAIN" %in% anova_table$Effect) { - interaction_p <- anova_table$p[anova_table$Effect == "TIME:DOMAIN"] - if (interaction_p < 0.05) { - cat("\nTIME × DOMAIN Interaction: The effect of time perspective on perceived change\n") - cat("varies across domains (p =", round(interaction_p, 5), ")\n") - } else { - cat("\nTIME × DOMAIN Interaction: No significant interaction between time perspective\n") - cat("and domain (p =", round(interaction_p, 5), ")\n") - } - } -} - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("ANALYSIS COMPLETE!\n") -cat(paste(rep("=", 80), collapse = ""), "\n") diff --git a/.history/mixed anova - domain means_20250912125012.r b/.history/mixed anova - domain means_20250912125012.r deleted file mode 100644 index f73fe73..0000000 --- a/.history/mixed anova - domain means_20250912125012.r +++ /dev/null @@ -1,580 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available 
-cat("\nChecking available domain mean columns:\n") -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - cat("Warning: Missing variables:", paste(missing_vars, collapse = ", "), "\n") -} else { - cat("All required domain mean variables found!\n") -} - -# ============================================================================= -# STEP 1: DATA PIVOTING TO LONG FORMAT -# ============================================================================= - -cat("STEP 1: DATA PIVOTING TO LONG FORMAT\n") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -cat("Domain mapping:\n") -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - cat("Warning: Variable", var_name, "not found in data\n") - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "GROUP", "TEMPORAL_DO", "ITEM_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$GROUP <- as.factor(long_data$GROUP) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - long_data$ITEM_DO <- as.factor(long_data$ITEM_DO) - - return(long_data) -} - -# Pivot data to long format -cat("\nPivoting data to long format...\n") -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) - cat("Data pivoting completed successfully.\n") -}, error = function(e) { - cat("Error in data pivoting:", e$message, "\n") - stop("Cannot proceed without proper data structure") -}) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique participants:", length(unique(long_data$pID)), "\n") -cat("TIME levels:", paste(levels(long_data$TIME), collapse = ", "), "\n") -cat("DOMAIN levels:", paste(levels(long_data$DOMAIN), collapse = ", "), "\n") - -# Check data types -cat("\nData types check:\n") -cat("TIME is factor:", is.factor(long_data$TIME), "\n") -cat("DOMAIN is factor:", is.factor(long_data$DOMAIN), "\n") -cat("pID is factor:", is.factor(long_data$pID), "\n") -cat("MEAN_DIFFERENCE is numeric:", is.numeric(long_data$MEAN_DIFFERENCE), "\n") - -# Display structure and sample -cat("\nLong data structure:\n") 
-str(long_data) - -cat("\nFirst 10 rows of long_data:\n") -print(utils::head(long_data, 10)) - -# Show example data for one participant -cat("\nExample: Participant 1 across all domains and times:\n") -participant_1_data <- long_data[long_data$pID == 1, c("pID", "GROUP", "TEMPORAL_DO", "ITEM_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# STEP 2: ASSUMPTION CHECKING -# ============================================================================= - - -# 2.1 Check for missing values -cat("\n2.1 Missing Values Check:\n") -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("Missing values by TIME and DOMAIN:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\n2.2 Outlier Detection:\n") -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\n2.3 Normality Tests:\n") -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\n2.4 Homogeneity of Variance Tests:\n") - -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance across TIME within each DOMAIN:\n") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance across DOMAIN within each TIME:\n") -print(homogeneity_domain) - -# ============================================================================= -# STEP 3: DESCRIPTIVE STATISTICS -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# Overall descriptive statistics -desc_stats <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - 
summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE), 5), - sd = round(sd(MEAN_DIFFERENCE), 5), - median = round(median(MEAN_DIFFERENCE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75), 5), - min = round(min(MEAN_DIFFERENCE), 5), - max = round(max(MEAN_DIFFERENCE), 5), - .groups = 'drop' - ) - -cat("Descriptive statistics by TIME and DOMAIN:\n") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_group <- long_data_clean %>% - group_by(GROUP, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE), 5), - sd = round(sd(MEAN_DIFFERENCE), 5), - .groups = 'drop' - ) - -cat("\nDescriptive statistics by GROUP, TIME, and DOMAIN:\n") -print(desc_stats_by_group) - -# ============================================================================= -# STEP 4: MIXED ANOVA ANALYSES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Main Mixed ANOVA -cat("\n4.1 Main Mixed ANOVA:\n") -cat("Within-subjects factors: TIME, DOMAIN\n") -cat("Between-subjects factors: GROUP, TEMPORAL_DO, ITEM_DO\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - main_anova <- ezANOVA( - data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = c(TIME, DOMAIN), - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Main ANOVA Results:\n") - print(main_anova) - - # Check sphericity - if (!is.null(main_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(main_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in main ANOVA:", e$message, "\n") - - # Try simpler model without all between-subjects factors - cat("Attempting simpler model with only GROUP as between-subjects factor...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = c(TIME, DOMAIN), - between = GROUP, - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Simplified ANOVA Results:\n") - print(simple_anova) - - main_anova <<- simple_anova - - }, error = function(e2) { - cat("Simplified ANOVA also failed:", e2$message, "\n") - }) -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -domain_results <- list() - -for (domain in levels(long_data_clean$DOMAIN)) { - cat("\nAnalyzing domain:", domain, "\n") - - domain_data <- long_data_clean[long_data_clean$DOMAIN == domain, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = TIME, - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain, ":\n") - print(domain_anova) - - domain_results[[domain]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = TIME, - between = GROUP, - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 
Time-specific analyses -cat("\n4.3 Time-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -time_results <- list() - -for (time in levels(long_data_clean$TIME)) { - cat("\nAnalyzing time:", time, "\n") - - time_data <- long_data_clean[long_data_clean$TIME == time, ] - - tryCatch({ - time_anova <- ezANOVA( - data = time_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = DOMAIN, - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", time, ":\n") - print(time_anova) - - time_results[[time]] <- time_anova - - }, error = function(e) { - cat("Error in ANOVA for", time, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = time_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = DOMAIN, - between = GROUP, - type = 3, - detailed = TRUE - ) - print(simple_anova) - time_results[[time]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# ============================================================================= -# STEP 5: POST-HOC ANALYSES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 5: POST-HOC ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 5.1 Pairwise comparisons for significant effects -if (exists("main_anova") && !is.null(main_anova)) { - cat("\n5.1 Post-hoc comparisons for main effects:\n") - - # Check for significant main effects and interactions - anova_table <- main_anova$ANOVA - - if ("TIME" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "TIME"] < 0.05) { - cat("Significant TIME main effect found. 
Computing pairwise comparisons...\n") - - # Simple paired t-tests for TIME effect - past_means <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - future_means <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - if (length(past_means) == length(future_means)) { - time_t_test <- t.test(past_means, future_means, paired = TRUE) - cat("Paired t-test for TIME effect:\n") - cat("t =", round(time_t_test$statistic, 5), - ", df =", time_t_test$parameter, - ", p =", round(time_t_test$p.value, 5), "\n") - cat("Mean difference (Past - Future):", round(time_t_test$estimate, 5), "\n") - } - } - - if ("DOMAIN" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "DOMAIN"] < 0.05) { - cat("Significant DOMAIN main effect found.\n") - - # Pairwise comparisons between domains - domain_means <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise(mean_diff = mean(MEAN_DIFFERENCE), .groups = 'drop') - - cat("Domain means:\n") - print(domain_means) - } - - if ("TIME:DOMAIN" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "TIME:DOMAIN"] < 0.05) { - cat("Significant TIME × DOMAIN interaction found.\n") - - # Simple effects analysis - interaction_means <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise(mean_diff = mean(MEAN_DIFFERENCE), .groups = 'drop') - - cat("TIME × DOMAIN interaction means:\n") - print(interaction_means) - } -} - -# ============================================================================= -# STEP 6: EFFECT SIZES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 6: EFFECT SIZES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -if (exists("main_anova") && !is.null(main_anova)) { - anova_table <- main_anova$ANOVA - - # Calculate partial eta squared for each effect - anova_table$partial_eta_squared <- round(anova_table$SSn / (anova_table$SSn + anova_table$SSd), 5) - - cat("Effect sizes (partial eta squared):\n") - effect_sizes <- anova_table[, c("Effect", "partial_eta_squared")] - print(effect_sizes) -} - -# ============================================================================= -# STEP 7: SUMMARY AND INTERPRETATION -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 7: SUMMARY AND INTERPRETATION\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -cat("Analysis Summary:\n") -cat("- Total participants:", length(unique(long_data_clean$pID)), "\n") -cat("- Total observations:", nrow(long_data_clean), "\n") -cat("- Within-subjects factors: TIME (Past vs Future), DOMAIN (Preferences, Personality, Values, Life)\n") -cat("- Between-subjects factors: GROUP, TEMPORAL_DO, ITEM_DO\n") -cat("- Dependent variable: Mean absolute differences in domain ratings\n") - -cat("\nResearch Question:\n") -cat("Do participants rate changes in domains differently from past to now vs past to future?\n") - -if (exists("main_anova") && !is.null(main_anova)) { - anova_table <- main_anova$ANOVA - - cat("\nKey Findings:\n") - - # Check for significant effects - significant_effects <- anova_table$Effect[anova_table$p < 0.05] - - if (length(significant_effects) > 0) { - cat("Significant effects found:\n") - for (effect in significant_effects) { - p_val <- anova_table$p[anova_table$Effect == effect] - cat("-", effect, "(p =", round(p_val, 5), ")\n") - } - } else { - cat("No significant effects found at α = 0.05\n") - } - - # Interpret TIME effect - 
if ("TIME" %in% anova_table$Effect) { - time_p <- anova_table$p[anova_table$Effect == "TIME"] - if (time_p < 0.05) { - cat("\nTIME Effect: Participants show different levels of change when comparing\n") - cat("past-to-now vs past-to-future perspectives (p =", round(time_p, 5), ")\n") - } else { - cat("\nTIME Effect: No significant difference between past-to-now and past-to-future\n") - cat("perspectives (p =", round(time_p, 5), ")\n") - } - } - - # Interpret DOMAIN effect - if ("DOMAIN" %in% anova_table$Effect) { - domain_p <- anova_table$p[anova_table$Effect == "DOMAIN"] - if (domain_p < 0.05) { - cat("\nDOMAIN Effect: Different domains show different levels of perceived change\n") - cat("(p =", round(domain_p, 5), ")\n") - } else { - cat("\nDOMAIN Effect: No significant differences between domains in perceived change\n") - cat("(p =", round(domain_p, 5), ")\n") - } - } - - # Interpret interaction - if ("TIME:DOMAIN" %in% anova_table$Effect) { - interaction_p <- anova_table$p[anova_table$Effect == "TIME:DOMAIN"] - if (interaction_p < 0.05) { - cat("\nTIME × DOMAIN Interaction: The effect of time perspective on perceived change\n") - cat("varies across domains (p =", round(interaction_p, 5), ")\n") - } else { - cat("\nTIME × DOMAIN Interaction: No significant interaction between time perspective\n") - cat("and domain (p =", round(interaction_p, 5), ")\n") - } - } -} - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("ANALYSIS COMPLETE!\n") -cat(paste(rep("=", 80), collapse = ""), "\n") diff --git a/.history/mixed anova - domain means_20250912125017.r b/.history/mixed anova - domain means_20250912125017.r deleted file mode 100644 index f73fe73..0000000 --- a/.history/mixed anova - domain means_20250912125017.r +++ /dev/null @@ -1,580 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -cat("\nChecking available domain mean columns:\n") -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - cat("Warning: Missing variables:", paste(missing_vars, collapse = ", "), "\n") -} else { - cat("All required domain mean variables found!\n") -} - -# ============================================================================= -# STEP 1: DATA PIVOTING TO LONG FORMAT -# ============================================================================= - -cat("STEP 1: DATA PIVOTING TO LONG FORMAT\n") - -# Define domain mapping -domain_mapping <- data.frame( - 
variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -cat("Domain mapping:\n") -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - cat("Warning: Variable", var_name, "not found in data\n") - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "GROUP", "TEMPORAL_DO", "ITEM_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$GROUP <- as.factor(long_data$GROUP) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - long_data$ITEM_DO <- as.factor(long_data$ITEM_DO) - - return(long_data) -} - -# Pivot data to long format -cat("\nPivoting data to long format...\n") -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) - cat("Data pivoting completed successfully.\n") -}, error = function(e) { - cat("Error in data pivoting:", e$message, "\n") - stop("Cannot proceed without proper data structure") -}) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique participants:", length(unique(long_data$pID)), "\n") -cat("TIME levels:", paste(levels(long_data$TIME), collapse = ", "), "\n") -cat("DOMAIN levels:", paste(levels(long_data$DOMAIN), collapse = ", "), "\n") - -# Check data types -cat("\nData types check:\n") -cat("TIME is factor:", is.factor(long_data$TIME), "\n") -cat("DOMAIN is factor:", is.factor(long_data$DOMAIN), "\n") -cat("pID is factor:", is.factor(long_data$pID), "\n") -cat("MEAN_DIFFERENCE is numeric:", is.numeric(long_data$MEAN_DIFFERENCE), "\n") - -# Display structure and sample -cat("\nLong data structure:\n") -str(long_data) - -cat("\nFirst 10 rows of long_data:\n") -print(utils::head(long_data, 10)) - -# Show example data for one participant -cat("\nExample: Participant 1 across all domains and times:\n") -participant_1_data <- long_data[long_data$pID == 1, c("pID", "GROUP", "TEMPORAL_DO", "ITEM_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# STEP 2: ASSUMPTION CHECKING -# ============================================================================= - - -# 2.1 Check for missing values -cat("\n2.1 Missing Values Check:\n") -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("Missing values by TIME and DOMAIN:\n") -print(missing_summary) - -# Remove missing values 
-long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\n2.2 Outlier Detection:\n") -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\n2.3 Normality Tests:\n") -# Use if () rather than ifelse() so the tests are never evaluated when n is out of range -# (ifelse() evaluates both branches eagerly); nortest::ad.test() requires n > 7 -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_p = if (n >= 3 && n <= 5000) shapiro.test(MEAN_DIFFERENCE)$p.value else NA_real_, - anderson_p = if (n >= 8) ad.test(MEAN_DIFFERENCE)$p.value else NA_real_, - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\n2.4 Homogeneity of Variance Tests:\n") - -# Pass each group's rows to leveneTest() explicitly via pick() (dplyr >= 1.1) instead of -# relying on the bare formula resolving the masked columns - -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME, data = pick(MEAN_DIFFERENCE, TIME))$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance across TIME within each DOMAIN:\n") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN, data = pick(MEAN_DIFFERENCE, DOMAIN))$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance across DOMAIN within each TIME:\n") -print(homogeneity_domain) - -# ============================================================================= -# STEP 3: DESCRIPTIVE STATISTICS -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# Overall descriptive statistics -desc_stats <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE), 5), - sd = round(sd(MEAN_DIFFERENCE), 5), - median = round(median(MEAN_DIFFERENCE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75), 5), - min = round(min(MEAN_DIFFERENCE), 5), - max = round(max(MEAN_DIFFERENCE), 5), - .groups = 'drop' - ) - -cat("Descriptive statistics by TIME and DOMAIN:\n") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_group <- long_data_clean %>% - group_by(GROUP, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE), 5), - sd = round(sd(MEAN_DIFFERENCE), 5), - .groups = 'drop' - ) - -cat("\nDescriptive statistics by GROUP, TIME, and DOMAIN:\n") -print(desc_stats_by_group) - -# ============================================================================= -# STEP 4: MIXED ANOVA ANALYSES -#
============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Main Mixed ANOVA -cat("\n4.1 Main Mixed ANOVA:\n") -cat("Within-subjects factors: TIME, DOMAIN\n") -cat("Between-subjects factors: GROUP, TEMPORAL_DO, ITEM_DO\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - main_anova <- ezANOVA( - data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = c(TIME, DOMAIN), - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Main ANOVA Results:\n") - print(main_anova) - - # Check sphericity - if (!is.null(main_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(main_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in main ANOVA:", e$message, "\n") - - # Try simpler model without all between-subjects factors - cat("Attempting simpler model with only GROUP as between-subjects factor...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = c(TIME, DOMAIN), - between = GROUP, - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Simplified ANOVA Results:\n") - print(simple_anova) - - main_anova <<- simple_anova - - }, error = function(e2) { - cat("Simplified ANOVA also failed:", e2$message, "\n") - }) -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -domain_results <- list() - -for (domain in levels(long_data_clean$DOMAIN)) { - cat("\nAnalyzing domain:", domain, "\n") - - domain_data <- long_data_clean[long_data_clean$DOMAIN == domain, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = TIME, - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain, ":\n") - print(domain_anova) - - domain_results[[domain]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = TIME, - between = GROUP, - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Time-specific analyses -cat("\n4.3 Time-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -time_results <- list() - -for (time in levels(long_data_clean$TIME)) { - cat("\nAnalyzing time:", time, "\n") - - time_data <- long_data_clean[long_data_clean$TIME == time, ] - - tryCatch({ - time_anova <- ezANOVA( - data = time_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = DOMAIN, - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", time, ":\n") - print(time_anova) - - time_results[[time]] <- time_anova - - }, error = function(e) { - cat("Error in ANOVA for", time, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = time_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = DOMAIN, - between = GROUP, 
- type = 3, - detailed = TRUE - ) - print(simple_anova) - time_results[[time]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# ============================================================================= -# STEP 5: POST-HOC ANALYSES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 5: POST-HOC ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 5.1 Pairwise comparisons for significant effects -if (exists("main_anova") && !is.null(main_anova)) { - cat("\n5.1 Post-hoc comparisons for main effects:\n") - - # Check for significant main effects and interactions - anova_table <- main_anova$ANOVA - - if ("TIME" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "TIME"] < 0.05) { - cat("Significant TIME main effect found. Computing pairwise comparisons...\n") - - # Simple paired t-tests for TIME effect - past_means <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - future_means <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - if (length(past_means) == length(future_means)) { - time_t_test <- t.test(past_means, future_means, paired = TRUE) - cat("Paired t-test for TIME effect:\n") - cat("t =", round(time_t_test$statistic, 5), - ", df =", time_t_test$parameter, - ", p =", round(time_t_test$p.value, 5), "\n") - cat("Mean difference (Past - Future):", round(time_t_test$estimate, 5), "\n") - } - } - - if ("DOMAIN" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "DOMAIN"] < 0.05) { - cat("Significant DOMAIN main effect found.\n") - - # Pairwise comparisons between domains - domain_means <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise(mean_diff = mean(MEAN_DIFFERENCE), .groups = 'drop') - - cat("Domain means:\n") - print(domain_means) - } - - if ("TIME:DOMAIN" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "TIME:DOMAIN"] < 0.05) { - cat("Significant TIME × DOMAIN interaction found.\n") - - # Simple effects analysis - interaction_means <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise(mean_diff = mean(MEAN_DIFFERENCE), .groups = 'drop') - - cat("TIME × DOMAIN interaction means:\n") - print(interaction_means) - } -} - -# ============================================================================= -# STEP 6: EFFECT SIZES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 6: EFFECT SIZES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -if (exists("main_anova") && !is.null(main_anova)) { - anova_table <- main_anova$ANOVA - - # Calculate partial eta squared for each effect - anova_table$partial_eta_squared <- round(anova_table$SSn / (anova_table$SSn + anova_table$SSd), 5) - - cat("Effect sizes (partial eta squared):\n") - effect_sizes <- anova_table[, c("Effect", "partial_eta_squared")] - print(effect_sizes) -} - -# ============================================================================= -# STEP 7: SUMMARY AND INTERPRETATION -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 7: SUMMARY AND INTERPRETATION\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -cat("Analysis Summary:\n") -cat("- Total participants:", length(unique(long_data_clean$pID)), "\n") -cat("- Total observations:", nrow(long_data_clean), "\n") -cat("- 
Within-subjects factors: TIME (Past vs Future), DOMAIN (Preferences, Personality, Values, Life)\n") -cat("- Between-subjects factors: GROUP, TEMPORAL_DO, ITEM_DO\n") -cat("- Dependent variable: Mean absolute differences in domain ratings\n") - -cat("\nResearch Question:\n") -cat("Do participants rate changes in domains differently from past to now vs past to future?\n") - -if (exists("main_anova") && !is.null(main_anova)) { - anova_table <- main_anova$ANOVA - - cat("\nKey Findings:\n") - - # Check for significant effects - significant_effects <- anova_table$Effect[anova_table$p < 0.05] - - if (length(significant_effects) > 0) { - cat("Significant effects found:\n") - for (effect in significant_effects) { - p_val <- anova_table$p[anova_table$Effect == effect] - cat("-", effect, "(p =", round(p_val, 5), ")\n") - } - } else { - cat("No significant effects found at α = 0.05\n") - } - - # Interpret TIME effect - if ("TIME" %in% anova_table$Effect) { - time_p <- anova_table$p[anova_table$Effect == "TIME"] - if (time_p < 0.05) { - cat("\nTIME Effect: Participants show different levels of change when comparing\n") - cat("past-to-now vs past-to-future perspectives (p =", round(time_p, 5), ")\n") - } else { - cat("\nTIME Effect: No significant difference between past-to-now and past-to-future\n") - cat("perspectives (p =", round(time_p, 5), ")\n") - } - } - - # Interpret DOMAIN effect - if ("DOMAIN" %in% anova_table$Effect) { - domain_p <- anova_table$p[anova_table$Effect == "DOMAIN"] - if (domain_p < 0.05) { - cat("\nDOMAIN Effect: Different domains show different levels of perceived change\n") - cat("(p =", round(domain_p, 5), ")\n") - } else { - cat("\nDOMAIN Effect: No significant differences between domains in perceived change\n") - cat("(p =", round(domain_p, 5), ")\n") - } - } - - # Interpret interaction - if ("TIME:DOMAIN" %in% anova_table$Effect) { - interaction_p <- anova_table$p[anova_table$Effect == "TIME:DOMAIN"] - if (interaction_p < 0.05) { - cat("\nTIME × DOMAIN Interaction: The effect of time perspective on perceived change\n") - cat("varies across domains (p =", round(interaction_p, 5), ")\n") - } else { - cat("\nTIME × DOMAIN Interaction: No significant interaction between time perspective\n") - cat("and domain (p =", round(interaction_p, 5), ")\n") - } - } -} - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("ANALYSIS COMPLETE!\n") -cat(paste(rep("=", 80), collapse = ""), "\n") diff --git a/.history/mixed anova - domain means_20250912125031.r b/.history/mixed anova - domain means_20250912125031.r deleted file mode 100644 index 8f5ec66..0000000 --- a/.history/mixed anova - domain means_20250912125031.r +++ /dev/null @@ -1,584 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -cat("\nChecking 
available domain mean columns:\n") -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - cat("Warning: Missing variables:", paste(missing_vars, collapse = ", "), "\n") -} else { - cat("All required domain mean variables found!\n") -} - -# ============================================================================= -# STEP 1: DATA PIVOTING TO LONG FORMAT -# ============================================================================= - -cat("STEP 1: DATA PIVOTING TO LONG FORMAT\n") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -cat("Domain mapping:\n") -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - cat("Warning: Variable", var_name, "not found in data\n") - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "GROUP", "TEMPORAL_DO", "ITEM_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$GROUP <- as.factor(long_data$GROUP) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - long_data$ITEM_DO <- as.factor(long_data$ITEM_DO) - - return(long_data) -} - -# Pivot data to long format -cat("\nPivoting data to long format...\n") -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) - cat("Data pivoting completed successfully.\n") -}, error = function(e) { - cat("Error in data pivoting:", e$message, "\n") - stop("Cannot proceed without proper data structure") -}) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique participants:", length(unique(long_data$pID)), "\n") -cat("TIME levels:", paste(levels(long_data$TIME), collapse = ", "), "\n") -cat("DOMAIN levels:", paste(levels(long_data$DOMAIN), collapse = ", "), "\n") - -# Check data types -cat("\nData types check:\n") -cat("TIME is factor:", is.factor(long_data$TIME), "\n") -cat("DOMAIN is factor:", is.factor(long_data$DOMAIN), "\n") -cat("pID is factor:", is.factor(long_data$pID), "\n") -cat("MEAN_DIFFERENCE is numeric:", is.numeric(long_data$MEAN_DIFFERENCE), "\n") - -# Show first 20 rows -cat("\nFirst 20 rows of long_data:\n") 
-print(utils::head(long_data, 20)) - -# Display structure and sample -cat("\nLong data structure:\n") -str(long_data) - -cat("\nFirst 10 rows of long_data:\n") -print(utils::head(long_data, 10)) - -# Show example data for one participant -cat("\nExample: Participant 1 across all domains and times:\n") -participant_1_data <- long_data[long_data$pID == 1, c("pID", "GROUP", "TEMPORAL_DO", "ITEM_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# STEP 2: ASSUMPTION CHECKING -# ============================================================================= - - -# 2.1 Check for missing values -cat("\n2.1 Missing Values Check:\n") -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("Missing values by TIME and DOMAIN:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\n2.2 Outlier Detection:\n") -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\n2.3 Normality Tests:\n") -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\n2.4 Homogeneity of Variance Tests:\n") - -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance across TIME within each DOMAIN:\n") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance across DOMAIN within each TIME:\n") -print(homogeneity_domain) - -# ============================================================================= -# STEP 3: DESCRIPTIVE STATISTICS -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 
Overall descriptive statistics -desc_stats <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE), 5), - sd = round(sd(MEAN_DIFFERENCE), 5), - median = round(median(MEAN_DIFFERENCE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75), 5), - min = round(min(MEAN_DIFFERENCE), 5), - max = round(max(MEAN_DIFFERENCE), 5), - .groups = 'drop' - ) - -cat("Descriptive statistics by TIME and DOMAIN:\n") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_group <- long_data_clean %>% - group_by(GROUP, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE), 5), - sd = round(sd(MEAN_DIFFERENCE), 5), - .groups = 'drop' - ) - -cat("\nDescriptive statistics by GROUP, TIME, and DOMAIN:\n") -print(desc_stats_by_group) - -# ============================================================================= -# STEP 4: MIXED ANOVA ANALYSES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Main Mixed ANOVA -cat("\n4.1 Main Mixed ANOVA:\n") -cat("Within-subjects factors: TIME, DOMAIN\n") -cat("Between-subjects factors: GROUP, TEMPORAL_DO, ITEM_DO\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - main_anova <- ezANOVA( - data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = c(TIME, DOMAIN), - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Main ANOVA Results:\n") - print(main_anova) - - # Check sphericity - if (!is.null(main_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(main_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in main ANOVA:", e$message, "\n") - - # Try simpler model without all between-subjects factors - cat("Attempting simpler model with only GROUP as between-subjects factor...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = c(TIME, DOMAIN), - between = GROUP, - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Simplified ANOVA Results:\n") - print(simple_anova) - - main_anova <<- simple_anova - - }, error = function(e2) { - cat("Simplified ANOVA also failed:", e2$message, "\n") - }) -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -domain_results <- list() - -for (domain in levels(long_data_clean$DOMAIN)) { - cat("\nAnalyzing domain:", domain, "\n") - - domain_data <- long_data_clean[long_data_clean$DOMAIN == domain, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = TIME, - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain, ":\n") - print(domain_anova) - - domain_results[[domain]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = TIME, - between = GROUP, - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain]] <- simple_anova - }, 
error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Time-specific analyses -cat("\n4.3 Time-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -time_results <- list() - -for (time in levels(long_data_clean$TIME)) { - cat("\nAnalyzing time:", time, "\n") - - time_data <- long_data_clean[long_data_clean$TIME == time, ] - - tryCatch({ - time_anova <- ezANOVA( - data = time_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = DOMAIN, - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", time, ":\n") - print(time_anova) - - time_results[[time]] <- time_anova - - }, error = function(e) { - cat("Error in ANOVA for", time, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = time_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = DOMAIN, - between = GROUP, - type = 3, - detailed = TRUE - ) - print(simple_anova) - time_results[[time]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# ============================================================================= -# STEP 5: POST-HOC ANALYSES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 5: POST-HOC ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 5.1 Pairwise comparisons for significant effects -if (exists("main_anova") && !is.null(main_anova)) { - cat("\n5.1 Post-hoc comparisons for main effects:\n") - - # Check for significant main effects and interactions - anova_table <- main_anova$ANOVA - - if ("TIME" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "TIME"] < 0.05) { - cat("Significant TIME main effect found. 
Computing pairwise comparisons...\n") - - # Simple paired t-tests for TIME effect - past_means <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - future_means <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - if (length(past_means) == length(future_means)) { - time_t_test <- t.test(past_means, future_means, paired = TRUE) - cat("Paired t-test for TIME effect:\n") - cat("t =", round(time_t_test$statistic, 5), - ", df =", time_t_test$parameter, - ", p =", round(time_t_test$p.value, 5), "\n") - cat("Mean difference (Past - Future):", round(time_t_test$estimate, 5), "\n") - } - } - - if ("DOMAIN" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "DOMAIN"] < 0.05) { - cat("Significant DOMAIN main effect found.\n") - - # Pairwise comparisons between domains - domain_means <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise(mean_diff = mean(MEAN_DIFFERENCE), .groups = 'drop') - - cat("Domain means:\n") - print(domain_means) - } - - if ("TIME:DOMAIN" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "TIME:DOMAIN"] < 0.05) { - cat("Significant TIME × DOMAIN interaction found.\n") - - # Simple effects analysis - interaction_means <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise(mean_diff = mean(MEAN_DIFFERENCE), .groups = 'drop') - - cat("TIME × DOMAIN interaction means:\n") - print(interaction_means) - } -} - -# ============================================================================= -# STEP 6: EFFECT SIZES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 6: EFFECT SIZES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -if (exists("main_anova") && !is.null(main_anova)) { - anova_table <- main_anova$ANOVA - - # Calculate partial eta squared for each effect - anova_table$partial_eta_squared <- round(anova_table$SSn / (anova_table$SSn + anova_table$SSd), 5) - - cat("Effect sizes (partial eta squared):\n") - effect_sizes <- anova_table[, c("Effect", "partial_eta_squared")] - print(effect_sizes) -} - -# ============================================================================= -# STEP 7: SUMMARY AND INTERPRETATION -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 7: SUMMARY AND INTERPRETATION\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -cat("Analysis Summary:\n") -cat("- Total participants:", length(unique(long_data_clean$pID)), "\n") -cat("- Total observations:", nrow(long_data_clean), "\n") -cat("- Within-subjects factors: TIME (Past vs Future), DOMAIN (Preferences, Personality, Values, Life)\n") -cat("- Between-subjects factors: GROUP, TEMPORAL_DO, ITEM_DO\n") -cat("- Dependent variable: Mean absolute differences in domain ratings\n") - -cat("\nResearch Question:\n") -cat("Do participants rate changes in domains differently from past to now vs past to future?\n") - -if (exists("main_anova") && !is.null(main_anova)) { - anova_table <- main_anova$ANOVA - - cat("\nKey Findings:\n") - - # Check for significant effects - significant_effects <- anova_table$Effect[anova_table$p < 0.05] - - if (length(significant_effects) > 0) { - cat("Significant effects found:\n") - for (effect in significant_effects) { - p_val <- anova_table$p[anova_table$Effect == effect] - cat("-", effect, "(p =", round(p_val, 5), ")\n") - } - } else { - cat("No significant effects found at α = 0.05\n") - } - - # Interpret TIME effect - 
if ("TIME" %in% anova_table$Effect) { - time_p <- anova_table$p[anova_table$Effect == "TIME"] - if (time_p < 0.05) { - cat("\nTIME Effect: Participants show different levels of change when comparing\n") - cat("past-to-now vs past-to-future perspectives (p =", round(time_p, 5), ")\n") - } else { - cat("\nTIME Effect: No significant difference between past-to-now and past-to-future\n") - cat("perspectives (p =", round(time_p, 5), ")\n") - } - } - - # Interpret DOMAIN effect - if ("DOMAIN" %in% anova_table$Effect) { - domain_p <- anova_table$p[anova_table$Effect == "DOMAIN"] - if (domain_p < 0.05) { - cat("\nDOMAIN Effect: Different domains show different levels of perceived change\n") - cat("(p =", round(domain_p, 5), ")\n") - } else { - cat("\nDOMAIN Effect: No significant differences between domains in perceived change\n") - cat("(p =", round(domain_p, 5), ")\n") - } - } - - # Interpret interaction - if ("TIME:DOMAIN" %in% anova_table$Effect) { - interaction_p <- anova_table$p[anova_table$Effect == "TIME:DOMAIN"] - if (interaction_p < 0.05) { - cat("\nTIME × DOMAIN Interaction: The effect of time perspective on perceived change\n") - cat("varies across domains (p =", round(interaction_p, 5), ")\n") - } else { - cat("\nTIME × DOMAIN Interaction: No significant interaction between time perspective\n") - cat("and domain (p =", round(interaction_p, 5), ")\n") - } - } -} - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("ANALYSIS COMPLETE!\n") -cat(paste(rep("=", 80), collapse = ""), "\n") diff --git a/.history/mixed anova - domain means_20250912125040.r b/.history/mixed anova - domain means_20250912125040.r deleted file mode 100644 index 8f5ec66..0000000 --- a/.history/mixed anova - domain means_20250912125040.r +++ /dev/null @@ -1,584 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -cat("\nChecking available domain mean columns:\n") -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - cat("Warning: Missing variables:", paste(missing_vars, collapse = ", "), "\n") -} else { - cat("All required domain mean variables found!\n") -} - -# ============================================================================= -# STEP 1: DATA PIVOTING TO LONG FORMAT -# ============================================================================= - -cat("STEP 1: DATA PIVOTING TO LONG FORMAT\n") - -# Define domain mapping -domain_mapping <- data.frame( - 
variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -cat("Domain mapping:\n") -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - cat("Warning: Variable", var_name, "not found in data\n") - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "GROUP", "TEMPORAL_DO", "ITEM_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$GROUP <- as.factor(long_data$GROUP) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - long_data$ITEM_DO <- as.factor(long_data$ITEM_DO) - - return(long_data) -} - -# Pivot data to long format -cat("\nPivoting data to long format...\n") -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) - cat("Data pivoting completed successfully.\n") -}, error = function(e) { - cat("Error in data pivoting:", e$message, "\n") - stop("Cannot proceed without proper data structure") -}) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique participants:", length(unique(long_data$pID)), "\n") -cat("TIME levels:", paste(levels(long_data$TIME), collapse = ", "), "\n") -cat("DOMAIN levels:", paste(levels(long_data$DOMAIN), collapse = ", "), "\n") - -# Check data types -cat("\nData types check:\n") -cat("TIME is factor:", is.factor(long_data$TIME), "\n") -cat("DOMAIN is factor:", is.factor(long_data$DOMAIN), "\n") -cat("pID is factor:", is.factor(long_data$pID), "\n") -cat("MEAN_DIFFERENCE is numeric:", is.numeric(long_data$MEAN_DIFFERENCE), "\n") - -# Show first 20 rows -cat("\nFirst 20 rows of long_data:\n") -print(utils::head(long_data, 20)) - -# Display structure and sample -cat("\nLong data structure:\n") -str(long_data) - -cat("\nFirst 10 rows of long_data:\n") -print(utils::head(long_data, 10)) - -# Show example data for one participant -cat("\nExample: Participant 1 across all domains and times:\n") -participant_1_data <- long_data[long_data$pID == 1, c("pID", "GROUP", "TEMPORAL_DO", "ITEM_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# STEP 2: ASSUMPTION CHECKING -# ============================================================================= - - -# 2.1 Check for missing values -cat("\n2.1 Missing Values Check:\n") -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - 
) - -cat("Missing values by TIME and DOMAIN:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\n2.2 Outlier Detection:\n") -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\n2.3 Normality Tests:\n") -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\n2.4 Homogeneity of Variance Tests:\n") - -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance across TIME within each DOMAIN:\n") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance across DOMAIN within each TIME:\n") -print(homogeneity_domain) - -# ============================================================================= -# STEP 3: DESCRIPTIVE STATISTICS -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# Overall descriptive statistics -desc_stats <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE), 5), - sd = round(sd(MEAN_DIFFERENCE), 5), - median = round(median(MEAN_DIFFERENCE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75), 5), - min = round(min(MEAN_DIFFERENCE), 5), - max = round(max(MEAN_DIFFERENCE), 5), - .groups = 'drop' - ) - -cat("Descriptive statistics by TIME and DOMAIN:\n") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_group <- long_data_clean %>% - group_by(GROUP, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE), 5), - sd = round(sd(MEAN_DIFFERENCE), 5), - .groups = 'drop' - ) - -cat("\nDescriptive statistics by GROUP, TIME, and DOMAIN:\n") -print(desc_stats_by_group) - -# 
============================================================================= -# STEP 4: MIXED ANOVA ANALYSES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Main Mixed ANOVA -cat("\n4.1 Main Mixed ANOVA:\n") -cat("Within-subjects factors: TIME, DOMAIN\n") -cat("Between-subjects factors: GROUP, TEMPORAL_DO, ITEM_DO\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - main_anova <- ezANOVA( - data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = c(TIME, DOMAIN), - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Main ANOVA Results:\n") - print(main_anova) - - # Check sphericity - if (!is.null(main_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(main_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in main ANOVA:", e$message, "\n") - - # Try simpler model without all between-subjects factors - cat("Attempting simpler model with only GROUP as between-subjects factor...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = c(TIME, DOMAIN), - between = GROUP, - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Simplified ANOVA Results:\n") - print(simple_anova) - - main_anova <<- simple_anova - - }, error = function(e2) { - cat("Simplified ANOVA also failed:", e2$message, "\n") - }) -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -domain_results <- list() - -for (domain in levels(long_data_clean$DOMAIN)) { - cat("\nAnalyzing domain:", domain, "\n") - - domain_data <- long_data_clean[long_data_clean$DOMAIN == domain, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = TIME, - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain, ":\n") - print(domain_anova) - - domain_results[[domain]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = TIME, - between = GROUP, - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Time-specific analyses -cat("\n4.3 Time-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -time_results <- list() - -for (time in levels(long_data_clean$TIME)) { - cat("\nAnalyzing time:", time, "\n") - - time_data <- long_data_clean[long_data_clean$TIME == time, ] - - tryCatch({ - time_anova <- ezANOVA( - data = time_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = DOMAIN, - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", time, ":\n") - print(time_anova) - - time_results[[time]] <- time_anova - - }, error = function(e) { - cat("Error in ANOVA for", time, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - 
simple_anova <- ezANOVA( - data = time_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = DOMAIN, - between = GROUP, - type = 3, - detailed = TRUE - ) - print(simple_anova) - time_results[[time]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# ============================================================================= -# STEP 5: POST-HOC ANALYSES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 5: POST-HOC ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 5.1 Pairwise comparisons for significant effects -if (exists("main_anova") && !is.null(main_anova)) { - cat("\n5.1 Post-hoc comparisons for main effects:\n") - - # Check for significant main effects and interactions - anova_table <- main_anova$ANOVA - - if ("TIME" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "TIME"] < 0.05) { - cat("Significant TIME main effect found. Computing pairwise comparisons...\n") - - # Simple paired t-tests for TIME effect - past_means <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - future_means <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - if (length(past_means) == length(future_means)) { - time_t_test <- t.test(past_means, future_means, paired = TRUE) - cat("Paired t-test for TIME effect:\n") - cat("t =", round(time_t_test$statistic, 5), - ", df =", time_t_test$parameter, - ", p =", round(time_t_test$p.value, 5), "\n") - cat("Mean difference (Past - Future):", round(time_t_test$estimate, 5), "\n") - } - } - - if ("DOMAIN" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "DOMAIN"] < 0.05) { - cat("Significant DOMAIN main effect found.\n") - - # Pairwise comparisons between domains - domain_means <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise(mean_diff = mean(MEAN_DIFFERENCE), .groups = 'drop') - - cat("Domain means:\n") - print(domain_means) - } - - if ("TIME:DOMAIN" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "TIME:DOMAIN"] < 0.05) { - cat("Significant TIME × DOMAIN interaction found.\n") - - # Simple effects analysis - interaction_means <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise(mean_diff = mean(MEAN_DIFFERENCE), .groups = 'drop') - - cat("TIME × DOMAIN interaction means:\n") - print(interaction_means) - } -} - -# ============================================================================= -# STEP 6: EFFECT SIZES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 6: EFFECT SIZES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -if (exists("main_anova") && !is.null(main_anova)) { - anova_table <- main_anova$ANOVA - - # Calculate partial eta squared for each effect - anova_table$partial_eta_squared <- round(anova_table$SSn / (anova_table$SSn + anova_table$SSd), 5) - - cat("Effect sizes (partial eta squared):\n") - effect_sizes <- anova_table[, c("Effect", "partial_eta_squared")] - print(effect_sizes) -} - -# ============================================================================= -# STEP 7: SUMMARY AND INTERPRETATION -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 7: SUMMARY AND INTERPRETATION\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -cat("Analysis Summary:\n") -cat("- Total 
participants:", length(unique(long_data_clean$pID)), "\n") -cat("- Total observations:", nrow(long_data_clean), "\n") -cat("- Within-subjects factors: TIME (Past vs Future), DOMAIN (Preferences, Personality, Values, Life)\n") -cat("- Between-subjects factors: GROUP, TEMPORAL_DO, ITEM_DO\n") -cat("- Dependent variable: Mean absolute differences in domain ratings\n") - -cat("\nResearch Question:\n") -cat("Do participants rate changes in domains differently from past to now vs past to future?\n") - -if (exists("main_anova") && !is.null(main_anova)) { - anova_table <- main_anova$ANOVA - - cat("\nKey Findings:\n") - - # Check for significant effects - significant_effects <- anova_table$Effect[anova_table$p < 0.05] - - if (length(significant_effects) > 0) { - cat("Significant effects found:\n") - for (effect in significant_effects) { - p_val <- anova_table$p[anova_table$Effect == effect] - cat("-", effect, "(p =", round(p_val, 5), ")\n") - } - } else { - cat("No significant effects found at α = 0.05\n") - } - - # Interpret TIME effect - if ("TIME" %in% anova_table$Effect) { - time_p <- anova_table$p[anova_table$Effect == "TIME"] - if (time_p < 0.05) { - cat("\nTIME Effect: Participants show different levels of change when comparing\n") - cat("past-to-now vs past-to-future perspectives (p =", round(time_p, 5), ")\n") - } else { - cat("\nTIME Effect: No significant difference between past-to-now and past-to-future\n") - cat("perspectives (p =", round(time_p, 5), ")\n") - } - } - - # Interpret DOMAIN effect - if ("DOMAIN" %in% anova_table$Effect) { - domain_p <- anova_table$p[anova_table$Effect == "DOMAIN"] - if (domain_p < 0.05) { - cat("\nDOMAIN Effect: Different domains show different levels of perceived change\n") - cat("(p =", round(domain_p, 5), ")\n") - } else { - cat("\nDOMAIN Effect: No significant differences between domains in perceived change\n") - cat("(p =", round(domain_p, 5), ")\n") - } - } - - # Interpret interaction - if ("TIME:DOMAIN" %in% anova_table$Effect) { - interaction_p <- anova_table$p[anova_table$Effect == "TIME:DOMAIN"] - if (interaction_p < 0.05) { - cat("\nTIME × DOMAIN Interaction: The effect of time perspective on perceived change\n") - cat("varies across domains (p =", round(interaction_p, 5), ")\n") - } else { - cat("\nTIME × DOMAIN Interaction: No significant interaction between time perspective\n") - cat("and domain (p =", round(interaction_p, 5), ")\n") - } - } -} - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("ANALYSIS COMPLETE!\n") -cat(paste(rep("=", 80), collapse = ""), "\n") diff --git a/.history/mixed anova - domain means_20250912125046.r b/.history/mixed anova - domain means_20250912125046.r deleted file mode 100644 index 8f5ec66..0000000 --- a/.history/mixed anova - domain means_20250912125046.r +++ /dev/null @@ -1,584 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") 
-print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -cat("\nChecking available domain mean columns:\n") -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - cat("Warning: Missing variables:", paste(missing_vars, collapse = ", "), "\n") -} else { - cat("All required domain mean variables found!\n") -} - -# ============================================================================= -# STEP 1: DATA PIVOTING TO LONG FORMAT -# ============================================================================= - -cat("STEP 1: DATA PIVOTING TO LONG FORMAT\n") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -cat("Domain mapping:\n") -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - cat("Warning: Variable", var_name, "not found in data\n") - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "GROUP", "TEMPORAL_DO", "ITEM_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$GROUP <- as.factor(long_data$GROUP) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - long_data$ITEM_DO <- as.factor(long_data$ITEM_DO) - - return(long_data) -} - -# Pivot data to long format -cat("\nPivoting data to long format...\n") -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) - cat("Data pivoting completed successfully.\n") -}, error = function(e) { - cat("Error in data pivoting:", e$message, "\n") - stop("Cannot proceed without proper data structure") -}) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique participants:", length(unique(long_data$pID)), "\n") -cat("TIME levels:", paste(levels(long_data$TIME), collapse = ", "), "\n") -cat("DOMAIN levels:", paste(levels(long_data$DOMAIN), collapse = ", "), "\n") - -# Check data types -cat("\nData types check:\n") -cat("TIME is factor:", is.factor(long_data$TIME), "\n") -cat("DOMAIN is factor:", is.factor(long_data$DOMAIN), "\n") -cat("pID is factor:", is.factor(long_data$pID), "\n") -cat("MEAN_DIFFERENCE is numeric:", 
is.numeric(long_data$MEAN_DIFFERENCE), "\n") - -# Show first 20 rows -cat("\nFirst 20 rows of long_data:\n") -print(utils::head(long_data, 20)) - -# Display structure and sample -cat("\nLong data structure:\n") -str(long_data) - -cat("\nFirst 10 rows of long_data:\n") -print(utils::head(long_data, 10)) - -# Show example data for one participant -cat("\nExample: Participant 1 across all domains and times:\n") -participant_1_data <- long_data[long_data$pID == 1, c("pID", "GROUP", "TEMPORAL_DO", "ITEM_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# STEP 2: ASSUMPTION CHECKING -# ============================================================================= - - -# 2.1 Check for missing values -cat("\n2.1 Missing Values Check:\n") -missing_summary <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(MEAN_DIFFERENCE)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("Missing values by TIME and DOMAIN:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\n2.2 Outlier Detection:\n") -outlier_summary <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = mean(MEAN_DIFFERENCE), - sd = sd(MEAN_DIFFERENCE), - q1 = quantile(MEAN_DIFFERENCE, 0.25), - q3 = quantile(MEAN_DIFFERENCE, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\n2.3 Normality Tests:\n") -normality_results <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(MEAN_DIFFERENCE)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(MEAN_DIFFERENCE)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\n2.4 Homogeneity of Variance Tests:\n") - -# Test homogeneity across TIME within each DOMAIN -homogeneity_time <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance across TIME within each DOMAIN:\n") -print(homogeneity_time) - -# Test homogeneity across DOMAIN within each TIME -homogeneity_domain <- long_data_clean %>% - group_by(TIME) %>% - summarise( - levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance across DOMAIN within each TIME:\n") -print(homogeneity_domain) - -# ============================================================================= -# STEP 3: DESCRIPTIVE STATISTICS -# ============================================================================= - -cat("\n", paste(rep("=", 80), 
collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# Overall descriptive statistics -desc_stats <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE), 5), - sd = round(sd(MEAN_DIFFERENCE), 5), - median = round(median(MEAN_DIFFERENCE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75), 5), - min = round(min(MEAN_DIFFERENCE), 5), - max = round(max(MEAN_DIFFERENCE), 5), - .groups = 'drop' - ) - -cat("Descriptive statistics by TIME and DOMAIN:\n") -print(desc_stats) - -# Descriptive statistics by between-subjects factors -desc_stats_by_group <- long_data_clean %>% - group_by(GROUP, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE), 5), - sd = round(sd(MEAN_DIFFERENCE), 5), - .groups = 'drop' - ) - -cat("\nDescriptive statistics by GROUP, TIME, and DOMAIN:\n") -print(desc_stats_by_group) - -# ============================================================================= -# STEP 4: MIXED ANOVA ANALYSES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Main Mixed ANOVA -cat("\n4.1 Main Mixed ANOVA:\n") -cat("Within-subjects factors: TIME, DOMAIN\n") -cat("Between-subjects factors: GROUP, TEMPORAL_DO, ITEM_DO\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - main_anova <- ezANOVA( - data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = c(TIME, DOMAIN), - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Main ANOVA Results:\n") - print(main_anova) - - # Check sphericity - if (!is.null(main_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(main_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in main ANOVA:", e$message, "\n") - - # Try simpler model without all between-subjects factors - cat("Attempting simpler model with only GROUP as between-subjects factor...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = long_data_clean, - dv = MEAN_DIFFERENCE, - wid = pID, - within = c(TIME, DOMAIN), - between = GROUP, - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Simplified ANOVA Results:\n") - print(simple_anova) - - main_anova <<- simple_anova - - }, error = function(e2) { - cat("Simplified ANOVA also failed:", e2$message, "\n") - }) -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -domain_results <- list() - -for (domain in levels(long_data_clean$DOMAIN)) { - cat("\nAnalyzing domain:", domain, "\n") - - domain_data <- long_data_clean[long_data_clean$DOMAIN == domain, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = TIME, - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain, ":\n") - print(domain_anova) - - domain_results[[domain]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = TIME, - between 
= GROUP, - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Time-specific analyses -cat("\n4.3 Time-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -time_results <- list() - -for (time in levels(long_data_clean$TIME)) { - cat("\nAnalyzing time:", time, "\n") - - time_data <- long_data_clean[long_data_clean$TIME == time, ] - - tryCatch({ - time_anova <- ezANOVA( - data = time_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = DOMAIN, - between = c(GROUP, TEMPORAL_DO, ITEM_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", time, ":\n") - print(time_anova) - - time_results[[time]] <- time_anova - - }, error = function(e) { - cat("Error in ANOVA for", time, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = time_data, - dv = MEAN_DIFFERENCE, - wid = pID, - within = DOMAIN, - between = GROUP, - type = 3, - detailed = TRUE - ) - print(simple_anova) - time_results[[time]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# ============================================================================= -# STEP 5: POST-HOC ANALYSES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 5: POST-HOC ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 5.1 Pairwise comparisons for significant effects -if (exists("main_anova") && !is.null(main_anova)) { - cat("\n5.1 Post-hoc comparisons for main effects:\n") - - # Check for significant main effects and interactions - anova_table <- main_anova$ANOVA - - if ("TIME" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "TIME"] < 0.05) { - cat("Significant TIME main effect found. 
Computing pairwise comparisons...\n") - - # Simple paired t-tests for TIME effect - past_means <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Past"] - future_means <- long_data_clean$MEAN_DIFFERENCE[long_data_clean$TIME == "Future"] - - if (length(past_means) == length(future_means)) { - time_t_test <- t.test(past_means, future_means, paired = TRUE) - cat("Paired t-test for TIME effect:\n") - cat("t =", round(time_t_test$statistic, 5), - ", df =", time_t_test$parameter, - ", p =", round(time_t_test$p.value, 5), "\n") - cat("Mean difference (Past - Future):", round(time_t_test$estimate, 5), "\n") - } - } - - if ("DOMAIN" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "DOMAIN"] < 0.05) { - cat("Significant DOMAIN main effect found.\n") - - # Pairwise comparisons between domains - domain_means <- long_data_clean %>% - group_by(DOMAIN) %>% - summarise(mean_diff = mean(MEAN_DIFFERENCE), .groups = 'drop') - - cat("Domain means:\n") - print(domain_means) - } - - if ("TIME:DOMAIN" %in% anova_table$Effect && anova_table$p[anova_table$Effect == "TIME:DOMAIN"] < 0.05) { - cat("Significant TIME × DOMAIN interaction found.\n") - - # Simple effects analysis - interaction_means <- long_data_clean %>% - group_by(TIME, DOMAIN) %>% - summarise(mean_diff = mean(MEAN_DIFFERENCE), .groups = 'drop') - - cat("TIME × DOMAIN interaction means:\n") - print(interaction_means) - } -} - -# ============================================================================= -# STEP 6: EFFECT SIZES -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 6: EFFECT SIZES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -if (exists("main_anova") && !is.null(main_anova)) { - anova_table <- main_anova$ANOVA - - # Calculate partial eta squared for each effect - anova_table$partial_eta_squared <- round(anova_table$SSn / (anova_table$SSn + anova_table$SSd), 5) - - cat("Effect sizes (partial eta squared):\n") - effect_sizes <- anova_table[, c("Effect", "partial_eta_squared")] - print(effect_sizes) -} - -# ============================================================================= -# STEP 7: SUMMARY AND INTERPRETATION -# ============================================================================= - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 7: SUMMARY AND INTERPRETATION\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -cat("Analysis Summary:\n") -cat("- Total participants:", length(unique(long_data_clean$pID)), "\n") -cat("- Total observations:", nrow(long_data_clean), "\n") -cat("- Within-subjects factors: TIME (Past vs Future), DOMAIN (Preferences, Personality, Values, Life)\n") -cat("- Between-subjects factors: GROUP, TEMPORAL_DO, ITEM_DO\n") -cat("- Dependent variable: Mean absolute differences in domain ratings\n") - -cat("\nResearch Question:\n") -cat("Do participants rate changes in domains differently from past to now vs past to future?\n") - -if (exists("main_anova") && !is.null(main_anova)) { - anova_table <- main_anova$ANOVA - - cat("\nKey Findings:\n") - - # Check for significant effects - significant_effects <- anova_table$Effect[anova_table$p < 0.05] - - if (length(significant_effects) > 0) { - cat("Significant effects found:\n") - for (effect in significant_effects) { - p_val <- anova_table$p[anova_table$Effect == effect] - cat("-", effect, "(p =", round(p_val, 5), ")\n") - } - } else { - cat("No significant effects found at α = 0.05\n") - } - - # Interpret TIME effect - 
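-# Note: the p-values read from anova_table below are the uncorrected Type III values; when
-# Mauchly's test flags the four-level DOMAIN factor, the Greenhouse-Geisser p[GG] values that
-# ezANOVA returns under $`Sphericity Corrections` are the safer ones to report.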
if ("TIME" %in% anova_table$Effect) { - time_p <- anova_table$p[anova_table$Effect == "TIME"] - if (time_p < 0.05) { - cat("\nTIME Effect: Participants show different levels of change when comparing\n") - cat("past-to-now vs past-to-future perspectives (p =", round(time_p, 5), ")\n") - } else { - cat("\nTIME Effect: No significant difference between past-to-now and past-to-future\n") - cat("perspectives (p =", round(time_p, 5), ")\n") - } - } - - # Interpret DOMAIN effect - if ("DOMAIN" %in% anova_table$Effect) { - domain_p <- anova_table$p[anova_table$Effect == "DOMAIN"] - if (domain_p < 0.05) { - cat("\nDOMAIN Effect: Different domains show different levels of perceived change\n") - cat("(p =", round(domain_p, 5), ")\n") - } else { - cat("\nDOMAIN Effect: No significant differences between domains in perceived change\n") - cat("(p =", round(domain_p, 5), ")\n") - } - } - - # Interpret interaction - if ("TIME:DOMAIN" %in% anova_table$Effect) { - interaction_p <- anova_table$p[anova_table$Effect == "TIME:DOMAIN"] - if (interaction_p < 0.05) { - cat("\nTIME × DOMAIN Interaction: The effect of time perspective on perceived change\n") - cat("varies across domains (p =", round(interaction_p, 5), ")\n") - } else { - cat("\nTIME × DOMAIN Interaction: No significant interaction between time perspective\n") - cat("and domain (p =", round(interaction_p, 5), ")\n") - } - } -} - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("ANALYSIS COMPLETE!\n") -cat(paste(rep("=", 80), collapse = ""), "\n") diff --git a/.history/mixed anova - domain means_20250912130804.r b/.history/mixed anova - domain means_20250912130804.r deleted file mode 100644 index dbdffd0..0000000 --- a/.history/mixed anova - domain means_20250912130804.r +++ /dev/null @@ -1,131 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# ============================================================================= -# STEP 1: DATA PIVOTING TO LONG FORMAT -# ============================================================================= - -cat("STEP 1: DATA PIVOTING TO LONG FORMAT\n") - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", 
"NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -cat("Domain mapping:\n") -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - cat("Warning: Variable", var_name, "not found in data\n") - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "GROUP", "TEMPORAL_DO", "ITEM_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$GROUP <- as.factor(long_data$GROUP) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - long_data$ITEM_DO <- as.factor(long_data$ITEM_DO) - - return(long_data) -} - -# Pivot data to long format -cat("\nPivoting data to long format...\n") -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) - cat("Data pivoting completed successfully.\n") -}, error = function(e) { - cat("Error in data pivoting:", e$message, "\n") - stop("Cannot proceed without proper data structure") -}) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique participants:", length(unique(long_data$pID)), "\n") -cat("TIME levels:", paste(levels(long_data$TIME), collapse = ", "), "\n") -cat("DOMAIN levels:", paste(levels(long_data$DOMAIN), collapse = ", "), "\n") - -# Check data types -cat("\nData types check:\n") -cat("TIME is factor:", is.factor(long_data$TIME), "\n") -cat("DOMAIN is factor:", is.factor(long_data$DOMAIN), "\n") -cat("pID is factor:", is.factor(long_data$pID), "\n") -cat("MEAN_DIFFERENCE is numeric:", is.numeric(long_data$MEAN_DIFFERENCE), "\n") - -# Show first 20 rows -cat("\nFirst 20 rows of long_data:\n") -print(utils::head(long_data, 20)) - -# Display structure and sample -cat("\nLong data structure:\n") -str(long_data) - -cat("\nFirst 10 rows of long_data:\n") -print(utils::head(long_data, 10)) - -# Show example data for one participant -cat("\nExample: Participant 1 across all domains and times:\n") -participant_1_data <- long_data[long_data$pID == 1, c("pID", "GROUP", "TEMPORAL_DO", "ITEM_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - diff --git a/.history/mixed anova - domain means_20250912130809.r b/.history/mixed anova - domain means_20250912130809.r deleted file mode 100644 index 2424ed0..0000000 --- a/.history/mixed anova - domain means_20250912130809.r +++ /dev/null @@ -1,124 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests 
-library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - cat("Warning: Variable", var_name, "not found in data\n") - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "GROUP", "TEMPORAL_DO", "ITEM_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$GROUP <- as.factor(long_data$GROUP) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - long_data$ITEM_DO <- as.factor(long_data$ITEM_DO) - - return(long_data) -} - -# Pivot data to long format -cat("\nPivoting data to long format...\n") -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) - cat("Data pivoting completed successfully.\n") -}, error = function(e) { - cat("Error in data pivoting:", e$message, "\n") - stop("Cannot proceed without proper data structure") -}) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique participants:", length(unique(long_data$pID)), "\n") -cat("TIME levels:", paste(levels(long_data$TIME), collapse = ", "), "\n") -cat("DOMAIN levels:", paste(levels(long_data$DOMAIN), collapse = ", "), "\n") - -# Check data types -cat("\nData types check:\n") -cat("TIME is factor:", is.factor(long_data$TIME), "\n") -cat("DOMAIN is factor:", is.factor(long_data$DOMAIN), "\n") -cat("pID is factor:", is.factor(long_data$pID), "\n") -cat("MEAN_DIFFERENCE is numeric:", is.numeric(long_data$MEAN_DIFFERENCE), "\n") - -# Show first 20 
rows -cat("\nFirst 20 rows of long_data:\n") -print(utils::head(long_data, 20)) - -# Display structure and sample -cat("\nLong data structure:\n") -str(long_data) - -cat("\nFirst 10 rows of long_data:\n") -print(utils::head(long_data, 10)) - -# Show example data for one participant -cat("\nExample: Participant 1 across all domains and times:\n") -participant_1_data <- long_data[long_data$pID == 1, c("pID", "GROUP", "TEMPORAL_DO", "ITEM_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - diff --git a/.history/mixed anova - domain means_20250912130812.r b/.history/mixed anova - domain means_20250912130812.r deleted file mode 100644 index da4e922..0000000 --- a/.history/mixed anova - domain means_20250912130812.r +++ /dev/null @@ -1,124 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "GROUP", "TEMPORAL_DO", "ITEM_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- 
as.factor(long_data$pID) - long_data$GROUP <- as.factor(long_data$GROUP) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - long_data$ITEM_DO <- as.factor(long_data$ITEM_DO) - - return(long_data) -} - -# Pivot data to long format -cat("\nPivoting data to long format...\n") -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) - cat("Data pivoting completed successfully.\n") -}, error = function(e) { - cat("Error in data pivoting:", e$message, "\n") - stop("Cannot proceed without proper data structure") -}) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique participants:", length(unique(long_data$pID)), "\n") -cat("TIME levels:", paste(levels(long_data$TIME), collapse = ", "), "\n") -cat("DOMAIN levels:", paste(levels(long_data$DOMAIN), collapse = ", "), "\n") - -# Check data types -cat("\nData types check:\n") -cat("TIME is factor:", is.factor(long_data$TIME), "\n") -cat("DOMAIN is factor:", is.factor(long_data$DOMAIN), "\n") -cat("pID is factor:", is.factor(long_data$pID), "\n") -cat("MEAN_DIFFERENCE is numeric:", is.numeric(long_data$MEAN_DIFFERENCE), "\n") - -# Show first 20 rows -cat("\nFirst 20 rows of long_data:\n") -print(utils::head(long_data, 20)) - -# Display structure and sample -cat("\nLong data structure:\n") -str(long_data) - -cat("\nFirst 10 rows of long_data:\n") -print(utils::head(long_data, 10)) - -# Show example data for one participant -cat("\nExample: Participant 1 across all domains and times:\n") -participant_1_data <- long_data[long_data$pID == 1, c("pID", "GROUP", "TEMPORAL_DO", "ITEM_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - diff --git a/.history/mixed anova - domain means_20250912130822.r b/.history/mixed anova - domain means_20250912130822.r deleted file mode 100644 index 6cc89f1..0000000 --- a/.history/mixed anova - domain means_20250912130822.r +++ /dev/null @@ -1,117 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", 
"Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "GROUP", "TEMPORAL_DO", "ITEM_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$GROUP <- as.factor(long_data$GROUP) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - long_data$ITEM_DO <- as.factor(long_data$ITEM_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "GROUP", "TEMPORAL_DO", "ITEM_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - diff --git a/.history/mixed anova - domain means_20250912130828.r b/.history/mixed anova - domain means_20250912130828.r deleted file mode 100644 index 6cc89f1..0000000 --- a/.history/mixed anova - domain means_20250912130828.r +++ /dev/null @@ -1,117 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") 
- -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "GROUP", "TEMPORAL_DO", "ITEM_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$GROUP <- as.factor(long_data$GROUP) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - long_data$ITEM_DO <- as.factor(long_data$ITEM_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "GROUP", "TEMPORAL_DO", "ITEM_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - diff --git a/.history/mixed anova - domain means_20250912130829.r b/.history/mixed anova - domain means_20250912130829.r deleted file mode 100644 index 6cc89f1..0000000 --- a/.history/mixed anova - domain means_20250912130829.r +++ /dev/null @@ -1,117 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Read the data -data <- 
read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "GROUP", "TEMPORAL_DO", "ITEM_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$GROUP <- as.factor(long_data$GROUP) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - long_data$ITEM_DO <- as.factor(long_data$ITEM_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "GROUP", "TEMPORAL_DO", "ITEM_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - diff --git a/.history/mixed anova - domain means_20250912144754.r b/.history/mixed anova - domain means_20250912144754.r deleted file mode 100644 
index 49e89a3..0000000 --- a/.history/mixed anova - domain means_20250912144754.r +++ /dev/null @@ -1,115 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows 
-print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - diff --git a/.history/mixed anova - domain means_20250912152948.r b/.history/mixed anova - domain means_20250912152948.r deleted file mode 100644 index 36c68c0..0000000 --- a/.history/mixed anova - domain means_20250912152948.r +++ /dev/null @@ -1,272 +0,0 @@ -# Mixed ANOVA Analysis for Domain Means -# EOHI Experiment Data Analysis - Domain Level Analysis -# Variables: NPast_mean_pref, NPast_mean_pers, NPast_mean_val, NPast_mean_life -# NFut_mean_pref, NFut_mean_pers, NFut_mean_val, NFut_mean_life - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests -library(ggplot2) # For plotting -library(emmeans) # For post-hoc comparisons - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -print(dim(data)) -print(length(unique(data$pID))) - -# Check experimental conditions -print(table(data$GROUP, data$TEMPORAL_DO, data$ITEM_DO)) - -# Check what domain mean columns are available -domain_mean_cols <- colnames(data)[grepl("mean_(pref|pers|val|life)", colnames(data))] -print(domain_mean_cols) - -# Verify the specific variables we need -required_vars <- c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life") - -missing_vars <- required_vars[!required_vars %in% colnames(data)] -if (length(missing_vars) > 0) { - print(paste("Warning: Missing variables:", paste(missing_vars, collapse = ", "))) -} else { - print("All required domain mean variables found!") -} - -# Define domain mapping -domain_mapping <- data.frame( - variable = c("NPast_mean_pref", "NPast_mean_pers", "NPast_mean_val", "NPast_mean_life", - "NFut_mean_pref", "NFut_mean_pers", "NFut_mean_val", "NFut_mean_life"), - time = c(rep("Past", 4), rep("Future", 4)), - domain = rep(c("Preferences", "Personality", "Values", "Life"), 2), - stringsAsFactors = FALSE -) - -print(domain_mapping) - -# Function to pivot data to long format -pivot_domain_means <- function(data, domain_mapping) { - long_data <- data.frame() - - for (i in 1:nrow(domain_mapping)) { - var_name <- domain_mapping$variable[i] - time_level <- domain_mapping$time[i] - domain_level <- domain_mapping$domain[i] - - # Check if variable exists - if (!var_name %in% colnames(data)) { - print(paste("Warning: Variable", var_name, "not found in data")) - next - } - - # Create subset for this variable - subset_data <- data[, c("pID", "ResponseId", "TEMPORAL_DO", var_name)] - subset_data$TIME <- time_level - subset_data$DOMAIN <- domain_level - subset_data$MEAN_DIFFERENCE <- subset_data[[var_name]] - subset_data[[var_name]] <- NULL # Remove original column - - # Add to long data - long_data <- rbind(long_data, subset_data) - } - - # Convert to factors with proper levels - long_data$TIME <- factor(long_data$TIME, levels = c("Past", "Future")) - long_data$DOMAIN <- factor(long_data$DOMAIN, levels = c("Preferences", "Personality", "Values", "Life")) - long_data$pID <- as.factor(long_data$pID) - long_data$TEMPORAL_DO <- as.factor(long_data$TEMPORAL_DO) - - return(long_data) -} - -# Pivot data to long format -tryCatch({ - long_data <- pivot_domain_means(data, domain_mapping) -}, error = 
function(e) { - print(paste("Error in data pivoting:", e$message)) - stop("Cannot proceed without proper data structure") -}) - -print(dim(long_data)) -print(length(unique(long_data$pID))) -print(levels(long_data$TIME)) -print(levels(long_data$DOMAIN)) - -# Check data types -print(is.factor(long_data$TIME)) -print(is.factor(long_data$DOMAIN)) -print(is.factor(long_data$pID)) -print(is.numeric(long_data$MEAN_DIFFERENCE)) - -# Show first 20 rows -print(utils::head(long_data, 20)) - -# Display structure and sample -str(long_data) - -print(utils::head(long_data, 10)) - -# Show example data for one participant -participant_1_data <- long_data[long_data$pID == 1, c("pID", "TEMPORAL_DO", "TIME", "DOMAIN", "MEAN_DIFFERENCE")] -print(participant_1_data) - -# ============================================================================= -# DESCRIPTIVE STATISTICS -# ============================================================================= - -# Overall descriptive statistics by TIME and DOMAIN -desc_stats <- long_data %>% - group_by(TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - median = round(median(MEAN_DIFFERENCE, na.rm = TRUE), 5), - q1 = round(quantile(MEAN_DIFFERENCE, 0.25, na.rm = TRUE), 5), - q3 = round(quantile(MEAN_DIFFERENCE, 0.75, na.rm = TRUE), 5), - min = round(min(MEAN_DIFFERENCE, na.rm = TRUE), 5), - max = round(max(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TIME and DOMAIN:") -print(desc_stats) - -# Descriptive statistics by TEMPORAL_DO (the only between-subjects factor retained by this pivot) -desc_stats_by_group <- long_data %>% - group_by(TEMPORAL_DO, TIME, DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Descriptive statistics by TEMPORAL_DO, TIME, and DOMAIN:") -print(desc_stats_by_group) - -# Overall means by TIME (collapsed across domains) -time_means <- long_data %>% - group_by(TIME) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Overall means by TIME (collapsed across domains):") -print(time_means) - -# Overall means by DOMAIN (collapsed across time) -domain_means <- long_data %>% - group_by(DOMAIN) %>% - summarise( - n = n(), - mean = round(mean(MEAN_DIFFERENCE, na.rm = TRUE), 5), - variance = round(var(MEAN_DIFFERENCE, na.rm = TRUE), 5), - sd = round(sd(MEAN_DIFFERENCE, na.rm = TRUE), 5), - .groups = 'drop' - ) - -print("Overall means by DOMAIN (collapsed across time):") -print(domain_means) - -# ============================================================================= -# ASSUMPTION TESTING -# ============================================================================= - -# Remove missing values for assumption testing -long_data_clean <- long_data[!is.na(long_data$MEAN_DIFFERENCE), ] -print(paste("Data after removing missing values:", dim(long_data_clean))) - -# 1. 
-missing_summary <- long_data %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(MEAN_DIFFERENCE)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-print("Missing values by TIME and DOMAIN:")
-print(missing_summary)
-
-# 2. Outlier detection
-outlier_summary <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    mean = mean(MEAN_DIFFERENCE),
-    sd = sd(MEAN_DIFFERENCE),
-    q1 = quantile(MEAN_DIFFERENCE, 0.25),
-    q3 = quantile(MEAN_DIFFERENCE, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(MEAN_DIFFERENCE < lower_bound | MEAN_DIFFERENCE > upper_bound),
-    .groups = 'drop'
-  )
-
-print("Outlier summary (IQR method):")
-print(outlier_summary)
-
-# 3. Normality tests
-normality_results <- long_data_clean %>%
-  group_by(TIME, DOMAIN) %>%
-  summarise(
-    n = n(),
-    shapiro_p = ifelse(n >= 3 & n <= 5000,
-                       shapiro.test(MEAN_DIFFERENCE)$p.value,
-                       NA),
-    anderson_p = ifelse(n >= 7,
-                        ad.test(MEAN_DIFFERENCE)$p.value,
-                        NA),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    shapiro_normal = shapiro_p > 0.05,
-    anderson_normal = anderson_p > 0.05,
-    overall_normal = case_when(
-      !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal,
-      !is.na(shapiro_p) ~ shapiro_normal,
-      !is.na(anderson_p) ~ anderson_normal,
-      TRUE ~ NA
-    )
-  )
-
-print("Normality test results:")
-print(normality_results)
-
-# 4. Homogeneity of variance (Levene's test)
-# Test homogeneity across TIME within each DOMAIN
-homogeneity_time <- long_data_clean %>%
-  group_by(DOMAIN) %>%
-  summarise(
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ TIME)$`Pr(>F)`[1],
-    homogeneous = levene_p > 0.05,
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across TIME within each DOMAIN:")
-print(homogeneity_time)
-
-# Test homogeneity across DOMAIN within each TIME
-homogeneity_domain <- long_data_clean %>%
-  group_by(TIME) %>%
-  summarise(
-    levene_p = leveneTest(MEAN_DIFFERENCE ~ DOMAIN)$`Pr(>F)`[1],
-    homogeneous = levene_p > 0.05,
-    .groups = 'drop'
-  )
-
-print("Homogeneity of variance across DOMAIN within each TIME:")
-print(homogeneity_domain)
-
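The rbind() loop in the deleted script above builds the long format block by block; the same reshape can be expressed with a single tidyr::pivot_longer() call. A minimal sketch, assuming the same eight NPast_/NFut_ mean columns and factor levels as in that script (the domain_labels lookup is an illustrative helper, not from the original):

library(tidyverse)

domain_labels <- c(pref = "Preferences", pers = "Personality",
                   val = "Values", life = "Life")

long_data <- data %>%
  select(pID, ResponseId, GROUP, TEMPORAL_DO, matches("^N(Past|Fut)_mean_")) %>%
  pivot_longer(
    cols = matches("^N(Past|Fut)_mean_"),
    names_pattern = "N(Past|Fut)_mean_(.*)",   # split into TIME and DOMAIN parts
    names_to = c("TIME", "DOMAIN"),
    values_to = "MEAN_DIFFERENCE"
  ) %>%
  mutate(
    TIME = factor(ifelse(TIME == "Past", "Past", "Future"),
                  levels = c("Past", "Future")),
    DOMAIN = factor(domain_labels[DOMAIN],
                    levels = c("Preferences", "Personality", "Values", "Life")),
    pID = as.factor(pID),
    TEMPORAL_DO = as.factor(TEMPORAL_DO)
  )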
diff --git a/.history/mixed anova - ind item_20250912123133.r b/.history/mixed anova - ind item_20250912123133.r
deleted file mode 100644
index e69de29..0000000
diff --git a/.history/mixed anova - ind item_20250912123134.r b/.history/mixed anova - ind item_20250912123134.r
deleted file mode 100644
index 25b836c..0000000
--- a/.history/mixed anova - ind item_20250912123134.r
+++ /dev/null
@@ -1,397 +0,0 @@
-# Mixed ANOVA Analysis for Past vs Future Differences
-# EOHI Experiment Data Analysis
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(nortest)  # For normality tests
-
-# Read the data
-data <- read.csv("eohi1/exp1.csv")
-
-# Display basic information about the dataset
-cat("Dataset dimensions:", dim(data), "\n")
-cat("Number of participants:", length(unique(data$pID)), "\n")
-
-# Check experimental conditions
-cat("\nExperimental conditions:\n")
-table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO)
-
-# Check what columns are actually available
-npast_cols <- colnames(data)[grepl("^NPast", colnames(data))]
-nfut_cols <- colnames(data)[grepl("^NFut", colnames(data))]
-
-cat("NPast columns found:\n")
-print(npast_cols)
-
-cat("\nNFut columns found:\n")
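The column listing here relies on every NPastDiff_ column having an NFutDiff_ counterpart with the same domain suffix. A quick symmetry check under that naming assumption (not part of the original script) flags domains present on only one side before any reshaping is attempted:

# Strip the prefixes and compare the remaining domain suffixes
past_doms <- sub("^NPastDiff_", "", grep("^NPastDiff_", colnames(data), value = TRUE))
fut_doms  <- sub("^NFutDiff_",  "", grep("^NFutDiff_",  colnames(data), value = TRUE))

# character(0) from both calls means every domain has a matched pair
print(setdiff(past_doms, fut_doms))
print(setdiff(fut_doms, past_doms))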
-print(nfut_cols)
-
-cat("\nTotal NPast columns:", length(npast_cols), "\n")
-cat("Total NFut columns:", length(nfut_cols), "\n")
-
-# STEP 1: PROPER DATA RESHAPING
-# Define domains with their categories
-domain_info <- data.frame(
-  domain = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel",
-             "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex",
-             "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice",
-             "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"),
-  domain_type = c(rep("Preferences", 5),
-                  rep("Personality", 5),
-                  rep("Values", 5),
-                  rep("Life_Satisfaction", 5)),
-  stringsAsFactors = FALSE
-)
-
-# Display domain_info
-cat("\nDomain Information:\n")
-print(domain_info)
-cat("\nDomain type summary:\n")
-print(table(domain_info$domain_type))
-
-# Function to reshape ALL domains at once with domain information
-reshape_all_domains <- function(data, domain_info) {
-  all_long_data <- data.frame()
-
-  for (i in 1:nrow(domain_info)) {
-    domain_name <- domain_info$domain[i]
-    domain_type <- domain_info$domain_type[i]
-
-    past_col <- paste0("NPastDiff_", domain_name)
-    fut_col <- paste0("NFutDiff_", domain_name)
-
-    # Check if columns exist
-    if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) {
-      cat("Warning: Columns", past_col, "or", fut_col, "not found\n")
-      next
-    }
-
-    # Create long format data for this domain - using base R approach to avoid issues
-    # Past data
-    past_data <- data[, c("pID", "ResponseId", "GROUP", "TASK_DO", "TEMPORAL_DO", "ITEM_DO", "COC_DO",
-                          "demo_sex", "demo_age_1", "AOT_total", "CRT_correct", past_col)]
-    past_data$TimePerspective <- "Past"
-    past_data$Difference <- past_data[[past_col]]
-    past_data$Domain_Type <- domain_type
-    past_data$Domain_Item <- domain_name
-    past_data[[past_col]] <- NULL  # Remove the original column
-
-    # Future data
-    fut_data <- data[, c("pID", "ResponseId", "GROUP", "TASK_DO", "TEMPORAL_DO", "ITEM_DO", "COC_DO",
-                         "demo_sex", "demo_age_1", "AOT_total", "CRT_correct", fut_col)]
-    fut_data$TimePerspective <- "Future"
-    fut_data$Difference <- fut_data[[fut_col]]
-    fut_data$Domain_Type <- domain_type
-    fut_data$Domain_Item <- domain_name
-    fut_data[[fut_col]] <- NULL  # Remove the original column
-
-    # Combine past and future data for this domain
-    domain_long_data <- rbind(past_data, fut_data)
-    all_long_data <- rbind(all_long_data, domain_long_data)
-  }
-
-  # Convert to factors with proper levels
-  all_long_data$TimePerspective <- factor(all_long_data$TimePerspective, levels = c("Past", "Future"))
-  all_long_data$Domain_Type <- factor(all_long_data$Domain_Type, levels = c("Preferences", "Personality", "Values", "Life_Satisfaction"))
-  all_long_data$Domain_Item <- factor(all_long_data$Domain_Item,
-                                      levels = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel",
-                                                 "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex",
-                                                 "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice",
-                                                 "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"))
-  all_long_data$pID <- as.factor(all_long_data$pID)
-
-  return(all_long_data)
-}
-
-# Reshape all data to long format
-cat("\nReshaping data to long format...\n")
-long_data <- reshape_all_domains(data, domain_info)
-
-cat("Long format data dimensions:", dim(long_data), "\n")
-cat("Unique domains:", length(unique(long_data$Domain_Item)), "\n")
-cat("Domain types:", paste(unique(long_data$Domain_Type), collapse = ", "), "\n")
-cat("Domain type counts:\n")
-print(table(long_data$Domain_Type))
-
-# Display structure and sample of long_data
-cat("\nLong data structure:\n")
-str(long_data)
-
-cat("\nFirst 10 rows of long_data:\n")
-print(head(long_data, 10))
-
-cat("\nColumn names:\n")
-print(colnames(long_data))
-
-# Show factor levels for domain variables
-cat("\nDomain_Type factor levels:\n")
-print(levels(long_data$Domain_Type))
-
-cat("\nDomain_Item factor levels:\n")
-print(levels(long_data$Domain_Item))
-
-cat("\nTimePerspective factor levels:\n")
-print(levels(long_data$TimePerspective))
-
-# Show a sample with actual names instead of numbers
-cat("\nSample data with actual names (first 6 rows):\n")
-sample_data <- long_data[1:6, c("pID", "GROUP", "TASK_DO", "TEMPORAL_DO", "Domain_Type", "Domain_Item", "TimePerspective", "Difference")]
-print(sample_data)
-
-# Show a better example - one participant across multiple domains
-cat("\nExample: Participant 1 across multiple domains (first 10 rows):\n")
-participant_1_data <- long_data[long_data$pID == 1, c("pID", "GROUP", "TASK_DO", "TEMPORAL_DO", "Domain_Type", "Domain_Item", "TimePerspective", "Difference")]
-print(participant_1_data)
-
-# Show structure explanation
-cat("\nLong format explanation:\n")
-cat("- Each participant appears", length(unique(long_data$Domain_Item)) * 2, "times total\n")
-cat("- (", length(unique(long_data$Domain_Item)), "domains × 2 time perspectives)\n")
-cat("- Total rows per participant:", length(unique(long_data$Domain_Item)) * 2, "\n")
-cat("- Total participants:", length(unique(long_data$pID)), "\n")
-cat("- Expected total rows:", length(unique(long_data$pID)) * length(unique(long_data$Domain_Item)) * 2, "\n")
-cat("- Actual total rows:", nrow(long_data), "\n")
-
-# STEP 2: ASSUMPTION CHECKING
-
-cat("STEP 2: CHECKING ASSUMPTIONS\n")
-
-head(long_data)
-# 2.1 Check for missing values
-missing_summary <- long_data %>%
-  group_by(Domain_Type, Domain_Item, TimePerspective) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(Difference)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-cat("\nMissing values by domain and time perspective:\n")
-print(missing_summary)
-
-# Remove missing values
-long_data_clean <- long_data[!is.na(long_data$Difference), ]
-cat("\nData after removing missing values:", dim(long_data_clean), "\n")
-
-# 2.2 Outlier detection
-cat("\nChecking for outliers...\n")
-outlier_summary <- long_data_clean %>%
-  group_by(Domain_Type, Domain_Item, TimePerspective) %>%
-  summarise(
-    n = n(),
-    mean = mean(Difference),
-    sd = sd(Difference),
-    q1 = quantile(Difference, 0.25),
-    q3 = quantile(Difference, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(Difference < lower_bound | Difference > upper_bound),
-    .groups = 'drop'
-  )
-
-cat("Outlier summary (IQR method):\n")
-print(outlier_summary)
-
-# 2.3 Normality tests
-cat("\nTesting normality...\n")
-normality_results <- long_data_clean %>%
-  group_by(Domain_Type, Domain_Item, TimePerspective) %>%
-  summarise(
-    n = n(),
-    shapiro_p = ifelse(n >= 3 & n <= 5000,
-                       shapiro.test(Difference)$p.value,
-                       NA),
-    anderson_p = ifelse(n >= 7,
-                        ad.test(Difference)$p.value,
-                        NA),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    shapiro_normal = shapiro_p > 0.05,
-    anderson_normal = anderson_p > 0.05,
-    overall_normal = case_when(
-      !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal,
-      !is.na(shapiro_p) ~ shapiro_normal,
-      !is.na(anderson_p) ~ anderson_normal,
-      TRUE ~ NA
-    )
-  )
-
-cat("Normality test results:\n")
-print(normality_results)
-
-# 2.4 Homogeneity of variance (Levene's test)
-cat("\nTesting homogeneity of variance...\n")
-homogeneity_results <- long_data_clean %>%
-  group_by(Domain_Type, Domain_Item) %>%
-  summarise(
-    levene_p = leveneTest(Difference ~ TimePerspective)$`Pr(>F)`[1],
-    homogeneous = levene_p > 0.05,
-    .groups = 'drop'
-  )
-
-cat("Homogeneity of variance results:\n")
-print(homogeneity_results)
-
-# STEP 3: DESCRIPTIVE STATISTICS
-cat("\n", paste(rep("=", 80), collapse = ""), "\n")
-cat("STEP 3: DESCRIPTIVE STATISTICS\n")
-cat(paste(rep("=", 80), collapse = ""), "\n")
-
-desc_stats <- long_data_clean %>%
-  group_by(Domain_Type, Domain_Item, TimePerspective) %>%
-  summarise(
-    n = n(),
-    mean = mean(Difference),
-    sd = sd(Difference),
-    median = median(Difference),
-    q1 = quantile(Difference, 0.25),
-    q3 = quantile(Difference, 0.75),
-    min = min(Difference),
-    max = max(Difference),
-    .groups = 'drop'
-  )
-
-cat("Descriptive statistics:\n")
-print(desc_stats)
-
-# STEP 4: MIXED ANOVA ANALYSES
-cat("\n", paste(rep("=", 80), collapse = ""), "\n")
-cat("STEP 4: MIXED ANOVA ANALYSES\n")
-cat(paste(rep("=", 80), collapse = ""), "\n")
-
-# 4.1 Overall analysis across all domains
-cat("\n4.1 Overall Mixed ANOVA (all domains combined):\n")
-cat(paste(rep("-", 50), collapse = ""), "\n")
-
-tryCatch({
-  overall_anova <- ezANOVA(
-    data = long_data_clean,
-    dv = Difference,
-    wid = pID,
-    within = c(TimePerspective, Domain_Type),
-    between = c(GROUP, TASK_DO),
-    type = 3,
-    detailed = TRUE,
-    return_aov = TRUE
-  )
-
-  cat("Overall ANOVA Results:\n")
-  print(overall_anova)
-
-  # Check sphericity
-  if (!is.null(overall_anova$`Mauchly's Test for Sphericity`)) {
-    cat("\nSphericity test results:\n")
-    print(overall_anova$`Mauchly's Test for Sphericity`)
-  }
-
-}, error = function(e) {
-  cat("Error in overall ANOVA:", e$message, "\n")
-})
-
-# 4.2 Domain-specific analyses
-cat("\n4.2 Domain-specific Mixed ANOVAs:\n")
-cat(paste(rep("-", 50), collapse = ""), "\n")
-
-domain_results <- list()
-
-for (domain_type in unique(long_data_clean$Domain_Type)) {
-  cat("\nAnalyzing domain type:", domain_type, "\n")
-
-  domain_data <- long_data_clean[long_data_clean$Domain_Type == domain_type, ]
-
-  tryCatch({
-    domain_anova <- ezANOVA(
-      data = domain_data,
-      dv = Difference,
-      wid = pID,
-      within = c(TimePerspective, Domain_Item),
-      between = c(GROUP, TASK_DO),
-      type = 3,
-      detailed = TRUE
-    )
-
-    cat("ANOVA results for", domain_type, ":\n")
-    print(domain_anova)
-
-    domain_results[[domain_type]] <- domain_anova
-
-  }, error = function(e) {
-    cat("Error in ANOVA for", domain_type, ":", e$message, "\n")
-
-    # Fallback to simpler analysis
-    cat("Attempting simpler repeated measures ANOVA...\n")
-    tryCatch({
-      simple_anova <- ezANOVA(
-        data = domain_data,
-        dv = Difference,
-        wid = pID,
-        within = TimePerspective,
-        between = c(GROUP, TASK_DO),
-        type = 3,
-        detailed = TRUE
-      )
-      print(simple_anova)
-      domain_results[[domain_type]] <- simple_anova
-    }, error = function(e2) {
-      cat("Simple ANOVA also failed:", e2$message, "\n")
-    })
-  })
-}
-
-# 4.3 Individual domain item analyses
-cat("\n4.3 Individual Domain Item Analyses:\n")
-cat(paste(rep("-", 50), collapse = ""), "\n")
-
-item_results <- list()
-
-for (domain_item in unique(long_data_clean$Domain_Item)) {
-  cat("\nAnalyzing individual item:", domain_item, "\n")
-
-  item_data <- long_data_clean[long_data_clean$Domain_Item == domain_item, ]
-
-  tryCatch({
-    item_anova <- ezANOVA(
-      data = item_data,
-      dv = Difference,
-      wid = pID,
-      within = TimePerspective,
-      between = c(GROUP, TASK_DO),
-      type = 3,
-      detailed = TRUE
-    )
-
-    cat("ANOVA results for", domain_item, ":\n")
-    print(item_anova)
-
-    item_results[[domain_item]] <- item_anova
-
-  }, error = function(e) {
-    cat("Error in ANOVA for", domain_item, ":", e$message, "\n")
-
-    # Fallback to paired t-test
-    past_vals <- item_data$Difference[item_data$TimePerspective == "Past"]
-    fut_vals <- item_data$Difference[item_data$TimePerspective == "Future"]
-
-    if (length(past_vals) > 1 && length(fut_vals) > 1) {
-      t_test <- t.test(past_vals, fut_vals, paired = TRUE)
-      cat("Fallback paired t-test for", domain_item, ":\n")
-      cat("t =", round(t_test$statistic, 3),
-          ", df =", t_test$parameter,
-          ", p =", round(t_test$p.value, 5), "\n")
-
-      item_results[[domain_item]] <- t_test
-    }
-  })
-}
-
-cat("\n", paste(rep("=", 80), collapse = ""), "\n")
-cat("ANALYSIS COMPLETE!\n")
-cat(paste(rep("=", 80), collapse = ""), "\n")
-cat("Summary:\n")
-cat("- Total domains analyzed:", length(unique(long_data_clean$Domain_Item)), "\n")
-cat("- Domain types analyzed:", length(unique(long_data_clean$Domain_Type)), "\n")
-cat("- Individual item analyses completed:", length(item_results), "\n")
diff --git a/.history/mixed anova_20250912110917.r b/.history/mixed anova_20250912110917.r
deleted file mode 100644
index e194b3a..0000000
--- a/.history/mixed anova_20250912110917.r
+++ /dev/null
@@ -1,283 +0,0 @@
-# Mixed ANOVA Analysis for Past vs Future Differences
-# EOHI Experiment Data Analysis
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(afex)
-library(emmeans)
-library(ggplot2)
-library(car)
-
-# Read the data
-data <- read.csv("eohi1/exp1.csv")
-
-# Display basic information about the dataset
-cat("Dataset dimensions:", dim(data), "\n")
-cat("Number of participants:", length(unique(data$pID)), "\n")
-
-# Check experimental conditions
-cat("\nExperimental conditions:\n")
-table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO)
-
-# Prepare data for mixed ANOVA
-# We'll reshape the data to have Past and Future as separate rows for each participant
-
-# Create a function to reshape data for a specific domain
-reshape_domain_data <- function(data, domain_name) {
-  past_col <- paste0("NPastDiff_", domain_name)
-  fut_col <- paste0("NFutDiff_", domain_name)
-
-  # Check if columns exist
-  if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) {
-    cat("Warning: Columns", past_col, "or", fut_col, "not found\n")
-    return(NULL)
-  }
-
-  # Create long format data
-  past_data <- data %>%
-    select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO,
-           demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(past_col)) %>%
-    mutate(TimePerspective = "Past",
-           Difference = .data[[past_col]]) %>%
-    select(-all_of(past_col))
-
-  fut_data <- data %>%
-    select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO,
-           demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(fut_col)) %>%
-    mutate(TimePerspective = "Future",
-           Difference = .data[[fut_col]]) %>%
-    select(-all_of(fut_col))
-
-  combined_data <- rbind(past_data, fut_data) %>%
-    mutate(TimePerspective = as.factor(TimePerspective),
-           pID = as.factor(pID))
-
-  return(combined_data)
-}
-
-# Define domains to analyze
-domains <- c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel",
-             "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex",
-             "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice",
"life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change") - -# Run mixed ANOVA for each domain -results_list <- list() - -for (domain in domains) { - cat("\n", "="*60, "\n") - cat("ANALYZING DOMAIN:", toupper(domain), "\n") - cat("="*60, "\n") - - # Reshape data for this domain - domain_data <- reshape_domain_data(data, domain) - - if (is.null(domain_data)) { - next - } - - # Check for missing values - missing_count <- sum(is.na(domain_data$Difference)) - if (missing_count > 0) { - cat("Warning:", missing_count, "missing values found for", domain, "\n") - domain_data <- domain_data[!is.na(domain_data$Difference), ] - } - - # Descriptive statistics - cat("\nDescriptive Statistics:\n") - desc_stats <- domain_data %>% - group_by(TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference, na.rm = TRUE), - sd = sd(Difference, na.rm = TRUE), - median = median(Difference, na.rm = TRUE), - .groups = 'drop' - ) - print(desc_stats) - - # Effect size (Cohen's d) - past_diff <- domain_data$Difference[domain_data$TimePerspective == "Past"] - fut_diff <- domain_data$Difference[domain_data$TimePerspective == "Future"] - - pooled_sd <- sqrt(((length(past_diff) - 1) * var(past_diff) + - (length(fut_diff) - 1) * var(fut_diff)) / - (length(past_diff) + length(fut_diff) - 2)) - - cohens_d <- (mean(past_diff) - mean(fut_diff)) / pooled_sd - - cat("\nCohen's d (Past vs Future):", round(cohens_d, 5), "\n") - - # Mixed ANOVA using ezANOVA - tryCatch({ - # Simple repeated measures ANOVA (TimePerspective as within-subjects) - anova_result <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("\nMixed ANOVA Results:\n") - print(anova_result) - - # Store results - results_list[[domain]] <- list( - descriptive = desc_stats, - cohens_d = cohens_d, - anova = anova_result - ) - - # Post-hoc comparisons if significant - if (!is.null(anova_result$ANOVA)) { - if (any(anova_result$ANOVA$p < 0.05, na.rm = TRUE)) { - cat("\nSignificant effects found! 
Post-hoc analysis:\n") - - # Pairwise comparisons for TimePerspective - if ("TimePerspective" %in% anova_result$ANOVA$Effect && - anova_result$ANOVA$p[anova_result$ANOVA$Effect == "TimePerspective"] < 0.05) { - - # Simple t-test for comparison - t_test_result <- t.test(past_diff, fut_diff, paired = TRUE) - cat("\nPaired t-test (Past vs Future):\n") - cat("t =", round(t_test_result$statistic, 3), - ", df =", t_test_result$parameter, - ", p =", round(t_test_result$p.value, 5), "\n") - } - } - } - - }, error = function(e) { - cat("Error in ANOVA for", domain, ":", e$message, "\n") - - # Fallback to simple paired t-test - t_test_result <- t.test(past_diff, fut_diff, paired = TRUE) - cat("\nFallback: Paired t-test (Past vs Future):\n") - cat("t =", round(t_test_result$statistic, 3), - ", df =", t_test_result$parameter, - ", p =", round(t_test_result$p.value, 5), "\n") - - results_list[[domain]] <- list( - descriptive = desc_stats, - cohens_d = cohens_d, - t_test = t_test_result - ) - }) -} - -# Summary of all results -cat("\n", "="*80, "\n") -cat("SUMMARY OF ALL DOMAINS\n") -cat("="*80, "\n") - -summary_df <- data.frame( - Domain = character(), - Past_Mean = numeric(), - Future_Mean = numeric(), - Cohen_d = numeric(), - Significant = logical(), - stringsAsFactors = FALSE -) - -for (domain in names(results_list)) { - result <- results_list[[domain]] - - past_mean <- result$descriptive$mean[result$descriptive$TimePerspective == "Past"] - fut_mean <- result$descriptive$mean[result$descriptive$TimePerspective == "Future"] - cohens_d <- result$cohens_d - - # Check if significant (p < 0.05) - significant <- FALSE - if (!is.null(result$anova) && !is.null(result$anova$ANOVA)) { - if ("TimePerspective" %in% result$anova$ANOVA$Effect) { - p_val <- result$anova$ANOVA$p[result$anova$ANOVA$Effect == "TimePerspective"] - significant <- !is.na(p_val) && p_val < 0.05 - } - } else if (!is.null(result$t_test)) { - significant <- result$t_test$p.value < 0.05 - } - - summary_df <- rbind(summary_df, data.frame( - Domain = domain, - Past_Mean = round(past_mean, 3), - Future_Mean = round(fut_mean, 3), - Cohen_d = round(cohens_d, 5), - Significant = significant - )) -} - -# Sort by effect size (absolute value) -summary_df <- summary_df[order(abs(summary_df$Cohen_d), decreasing = TRUE), ] - -print(summary_df) - -# Create visualization -library(ggplot2) - -# Prepare data for plotting -plot_data <- summary_df %>% - mutate( - Effect_Size = abs(Cohen_d), - Direction = ifelse(Cohen_d > 0, "Past > Future", "Future > Past"), - Domain_Type = case_when( - grepl("pref_", Domain) ~ "Preferences", - grepl("pers_", Domain) ~ "Personality", - grepl("val_", Domain) ~ "Values", - grepl("life_", Domain) ~ "Life Satisfaction", - TRUE ~ "Other" - ) - ) - -# Effect size plot -p1 <- ggplot(plot_data, aes(x = reorder(Domain, Effect_Size), y = Effect_Size, - fill = Direction, alpha = Significant)) + - geom_col() + - coord_flip() + - scale_alpha_manual(values = c(0.5, 1), name = "Significant\n(p < 0.05)") + - scale_fill_manual(values = c("Past > Future" = "#E74C3C", "Future > Past" = "#3498DB")) + - labs( - title = "Effect Sizes: Past vs Future Differences", - subtitle = "Absolute Cohen's d values across domains", - x = "Domain", - y = "|Cohen's d|", - fill = "Direction" - ) + - theme_minimal() + - theme(axis.text.y = element_text(size = 8)) - -print(p1) - -# Mean differences plot -plot_data_long <- summary_df %>% - select(Domain, Past_Mean, Future_Mean) %>% - pivot_longer(cols = c(Past_Mean, Future_Mean), - names_to = "TimePerspective", - 
-               values_to = "Mean_Difference") %>%
-  mutate(TimePerspective = gsub("_Mean", "", TimePerspective))
-
-p2 <- ggplot(plot_data_long, aes(x = reorder(Domain, Mean_Difference),
-                                 y = Mean_Difference,
-                                 fill = TimePerspective)) +
-  geom_col(position = "dodge") +
-  coord_flip() +
-  scale_fill_manual(values = c("Past" = "#E74C3C", "Future" = "#3498DB")) +
-  labs(
-    title = "Mean Differences by Time Perspective",
-    subtitle = "Past vs Future difference scores",
-    x = "Domain",
-    y = "Mean Difference Score",
-    fill = "Time Perspective"
-  ) +
-  theme_minimal() +
-  theme(axis.text.y = element_text(size = 8))
-
-print(p2)
-
-cat("\nAnalysis complete! Check the plots and summary table above.\n")
-cat("Key findings:\n")
-cat("- Domains with largest effect sizes:", paste(head(summary_df$Domain, 3), collapse = ", "), "\n")
-cat("- Number of significant differences:", sum(summary_df$Significant), "out of", nrow(summary_df), "\n")
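The script above standardizes the Past/Future contrast with a pooled two-group SD, yet every participant contributes both scores; for a paired contrast the effect size is often computed from the within-person difference scores instead. A hedged sketch (commonly written d_z; past_diff and fut_diff as in the deleted script, assuming complete pairs in matching participant order; dz_paired is an illustrative helper, not from the original):

dz_paired <- function(past_diff, fut_diff) {
  diffs <- past_diff - fut_diff  # within-person Past-minus-Future change
  mean(diffs, na.rm = TRUE) / sd(diffs, na.rm = TRUE)
}

# Usage alongside the existing pooled-SD value, for comparison:
# cat("Cohen's d_z (paired):", round(dz_paired(past_diff, fut_diff), 5), "\n")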
diff --git a/.history/mixed anova_20250912111710.r b/.history/mixed anova_20250912111710.r
deleted file mode 100644
index 892c6ae..0000000
--- a/.history/mixed anova_20250912111710.r
+++ /dev/null
@@ -1,281 +0,0 @@
-# Mixed ANOVA Analysis for Past vs Future Differences
-# EOHI Experiment Data Analysis
-
-# Load required libraries
-library(tidyverse)
-library(ez)
-library(car)
-library(nortest)  # For normality tests
-
-# Read the data
-data <- read.csv("eohi1/exp1.csv")
-
-# Display basic information about the dataset
-cat("Dataset dimensions:", dim(data), "\n")
-cat("Number of participants:", length(unique(data$pID)), "\n")
-
-# Check experimental conditions
-cat("\nExperimental conditions:\n")
-table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO)
-
-# Prepare data for mixed ANOVA
-# We'll reshape the data to have Past and Future as separate rows for each participant
-
-# Create a function to reshape data for a specific domain
-reshape_domain_data <- function(data, domain_name) {
-  past_col <- paste0("NPastDiff_", domain_name)
-  fut_col <- paste0("NFutDiff_", domain_name)
-
-  # Check if columns exist
-  if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) {
-    cat("Warning: Columns", past_col, "or", fut_col, "not found\n")
-    return(NULL)
-  }
-
-  # Create long format data
-  past_data <- data %>%
-    select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO,
-           demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(past_col)) %>%
-    mutate(TimePerspective = "Past",
.data[[past_col]]) %>% - select(-all_of(past_col)) - - fut_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(fut_col)) %>% - mutate(TimePerspective = "Future", - Difference = .data[[fut_col]]) %>% - select(-all_of(fut_col)) - - combined_data <- rbind(past_data, fut_data) %>% - mutate(TimePerspective = as.factor(TimePerspective), - pID = as.factor(pID)) - - return(combined_data) -} - -# Define domains to analyze -domains <- c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change") - -# Run mixed ANOVA for each domain -results_list <- list() - -for (domain in domains) { - cat("\n", "="*60, "\n") - cat("ANALYZING DOMAIN:", toupper(domain), "\n") - cat("="*60, "\n") - - # Reshape data for this domain - domain_data <- reshape_domain_data(data, domain) - - if (is.null(domain_data)) { - next - } - - # Check for missing values - missing_count <- sum(is.na(domain_data$Difference)) - if (missing_count > 0) { - cat("Warning:", missing_count, "missing values found for", domain, "\n") - domain_data <- domain_data[!is.na(domain_data$Difference), ] - } - - # Descriptive statistics - cat("\nDescriptive Statistics:\n") - desc_stats <- domain_data %>% - group_by(TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference, na.rm = TRUE), - sd = sd(Difference, na.rm = TRUE), - median = median(Difference, na.rm = TRUE), - .groups = 'drop' - ) - print(desc_stats) - - # Effect size (Cohen's d) - past_diff <- domain_data$Difference[domain_data$TimePerspective == "Past"] - fut_diff <- domain_data$Difference[domain_data$TimePerspective == "Future"] - - pooled_sd <- sqrt(((length(past_diff) - 1) * var(past_diff) + - (length(fut_diff) - 1) * var(fut_diff)) / - (length(past_diff) + length(fut_diff) - 2)) - - cohens_d <- (mean(past_diff) - mean(fut_diff)) / pooled_sd - - cat("\nCohen's d (Past vs Future):", round(cohens_d, 5), "\n") - - # Mixed ANOVA using ezANOVA - tryCatch({ - # Simple repeated measures ANOVA (TimePerspective as within-subjects) - anova_result <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("\nMixed ANOVA Results:\n") - print(anova_result) - - # Store results - results_list[[domain]] <- list( - descriptive = desc_stats, - cohens_d = cohens_d, - anova = anova_result - ) - - # Post-hoc comparisons if significant - if (!is.null(anova_result$ANOVA)) { - if (any(anova_result$ANOVA$p < 0.05, na.rm = TRUE)) { - cat("\nSignificant effects found! 
Post-hoc analysis:\n") - - # Pairwise comparisons for TimePerspective - if ("TimePerspective" %in% anova_result$ANOVA$Effect && - anova_result$ANOVA$p[anova_result$ANOVA$Effect == "TimePerspective"] < 0.05) { - - # Simple t-test for comparison - t_test_result <- t.test(past_diff, fut_diff, paired = TRUE) - cat("\nPaired t-test (Past vs Future):\n") - cat("t =", round(t_test_result$statistic, 3), - ", df =", t_test_result$parameter, - ", p =", round(t_test_result$p.value, 5), "\n") - } - } - } - - }, error = function(e) { - cat("Error in ANOVA for", domain, ":", e$message, "\n") - - # Fallback to simple paired t-test - t_test_result <- t.test(past_diff, fut_diff, paired = TRUE) - cat("\nFallback: Paired t-test (Past vs Future):\n") - cat("t =", round(t_test_result$statistic, 3), - ", df =", t_test_result$parameter, - ", p =", round(t_test_result$p.value, 5), "\n") - - results_list[[domain]] <- list( - descriptive = desc_stats, - cohens_d = cohens_d, - t_test = t_test_result - ) - }) -} - -# Summary of all results -cat("\n", "="*80, "\n") -cat("SUMMARY OF ALL DOMAINS\n") -cat("="*80, "\n") - -summary_df <- data.frame( - Domain = character(), - Past_Mean = numeric(), - Future_Mean = numeric(), - Cohen_d = numeric(), - Significant = logical(), - stringsAsFactors = FALSE -) - -for (domain in names(results_list)) { - result <- results_list[[domain]] - - past_mean <- result$descriptive$mean[result$descriptive$TimePerspective == "Past"] - fut_mean <- result$descriptive$mean[result$descriptive$TimePerspective == "Future"] - cohens_d <- result$cohens_d - - # Check if significant (p < 0.05) - significant <- FALSE - if (!is.null(result$anova) && !is.null(result$anova$ANOVA)) { - if ("TimePerspective" %in% result$anova$ANOVA$Effect) { - p_val <- result$anova$ANOVA$p[result$anova$ANOVA$Effect == "TimePerspective"] - significant <- !is.na(p_val) && p_val < 0.05 - } - } else if (!is.null(result$t_test)) { - significant <- result$t_test$p.value < 0.05 - } - - summary_df <- rbind(summary_df, data.frame( - Domain = domain, - Past_Mean = round(past_mean, 3), - Future_Mean = round(fut_mean, 3), - Cohen_d = round(cohens_d, 5), - Significant = significant - )) -} - -# Sort by effect size (absolute value) -summary_df <- summary_df[order(abs(summary_df$Cohen_d), decreasing = TRUE), ] - -print(summary_df) - -# Create visualization -library(ggplot2) - -# Prepare data for plotting -plot_data <- summary_df %>% - mutate( - Effect_Size = abs(Cohen_d), - Direction = ifelse(Cohen_d > 0, "Past > Future", "Future > Past"), - Domain_Type = case_when( - grepl("pref_", Domain) ~ "Preferences", - grepl("pers_", Domain) ~ "Personality", - grepl("val_", Domain) ~ "Values", - grepl("life_", Domain) ~ "Life Satisfaction", - TRUE ~ "Other" - ) - ) - -# Effect size plot -p1 <- ggplot(plot_data, aes(x = reorder(Domain, Effect_Size), y = Effect_Size, - fill = Direction, alpha = Significant)) + - geom_col() + - coord_flip() + - scale_alpha_manual(values = c(0.5, 1), name = "Significant\n(p < 0.05)") + - scale_fill_manual(values = c("Past > Future" = "#E74C3C", "Future > Past" = "#3498DB")) + - labs( - title = "Effect Sizes: Past vs Future Differences", - subtitle = "Absolute Cohen's d values across domains", - x = "Domain", - y = "|Cohen's d|", - fill = "Direction" - ) + - theme_minimal() + - theme(axis.text.y = element_text(size = 8)) - -print(p1) - -# Mean differences plot -plot_data_long <- summary_df %>% - select(Domain, Past_Mean, Future_Mean) %>% - pivot_longer(cols = c(Past_Mean, Future_Mean), - names_to = "TimePerspective", - 
values_to = "Mean_Difference") %>% - mutate(TimePerspective = gsub("_Mean", "", TimePerspective)) - -p2 <- ggplot(plot_data_long, aes(x = reorder(Domain, Mean_Difference), - y = Mean_Difference, - fill = TimePerspective)) + - geom_col(position = "dodge") + - coord_flip() + - scale_fill_manual(values = c("Past" = "#E74C3C", "Future" = "#3498DB")) + - labs( - title = "Mean Differences by Time Perspective", - subtitle = "Past vs Future difference scores", - x = "Domain", - y = "Mean Difference Score", - fill = "Time Perspective" - ) + - theme_minimal() + - theme(axis.text.y = element_text(size = 8)) - -print(p2) - -cat("\nAnalysis complete! Check the plots and summary table above.\n") -cat("Key findings:\n") -cat("- Domains with largest effect sizes:", paste(head(summary_df$Domain, 3), collapse = ", "), "\n") -cat("- Number of significant differences:", sum(summary_df$Significant), "out of", nrow(summary_df), "\n") diff --git a/.history/mixed anova_20250912111725.r b/.history/mixed anova_20250912111725.r deleted file mode 100644 index 0c721ff..0000000 --- a/.history/mixed anova_20250912111725.r +++ /dev/null @@ -1,324 +0,0 @@ -# Mixed ANOVA Analysis for Past vs Future Differences -# EOHI Experiment Data Analysis - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO) - -# STEP 1: PROPER DATA RESHAPING -# Define domains with their categories -domain_info <- data.frame( - domain = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"), - domain_type = c(rep("Preferences", 5), - rep("Personality", 5), - rep("Values", 5), - rep("Life_Satisfaction", 5)), - stringsAsFactors = FALSE -) - -# Function to reshape ALL domains at once with domain information -reshape_all_domains <- function(data, domain_info) { - all_long_data <- data.frame() - - for (i in 1:nrow(domain_info)) { - domain_name <- domain_info$domain[i] - domain_type <- domain_info$domain_type[i] - - past_col <- paste0("NPastDiff_", domain_name) - fut_col <- paste0("NFutDiff_", domain_name) - - # Check if columns exist - if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) { - cat("Warning: Columns", past_col, "or", fut_col, "not found\n") - next - } - - # Create long format data for this domain - past_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(past_col)) %>% - mutate( - TimePerspective = "Past", - Difference = .data[[past_col]], - Domain_Type = domain_type, # e.g., "Preferences" - Domain_Item = domain_name # e.g., "pref_read" - ) %>% - select(-all_of(past_col)) - - fut_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(fut_col)) %>% - mutate( - TimePerspective = "Future", - Difference = .data[[fut_col]], - Domain_Type = domain_type, - Domain_Item = domain_name - ) %>% - 
select(-all_of(fut_col)) - - # Combine past and future data for this domain - domain_long_data <- rbind(past_data, fut_data) - all_long_data <- rbind(all_long_data, domain_long_data) - } - - # Convert to factors - all_long_data <- all_long_data %>% - mutate( - TimePerspective = as.factor(TimePerspective), - Domain_Type = as.factor(Domain_Type), - Domain_Item = as.factor(Domain_Item), - pID = as.factor(pID) - ) - - return(all_long_data) -} - -# Reshape all data to long format -cat("\nReshaping data to long format...\n") -long_data <- reshape_all_domains(data, domain_info) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique domains:", length(unique(long_data$Domain_Item)), "\n") -cat("Domain types:", unique(long_data$Domain_Type), "\n") - -# Define domains to analyze -domains <- c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change") - -# Run mixed ANOVA for each domain -results_list <- list() - -for (domain in domains) { - cat("\n", "="*60, "\n") - cat("ANALYZING DOMAIN:", toupper(domain), "\n") - cat("="*60, "\n") - - # Reshape data for this domain - domain_data <- reshape_domain_data(data, domain) - - if (is.null(domain_data)) { - next - } - - # Check for missing values - missing_count <- sum(is.na(domain_data$Difference)) - if (missing_count > 0) { - cat("Warning:", missing_count, "missing values found for", domain, "\n") - domain_data <- domain_data[!is.na(domain_data$Difference), ] - } - - # Descriptive statistics - cat("\nDescriptive Statistics:\n") - desc_stats <- domain_data %>% - group_by(TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference, na.rm = TRUE), - sd = sd(Difference, na.rm = TRUE), - median = median(Difference, na.rm = TRUE), - .groups = 'drop' - ) - print(desc_stats) - - # Effect size (Cohen's d) - past_diff <- domain_data$Difference[domain_data$TimePerspective == "Past"] - fut_diff <- domain_data$Difference[domain_data$TimePerspective == "Future"] - - pooled_sd <- sqrt(((length(past_diff) - 1) * var(past_diff) + - (length(fut_diff) - 1) * var(fut_diff)) / - (length(past_diff) + length(fut_diff) - 2)) - - cohens_d <- (mean(past_diff) - mean(fut_diff)) / pooled_sd - - cat("\nCohen's d (Past vs Future):", round(cohens_d, 5), "\n") - - # Mixed ANOVA using ezANOVA - tryCatch({ - # Simple repeated measures ANOVA (TimePerspective as within-subjects) - anova_result <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("\nMixed ANOVA Results:\n") - print(anova_result) - - # Store results - results_list[[domain]] <- list( - descriptive = desc_stats, - cohens_d = cohens_d, - anova = anova_result - ) - - # Post-hoc comparisons if significant - if (!is.null(anova_result$ANOVA)) { - if (any(anova_result$ANOVA$p < 0.05, na.rm = TRUE)) { - cat("\nSignificant effects found! 
Post-hoc analysis:\n") - - # Pairwise comparisons for TimePerspective - if ("TimePerspective" %in% anova_result$ANOVA$Effect && - anova_result$ANOVA$p[anova_result$ANOVA$Effect == "TimePerspective"] < 0.05) { - - # Simple t-test for comparison - t_test_result <- t.test(past_diff, fut_diff, paired = TRUE) - cat("\nPaired t-test (Past vs Future):\n") - cat("t =", round(t_test_result$statistic, 3), - ", df =", t_test_result$parameter, - ", p =", round(t_test_result$p.value, 5), "\n") - } - } - } - - }, error = function(e) { - cat("Error in ANOVA for", domain, ":", e$message, "\n") - - # Fallback to simple paired t-test - t_test_result <- t.test(past_diff, fut_diff, paired = TRUE) - cat("\nFallback: Paired t-test (Past vs Future):\n") - cat("t =", round(t_test_result$statistic, 3), - ", df =", t_test_result$parameter, - ", p =", round(t_test_result$p.value, 5), "\n") - - results_list[[domain]] <- list( - descriptive = desc_stats, - cohens_d = cohens_d, - t_test = t_test_result - ) - }) -} - -# Summary of all results -cat("\n", "="*80, "\n") -cat("SUMMARY OF ALL DOMAINS\n") -cat("="*80, "\n") - -summary_df <- data.frame( - Domain = character(), - Past_Mean = numeric(), - Future_Mean = numeric(), - Cohen_d = numeric(), - Significant = logical(), - stringsAsFactors = FALSE -) - -for (domain in names(results_list)) { - result <- results_list[[domain]] - - past_mean <- result$descriptive$mean[result$descriptive$TimePerspective == "Past"] - fut_mean <- result$descriptive$mean[result$descriptive$TimePerspective == "Future"] - cohens_d <- result$cohens_d - - # Check if significant (p < 0.05) - significant <- FALSE - if (!is.null(result$anova) && !is.null(result$anova$ANOVA)) { - if ("TimePerspective" %in% result$anova$ANOVA$Effect) { - p_val <- result$anova$ANOVA$p[result$anova$ANOVA$Effect == "TimePerspective"] - significant <- !is.na(p_val) && p_val < 0.05 - } - } else if (!is.null(result$t_test)) { - significant <- result$t_test$p.value < 0.05 - } - - summary_df <- rbind(summary_df, data.frame( - Domain = domain, - Past_Mean = round(past_mean, 3), - Future_Mean = round(fut_mean, 3), - Cohen_d = round(cohens_d, 5), - Significant = significant - )) -} - -# Sort by effect size (absolute value) -summary_df <- summary_df[order(abs(summary_df$Cohen_d), decreasing = TRUE), ] - -print(summary_df) - -# Create visualization -library(ggplot2) - -# Prepare data for plotting -plot_data <- summary_df %>% - mutate( - Effect_Size = abs(Cohen_d), - Direction = ifelse(Cohen_d > 0, "Past > Future", "Future > Past"), - Domain_Type = case_when( - grepl("pref_", Domain) ~ "Preferences", - grepl("pers_", Domain) ~ "Personality", - grepl("val_", Domain) ~ "Values", - grepl("life_", Domain) ~ "Life Satisfaction", - TRUE ~ "Other" - ) - ) - -# Effect size plot -p1 <- ggplot(plot_data, aes(x = reorder(Domain, Effect_Size), y = Effect_Size, - fill = Direction, alpha = Significant)) + - geom_col() + - coord_flip() + - scale_alpha_manual(values = c(0.5, 1), name = "Significant\n(p < 0.05)") + - scale_fill_manual(values = c("Past > Future" = "#E74C3C", "Future > Past" = "#3498DB")) + - labs( - title = "Effect Sizes: Past vs Future Differences", - subtitle = "Absolute Cohen's d values across domains", - x = "Domain", - y = "|Cohen's d|", - fill = "Direction" - ) + - theme_minimal() + - theme(axis.text.y = element_text(size = 8)) - -print(p1) - -# Mean differences plot -plot_data_long <- summary_df %>% - select(Domain, Past_Mean, Future_Mean) %>% - pivot_longer(cols = c(Past_Mean, Future_Mean), - names_to = "TimePerspective", - 
values_to = "Mean_Difference") %>% - mutate(TimePerspective = gsub("_Mean", "", TimePerspective)) - -p2 <- ggplot(plot_data_long, aes(x = reorder(Domain, Mean_Difference), - y = Mean_Difference, - fill = TimePerspective)) + - geom_col(position = "dodge") + - coord_flip() + - scale_fill_manual(values = c("Past" = "#E74C3C", "Future" = "#3498DB")) + - labs( - title = "Mean Differences by Time Perspective", - subtitle = "Past vs Future difference scores", - x = "Domain", - y = "Mean Difference Score", - fill = "Time Perspective" - ) + - theme_minimal() + - theme(axis.text.y = element_text(size = 8)) - -print(p2) - -cat("\nAnalysis complete! Check the plots and summary table above.\n") -cat("Key findings:\n") -cat("- Domains with largest effect sizes:", paste(head(summary_df$Domain, 3), collapse = ", "), "\n") -cat("- Number of significant differences:", sum(summary_df$Significant), "out of", nrow(summary_df), "\n") diff --git a/.history/mixed anova_20250912111750.r b/.history/mixed anova_20250912111750.r deleted file mode 100644 index 83c0404..0000000 --- a/.history/mixed anova_20250912111750.r +++ /dev/null @@ -1,316 +0,0 @@ -# Mixed ANOVA Analysis for Past vs Future Differences -# EOHI Experiment Data Analysis - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO) - -# STEP 1: PROPER DATA RESHAPING -# Define domains with their categories -domain_info <- data.frame( - domain = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"), - domain_type = c(rep("Preferences", 5), - rep("Personality", 5), - rep("Values", 5), - rep("Life_Satisfaction", 5)), - stringsAsFactors = FALSE -) - -# Function to reshape ALL domains at once with domain information -reshape_all_domains <- function(data, domain_info) { - all_long_data <- data.frame() - - for (i in 1:nrow(domain_info)) { - domain_name <- domain_info$domain[i] - domain_type <- domain_info$domain_type[i] - - past_col <- paste0("NPastDiff_", domain_name) - fut_col <- paste0("NFutDiff_", domain_name) - - # Check if columns exist - if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) { - cat("Warning: Columns", past_col, "or", fut_col, "not found\n") - next - } - - # Create long format data for this domain - past_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(past_col)) %>% - mutate( - TimePerspective = "Past", - Difference = .data[[past_col]], - Domain_Type = domain_type, # e.g., "Preferences" - Domain_Item = domain_name # e.g., "pref_read" - ) %>% - select(-all_of(past_col)) - - fut_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(fut_col)) %>% - mutate( - TimePerspective = "Future", - Difference = .data[[fut_col]], - Domain_Type = domain_type, - Domain_Item = domain_name - ) %>% - 
select(-all_of(fut_col)) - - # Combine past and future data for this domain - domain_long_data <- rbind(past_data, fut_data) - all_long_data <- rbind(all_long_data, domain_long_data) - } - - # Convert to factors - all_long_data <- all_long_data %>% - mutate( - TimePerspective = as.factor(TimePerspective), - Domain_Type = as.factor(Domain_Type), - Domain_Item = as.factor(Domain_Item), - pID = as.factor(pID) - ) - - return(all_long_data) -} - -# Reshape all data to long format -cat("\nReshaping data to long format...\n") -long_data <- reshape_all_domains(data, domain_info) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique domains:", length(unique(long_data$Domain_Item)), "\n") -cat("Domain types:", unique(long_data$Domain_Type), "\n") - -# STEP 2: ASSUMPTION CHECKING -cat("\n", "="*80, "\n") -cat("STEP 2: CHECKING ASSUMPTIONS\n") -cat("="*80, "\n") - -# 2.1 Check for missing values -missing_summary <- long_data %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(Difference)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("\nMissing values by domain and time perspective:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$Difference), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\nChecking for outliers...\n") -outlier_summary <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(Difference < lower_bound | Difference > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\nTesting normality...\n") -normality_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(Difference)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(Difference)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\nTesting homogeneity of variance...\n") -homogeneity_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item) %>% - summarise( - levene_p = leveneTest(Difference ~ TimePerspective)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance results:\n") -print(homogeneity_results) - -# STEP 3: DESCRIPTIVE STATISTICS -cat("\n", "="*80, "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat("="*80, "\n") - -desc_stats <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - median = median(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - min = min(Difference), - max = max(Difference), - .groups = 'drop' - ) - -cat("Descriptive statistics:\n") 
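# Note on the banner idiom used throughout these snapshots: R has no string
# repetition operator, so cat("\n", "="*60, "\n") and cat("\n", "="*80, "\n")
# stop with "non-numeric argument to binary operator" the first time they run.
# A minimal runnable sketch of the intended separators, using base R strrep()
# (no extra package needed):
#   cat("\n", strrep("=", 80), "\n")   # section banner
#   cat(strrep("-", 50), "\n")         # likewise for the "-" * 50 sub-headers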
-print(desc_stats) - -# Summary of all results -cat("\n", "="*80, "\n") -cat("SUMMARY OF ALL DOMAINS\n") -cat("="*80, "\n") - -summary_df <- data.frame( - Domain = character(), - Past_Mean = numeric(), - Future_Mean = numeric(), - Cohen_d = numeric(), - Significant = logical(), - stringsAsFactors = FALSE -) - -for (domain in names(results_list)) { - result <- results_list[[domain]] - - past_mean <- result$descriptive$mean[result$descriptive$TimePerspective == "Past"] - fut_mean <- result$descriptive$mean[result$descriptive$TimePerspective == "Future"] - cohens_d <- result$cohens_d - - # Check if significant (p < 0.05) - significant <- FALSE - if (!is.null(result$anova) && !is.null(result$anova$ANOVA)) { - if ("TimePerspective" %in% result$anova$ANOVA$Effect) { - p_val <- result$anova$ANOVA$p[result$anova$ANOVA$Effect == "TimePerspective"] - significant <- !is.na(p_val) && p_val < 0.05 - } - } else if (!is.null(result$t_test)) { - significant <- result$t_test$p.value < 0.05 - } - - summary_df <- rbind(summary_df, data.frame( - Domain = domain, - Past_Mean = round(past_mean, 3), - Future_Mean = round(fut_mean, 3), - Cohen_d = round(cohens_d, 5), - Significant = significant - )) -} - -# Sort by effect size (absolute value) -summary_df <- summary_df[order(abs(summary_df$Cohen_d), decreasing = TRUE), ] - -print(summary_df) - -# Create visualization -library(ggplot2) - -# Prepare data for plotting -plot_data <- summary_df %>% - mutate( - Effect_Size = abs(Cohen_d), - Direction = ifelse(Cohen_d > 0, "Past > Future", "Future > Past"), - Domain_Type = case_when( - grepl("pref_", Domain) ~ "Preferences", - grepl("pers_", Domain) ~ "Personality", - grepl("val_", Domain) ~ "Values", - grepl("life_", Domain) ~ "Life Satisfaction", - TRUE ~ "Other" - ) - ) - -# Effect size plot -p1 <- ggplot(plot_data, aes(x = reorder(Domain, Effect_Size), y = Effect_Size, - fill = Direction, alpha = Significant)) + - geom_col() + - coord_flip() + - scale_alpha_manual(values = c(0.5, 1), name = "Significant\n(p < 0.05)") + - scale_fill_manual(values = c("Past > Future" = "#E74C3C", "Future > Past" = "#3498DB")) + - labs( - title = "Effect Sizes: Past vs Future Differences", - subtitle = "Absolute Cohen's d values across domains", - x = "Domain", - y = "|Cohen's d|", - fill = "Direction" - ) + - theme_minimal() + - theme(axis.text.y = element_text(size = 8)) - -print(p1) - -# Mean differences plot -plot_data_long <- summary_df %>% - select(Domain, Past_Mean, Future_Mean) %>% - pivot_longer(cols = c(Past_Mean, Future_Mean), - names_to = "TimePerspective", - values_to = "Mean_Difference") %>% - mutate(TimePerspective = gsub("_Mean", "", TimePerspective)) - -p2 <- ggplot(plot_data_long, aes(x = reorder(Domain, Mean_Difference), - y = Mean_Difference, - fill = TimePerspective)) + - geom_col(position = "dodge") + - coord_flip() + - scale_fill_manual(values = c("Past" = "#E74C3C", "Future" = "#3498DB")) + - labs( - title = "Mean Differences by Time Perspective", - subtitle = "Past vs Future difference scores", - x = "Domain", - y = "Mean Difference Score", - fill = "Time Perspective" - ) + - theme_minimal() + - theme(axis.text.y = element_text(size = 8)) - -print(p2) - -cat("\nAnalysis complete! 
Check the plots and summary table above.\n") -cat("Key findings:\n") -cat("- Domains with largest effect sizes:", paste(head(summary_df$Domain, 3), collapse = ", "), "\n") -cat("- Number of significant differences:", sum(summary_df$Significant), "out of", nrow(summary_df), "\n") diff --git a/.history/mixed anova_20250912111819.r b/.history/mixed anova_20250912111819.r deleted file mode 100644 index b1cee98..0000000 --- a/.history/mixed anova_20250912111819.r +++ /dev/null @@ -1,341 +0,0 @@ -# Mixed ANOVA Analysis for Past vs Future Differences -# EOHI Experiment Data Analysis - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO) - -# STEP 1: PROPER DATA RESHAPING -# Define domains with their categories -domain_info <- data.frame( - domain = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"), - domain_type = c(rep("Preferences", 5), - rep("Personality", 5), - rep("Values", 5), - rep("Life_Satisfaction", 5)), - stringsAsFactors = FALSE -) - -# Function to reshape ALL domains at once with domain information -reshape_all_domains <- function(data, domain_info) { - all_long_data <- data.frame() - - for (i in 1:nrow(domain_info)) { - domain_name <- domain_info$domain[i] - domain_type <- domain_info$domain_type[i] - - past_col <- paste0("NPastDiff_", domain_name) - fut_col <- paste0("NFutDiff_", domain_name) - - # Check if columns exist - if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) { - cat("Warning: Columns", past_col, "or", fut_col, "not found\n") - next - } - - # Create long format data for this domain - past_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(past_col)) %>% - mutate( - TimePerspective = "Past", - Difference = .data[[past_col]], - Domain_Type = domain_type, # e.g., "Preferences" - Domain_Item = domain_name # e.g., "pref_read" - ) %>% - select(-all_of(past_col)) - - fut_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(fut_col)) %>% - mutate( - TimePerspective = "Future", - Difference = .data[[fut_col]], - Domain_Type = domain_type, - Domain_Item = domain_name - ) %>% - select(-all_of(fut_col)) - - # Combine past and future data for this domain - domain_long_data <- rbind(past_data, fut_data) - all_long_data <- rbind(all_long_data, domain_long_data) - } - - # Convert to factors - all_long_data <- all_long_data %>% - mutate( - TimePerspective = as.factor(TimePerspective), - Domain_Type = as.factor(Domain_Type), - Domain_Item = as.factor(Domain_Item), - pID = as.factor(pID) - ) - - return(all_long_data) -} - -# Reshape all data to long format -cat("\nReshaping data to long format...\n") -long_data <- reshape_all_domains(data, domain_info) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique domains:", 
length(unique(long_data$Domain_Item)), "\n") -cat("Domain types:", unique(long_data$Domain_Type), "\n") - -# STEP 2: ASSUMPTION CHECKING -cat("\n", "="*80, "\n") -cat("STEP 2: CHECKING ASSUMPTIONS\n") -cat("="*80, "\n") - -# 2.1 Check for missing values -missing_summary <- long_data %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(Difference)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("\nMissing values by domain and time perspective:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$Difference), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\nChecking for outliers...\n") -outlier_summary <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(Difference < lower_bound | Difference > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\nTesting normality...\n") -normality_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(Difference)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(Difference)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\nTesting homogeneity of variance...\n") -homogeneity_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item) %>% - summarise( - levene_p = leveneTest(Difference ~ TimePerspective)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance results:\n") -print(homogeneity_results) - -# STEP 3: DESCRIPTIVE STATISTICS -cat("\n", "="*80, "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat("="*80, "\n") - -desc_stats <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - median = median(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - min = min(Difference), - max = max(Difference), - .groups = 'drop' - ) - -cat("Descriptive statistics:\n") -print(desc_stats) - -# STEP 4: MIXED ANOVA ANALYSES -cat("\n", "="*80, "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat("="*80, "\n") - -# 4.1 Overall analysis across all domains -cat("\n4.1 Overall Mixed ANOVA (all domains combined):\n") -cat("-" * 50, "\n") - -tryCatch({ - overall_anova <- ezANOVA( - data = long_data_clean, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Type), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Overall ANOVA Results:\n") - print(overall_anova) - - # Check sphericity - if (!is.null(overall_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test 
results:\n") - print(overall_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in overall ANOVA:", e$message, "\n") -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat("-" * 50, "\n") - -domain_results <- list() - -for (domain_type in unique(long_data_clean$Domain_Type)) { - cat("\nAnalyzing domain type:", domain_type, "\n") - - domain_data <- long_data_clean[long_data_clean$Domain_Type == domain_type, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Item), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_type, ":\n") - print(domain_anova) - - domain_results[[domain_type]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_type, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain_type]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Individual domain item analyses -cat("\n4.3 Individual Domain Item Analyses:\n") -cat("-" * 50, "\n") - -item_results <- list() - -for (domain_item in unique(long_data_clean$Domain_Item)) { - cat("\nAnalyzing individual item:", domain_item, "\n") - - item_data <- long_data_clean[long_data_clean$Domain_Item == domain_item, ] - - tryCatch({ - item_anova <- ezANOVA( - data = item_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_item, ":\n") - print(item_anova) - - item_results[[domain_item]] <- item_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_item, ":", e$message, "\n") - - # Fallback to paired t-test - past_vals <- item_data$Difference[item_data$TimePerspective == "Past"] - fut_vals <- item_data$Difference[item_data$TimePerspective == "Future"] - - if (length(past_vals) > 1 && length(fut_vals) > 1) { - t_test <- t.test(past_vals, fut_vals, paired = TRUE) - cat("Fallback paired t-test for", domain_item, ":\n") - cat("t =", round(t_test$statistic, 3), - ", df =", t_test$parameter, - ", p =", round(t_test$p.value, 5), "\n") - - item_results[[domain_item]] <- t_test - } - }) -} - -cat("\n", "="*80, "\n") -cat("ANALYSIS COMPLETE!\n") -cat("="*80, "\n") -cat("Summary:\n") -cat("- Total domains analyzed:", length(unique(long_data_clean$Domain_Item)), "\n") -cat("- Domain types analyzed:", length(unique(long_data_clean$Domain_Type)), "\n") -cat("- Individual item analyses completed:", length(item_results), "\n") diff --git a/.history/mixed anova_20250912111827.r b/.history/mixed anova_20250912111827.r deleted file mode 100644 index b1cee98..0000000 --- a/.history/mixed anova_20250912111827.r +++ /dev/null @@ -1,341 +0,0 @@ -# Mixed ANOVA Analysis for Past vs Future Differences -# EOHI Experiment Data Analysis - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - 
-# Check experimental conditions -cat("\nExperimental conditions:\n") -table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO) - -# STEP 1: PROPER DATA RESHAPING -# Define domains with their categories -domain_info <- data.frame( - domain = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"), - domain_type = c(rep("Preferences", 5), - rep("Personality", 5), - rep("Values", 5), - rep("Life_Satisfaction", 5)), - stringsAsFactors = FALSE -) - -# Function to reshape ALL domains at once with domain information -reshape_all_domains <- function(data, domain_info) { - all_long_data <- data.frame() - - for (i in 1:nrow(domain_info)) { - domain_name <- domain_info$domain[i] - domain_type <- domain_info$domain_type[i] - - past_col <- paste0("NPastDiff_", domain_name) - fut_col <- paste0("NFutDiff_", domain_name) - - # Check if columns exist - if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) { - cat("Warning: Columns", past_col, "or", fut_col, "not found\n") - next - } - - # Create long format data for this domain - past_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(past_col)) %>% - mutate( - TimePerspective = "Past", - Difference = .data[[past_col]], - Domain_Type = domain_type, # e.g., "Preferences" - Domain_Item = domain_name # e.g., "pref_read" - ) %>% - select(-all_of(past_col)) - - fut_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(fut_col)) %>% - mutate( - TimePerspective = "Future", - Difference = .data[[fut_col]], - Domain_Type = domain_type, - Domain_Item = domain_name - ) %>% - select(-all_of(fut_col)) - - # Combine past and future data for this domain - domain_long_data <- rbind(past_data, fut_data) - all_long_data <- rbind(all_long_data, domain_long_data) - } - - # Convert to factors - all_long_data <- all_long_data %>% - mutate( - TimePerspective = as.factor(TimePerspective), - Domain_Type = as.factor(Domain_Type), - Domain_Item = as.factor(Domain_Item), - pID = as.factor(pID) - ) - - return(all_long_data) -} - -# Reshape all data to long format -cat("\nReshaping data to long format...\n") -long_data <- reshape_all_domains(data, domain_info) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique domains:", length(unique(long_data$Domain_Item)), "\n") -cat("Domain types:", unique(long_data$Domain_Type), "\n") - -# STEP 2: ASSUMPTION CHECKING -cat("\n", "="*80, "\n") -cat("STEP 2: CHECKING ASSUMPTIONS\n") -cat("="*80, "\n") - -# 2.1 Check for missing values -missing_summary <- long_data %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(Difference)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("\nMissing values by domain and time perspective:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$Difference), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\nChecking for outliers...\n") -outlier_summary <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - 
n = n(), - mean = mean(Difference), - sd = sd(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(Difference < lower_bound | Difference > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\nTesting normality...\n") -normality_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(Difference)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(Difference)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\nTesting homogeneity of variance...\n") -homogeneity_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item) %>% - summarise( - levene_p = leveneTest(Difference ~ TimePerspective)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance results:\n") -print(homogeneity_results) - -# STEP 3: DESCRIPTIVE STATISTICS -cat("\n", "="*80, "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat("="*80, "\n") - -desc_stats <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - median = median(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - min = min(Difference), - max = max(Difference), - .groups = 'drop' - ) - -cat("Descriptive statistics:\n") -print(desc_stats) - -# STEP 4: MIXED ANOVA ANALYSES -cat("\n", "="*80, "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat("="*80, "\n") - -# 4.1 Overall analysis across all domains -cat("\n4.1 Overall Mixed ANOVA (all domains combined):\n") -cat("-" * 50, "\n") - -tryCatch({ - overall_anova <- ezANOVA( - data = long_data_clean, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Type), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Overall ANOVA Results:\n") - print(overall_anova) - - # Check sphericity - if (!is.null(overall_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(overall_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in overall ANOVA:", e$message, "\n") -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat("-" * 50, "\n") - -domain_results <- list() - -for (domain_type in unique(long_data_clean$Domain_Type)) { - cat("\nAnalyzing domain type:", domain_type, "\n") - - domain_data <- long_data_clean[long_data_clean$Domain_Type == domain_type, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Item), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_type, ":\n") - print(domain_anova) - - domain_results[[domain_type]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_type, ":", e$message, "\n") - - # Fallback to 
simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain_type]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Individual domain item analyses -cat("\n4.3 Individual Domain Item Analyses:\n") -cat("-" * 50, "\n") - -item_results <- list() - -for (domain_item in unique(long_data_clean$Domain_Item)) { - cat("\nAnalyzing individual item:", domain_item, "\n") - - item_data <- long_data_clean[long_data_clean$Domain_Item == domain_item, ] - - tryCatch({ - item_anova <- ezANOVA( - data = item_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_item, ":\n") - print(item_anova) - - item_results[[domain_item]] <- item_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_item, ":", e$message, "\n") - - # Fallback to paired t-test - past_vals <- item_data$Difference[item_data$TimePerspective == "Past"] - fut_vals <- item_data$Difference[item_data$TimePerspective == "Future"] - - if (length(past_vals) > 1 && length(fut_vals) > 1) { - t_test <- t.test(past_vals, fut_vals, paired = TRUE) - cat("Fallback paired t-test for", domain_item, ":\n") - cat("t =", round(t_test$statistic, 3), - ", df =", t_test$parameter, - ", p =", round(t_test$p.value, 5), "\n") - - item_results[[domain_item]] <- t_test - } - }) -} - -cat("\n", "="*80, "\n") -cat("ANALYSIS COMPLETE!\n") -cat("="*80, "\n") -cat("Summary:\n") -cat("- Total domains analyzed:", length(unique(long_data_clean$Domain_Item)), "\n") -cat("- Domain types analyzed:", length(unique(long_data_clean$Domain_Type)), "\n") -cat("- Individual item analyses completed:", length(item_results), "\n") diff --git a/.history/mixed anova_20250912112005.r b/.history/mixed anova_20250912112005.r deleted file mode 100644 index b1cee98..0000000 --- a/.history/mixed anova_20250912112005.r +++ /dev/null @@ -1,341 +0,0 @@ -# Mixed ANOVA Analysis for Past vs Future Differences -# EOHI Experiment Data Analysis - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO) - -# STEP 1: PROPER DATA RESHAPING -# Define domains with their categories -domain_info <- data.frame( - domain = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"), - domain_type = c(rep("Preferences", 5), - rep("Personality", 5), - rep("Values", 5), - rep("Life_Satisfaction", 5)), - stringsAsFactors = FALSE -) - -# Function to reshape ALL domains at once with domain information -reshape_all_domains <- function(data, domain_info) { - all_long_data <- data.frame() - - for (i in 1:nrow(domain_info)) { - domain_name <- 
domain_info$domain[i]
-    domain_type <- domain_info$domain_type[i]
-
-    past_col <- paste0("NPastDiff_", domain_name)
-    fut_col <- paste0("NFutDiff_", domain_name)
-
-    # Check if columns exist
-    if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) {
-      cat("Warning: Columns", past_col, "or", fut_col, "not found\n")
-      next
-    }
-
-    # Create long format data for this domain
-    past_data <- data %>%
-      select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO,
-             demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(past_col)) %>%
-      mutate(
-        TimePerspective = "Past",
-        Difference = .data[[past_col]],
-        Domain_Type = domain_type,  # e.g., "Preferences"
-        Domain_Item = domain_name   # e.g., "pref_read"
-      ) %>%
-      select(-all_of(past_col))
-
-    fut_data <- data %>%
-      select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO,
-             demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(fut_col)) %>%
-      mutate(
-        TimePerspective = "Future",
-        Difference = .data[[fut_col]],
-        Domain_Type = domain_type,
-        Domain_Item = domain_name
-      ) %>%
-      select(-all_of(fut_col))
-
-    # Combine past and future data for this domain
-    domain_long_data <- rbind(past_data, fut_data)
-    all_long_data <- rbind(all_long_data, domain_long_data)
-  }
-
-  # Convert to factors
-  all_long_data <- all_long_data %>%
-    mutate(
-      TimePerspective = as.factor(TimePerspective),
-      Domain_Type = as.factor(Domain_Type),
-      Domain_Item = as.factor(Domain_Item),
-      pID = as.factor(pID)
-    )
-
-  return(all_long_data)
-}
-
-# Reshape all data to long format
-cat("\nReshaping data to long format...\n")
-long_data <- reshape_all_domains(data, domain_info)
-
-cat("Long format data dimensions:", dim(long_data), "\n")
-cat("Unique domains:", length(unique(long_data$Domain_Item)), "\n")
-cat("Domain types:", unique(long_data$Domain_Type), "\n")
-
-# STEP 2: ASSUMPTION CHECKING
-cat("\n", "="*80, "\n")
-cat("STEP 2: CHECKING ASSUMPTIONS\n")
-cat("="*80, "\n")
-
-# 2.1 Check for missing values
-missing_summary <- long_data %>%
-  group_by(Domain_Type, Domain_Item, TimePerspective) %>%
-  summarise(
-    n_total = n(),
-    n_missing = sum(is.na(Difference)),
-    pct_missing = round(100 * n_missing / n_total, 2),
-    .groups = 'drop'
-  )
-
-cat("\nMissing values by domain and time perspective:\n")
-print(missing_summary)
-
-# Remove missing values
-long_data_clean <- long_data[!is.na(long_data$Difference), ]
-cat("\nData after removing missing values:", dim(long_data_clean), "\n")
-
-# 2.2 Outlier detection
-cat("\nChecking for outliers...\n")
-outlier_summary <- long_data_clean %>%
-  group_by(Domain_Type, Domain_Item, TimePerspective) %>%
-  summarise(
-    n = n(),
-    mean = mean(Difference),
-    sd = sd(Difference),
-    q1 = quantile(Difference, 0.25),
-    q3 = quantile(Difference, 0.75),
-    iqr = q3 - q1,
-    lower_bound = q1 - 1.5 * iqr,
-    upper_bound = q3 + 1.5 * iqr,
-    n_outliers = sum(Difference < lower_bound | Difference > upper_bound),
-    .groups = 'drop'
-  )
-
-cat("Outlier summary (IQR method):\n")
-print(outlier_summary)
-
-# 2.3 Normality tests
-cat("\nTesting normality...\n")
-normality_results <- long_data_clean %>%
-  group_by(Domain_Type, Domain_Item, TimePerspective) %>%
-  summarise(
-    n = n(),
-    shapiro_p = ifelse(n >= 3 & n <= 5000,
-                       shapiro.test(Difference)$p.value,
-                       NA),
-    anderson_p = ifelse(n >= 7,
-                        ad.test(Difference)$p.value,
-                        NA),
-    .groups = 'drop'
-  ) %>%
-  mutate(
-    shapiro_normal = shapiro_p > 0.05,
-    anderson_normal = anderson_p > 0.05,
-    overall_normal = case_when(
-      !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal,
-      !is.na(shapiro_p) ~ shapiro_normal,
-      !is.na(anderson_p) ~ anderson_normal,
-      TRUE ~ NA
-    )
-  )
-
-cat("Normality test results:\n")
-print(normality_results)
-
-# 2.4 Homogeneity of variance (Levene's test)
-cat("\nTesting homogeneity of variance...\n")
-homogeneity_results <- long_data_clean %>%
-  group_by(Domain_Type, Domain_Item) %>%
-  summarise(
-    levene_p = leveneTest(Difference ~ TimePerspective)$`Pr(>F)`[1],
-    homogeneous = levene_p > 0.05,
-    .groups = 'drop'
-  )
-
-cat("Homogeneity of variance results:\n")
-print(homogeneity_results)
-
-# STEP 3: DESCRIPTIVE STATISTICS
-cat("\n", "="*80, "\n")
-cat("STEP 3: DESCRIPTIVE STATISTICS\n")
-cat("="*80, "\n")
-
-desc_stats <- long_data_clean %>%
-  group_by(Domain_Type, Domain_Item, TimePerspective) %>%
-  summarise(
-    n = n(),
-    mean = mean(Difference),
-    sd = sd(Difference),
-    median = median(Difference),
-    q1 = quantile(Difference, 0.25),
-    q3 = quantile(Difference, 0.75),
-    min = min(Difference),
-    max = max(Difference),
-    .groups = 'drop'
-  )
-
-cat("Descriptive statistics:\n")
-print(desc_stats)
-
-# STEP 4: MIXED ANOVA ANALYSES
-cat("\n", "="*80, "\n")
-cat("STEP 4: MIXED ANOVA ANALYSES\n")
-cat("="*80, "\n")
-
-# 4.1 Overall analysis across all domains
-cat("\n4.1 Overall Mixed ANOVA (all domains combined):\n")
-cat("-" * 50, "\n")
-
-tryCatch({
-  overall_anova <- ezANOVA(
-    data = long_data_clean,
-    dv = Difference,
-    wid = pID,
-    within = c(TimePerspective, Domain_Type),
-    between = c(GROUP, TASK_DO),
-    type = 3,
-    detailed = TRUE,
-    return_aov = TRUE
-  )
-
-  cat("Overall ANOVA Results:\n")
-  print(overall_anova)
-
-  # Check sphericity
-  if (!is.null(overall_anova$`Mauchly's Test for Sphericity`)) {
-    cat("\nSphericity test results:\n")
-    print(overall_anova$`Mauchly's Test for Sphericity`)
-  }
-
-}, error = function(e) {
-  cat("Error in overall ANOVA:", e$message, "\n")
-})
-
-# 4.2 Domain-specific analyses
-cat("\n4.2 Domain-specific Mixed ANOVAs:\n")
-cat("-" * 50, "\n")
-
-domain_results <- list()
-
-for (domain_type in unique(long_data_clean$Domain_Type)) {
-  cat("\nAnalyzing domain type:", domain_type, "\n")
-
-  domain_data <- long_data_clean[long_data_clean$Domain_Type == domain_type, ]
-
-  tryCatch({
-    domain_anova <- ezANOVA(
-      data = domain_data,
-      dv = Difference,
-      wid = pID,
-      within = c(TimePerspective, Domain_Item),
-      between = c(GROUP, TASK_DO),
-      type = 3,
-      detailed = TRUE
-    )
-
-    cat("ANOVA results for", domain_type, ":\n")
-    print(domain_anova)
-
-    domain_results[[domain_type]] <- domain_anova
-
-  }, error = function(e) {
-    cat("Error in ANOVA for", domain_type, ":", e$message, "\n")
-
-    # Fallback to simpler analysis
-    cat("Attempting simpler repeated measures ANOVA...\n")
-    tryCatch({
-      simple_anova <- ezANOVA(
-        data = domain_data,
-        dv = Difference,
-        wid = pID,
-        within = TimePerspective,
-        between = c(GROUP, TASK_DO),
-        type = 3,
-        detailed = TRUE
-      )
-      print(simple_anova)
-      domain_results[[domain_type]] <- simple_anova
-    }, error = function(e2) {
-      cat("Simple ANOVA also failed:", e2$message, "\n")
-    })
-  })
-}
-
-# 4.3 Individual domain item analyses
-cat("\n4.3 Individual Domain Item Analyses:\n")
-cat("-" * 50, "\n")
-
-item_results <- list()
-
-for (domain_item in unique(long_data_clean$Domain_Item)) {
-  cat("\nAnalyzing individual item:", domain_item, "\n")
-
-  item_data <- long_data_clean[long_data_clean$Domain_Item == domain_item, ]
-
-  tryCatch({
-    item_anova <- ezANOVA(
-      data = item_data,
-      dv = Difference,
-      wid = pID,
-      within = TimePerspective,
-      between = c(GROUP, TASK_DO),
-      type = 3,
-      detailed = TRUE
-    )
-
-    cat("ANOVA results for", domain_item, ":\n")
-    print(item_anova)
-
-    item_results[[domain_item]] <- item_anova
-
-  }, error = function(e) {
-    cat("Error in ANOVA for", domain_item, ":", e$message, "\n")
-
-    # Fallback to paired t-test
-    past_vals <- item_data$Difference[item_data$TimePerspective == "Past"]
-    fut_vals <- item_data$Difference[item_data$TimePerspective == "Future"]
-
-    if (length(past_vals) > 1 && length(fut_vals) > 1) {
-      t_test <- t.test(past_vals, fut_vals, paired = TRUE)
-      cat("Fallback paired t-test for", domain_item, ":\n")
-      cat("t =", round(t_test$statistic, 3),
-          ", df =", t_test$parameter,
-          ", p =", round(t_test$p.value, 5), "\n")
-
-      item_results[[domain_item]] <- t_test
-    }
-  })
-}
-
-cat("\n", "="*80, "\n")
-cat("ANALYSIS COMPLETE!\n")
-cat("="*80, "\n")
-cat("Summary:\n")
-cat("- Total domains analyzed:", length(unique(long_data_clean$Domain_Item)), "\n")
-cat("- Domain types analyzed:", length(unique(long_data_clean$Domain_Type)), "\n")
-cat("- Individual item analyses completed:", length(item_results), "\n")
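A recurring bug in the deleted snapshots above: R has no string-repetition operator, so the banner calls cat("\n", "="*80, "\n") and cat("-" * 50, "\n") fail at run time with "non-numeric argument to binary operator". The later snapshots in this series switch some banners to paste(rep("=", 80), collapse = ""). A minimal sketch of that idiom, using a hypothetical banner() helper that is not part of the deleted scripts (strrep() is the equivalent base-R >= 3.3.0 one-liner):

# banner() is illustrative only: it builds the rule with paste(rep(...)),
# the same fix the later snapshots apply; strrep("=", width) does the same.
banner <- function(label, width = 80) {
  rule <- paste(rep("=", width), collapse = "")
  cat("\n", rule, "\n", label, "\n", rule, "\n", sep = "")
}
banner("STEP 2: CHECKING ASSUMPTIONS")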
diff --git a/.history/mixed anova_20250912112141.r b/.history/mixed anova_20250912112141.r
deleted file mode 100644
index b1cee98..0000000
--- a/.history/mixed anova_20250912112141.r
+++ /dev/null
@@ -1,341 +0,0 @@
diff --git a/.history/mixed anova_20250912112319.r b/.history/mixed anova_20250912112319.r
deleted file mode 100644
index 83e7818..0000000
--- a/.history/mixed anova_20250912112319.r
+++ /dev/null
@@ -1,347 +0,0 @@
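The homogeneity step in these snapshots calls leveneTest(Difference ~ TimePerspective) inside summarise() without a data argument, so car resolves the formula's variables through the calling environment rather than receiving each grouped slice explicitly. A sketch of the same per-domain test with data = passed per group; this assumes car is attached and long_data_clean exists as in the scripts:

# One Levene p-value per Domain_Type x Domain_Item cell; each group's rows
# are handed to car::leveneTest() explicitly via the data argument.
grp <- interaction(long_data_clean$Domain_Type,
                   long_data_clean$Domain_Item, drop = TRUE)
levene_p <- sapply(split(long_data_clean, grp), function(d)
  car::leveneTest(Difference ~ TimePerspective, data = d)$`Pr(>F)`[1])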
-cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique domains:", length(unique(long_data$Domain_Item)), "\n") -cat("Domain types:", unique(long_data$Domain_Type), "\n") - -# STEP 2: ASSUMPTION CHECKING -cat("\n", "="*80, "\n") -cat("STEP 2: CHECKING ASSUMPTIONS\n") -cat("="*80, "\n") - -# 2.1 Check for missing values -missing_summary <- long_data %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(Difference)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("\nMissing values by domain and time perspective:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$Difference), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\nChecking for outliers...\n") -outlier_summary <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(Difference < lower_bound | Difference > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\nTesting normality...\n") -normality_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(Difference)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(Difference)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\nTesting homogeneity of variance...\n") -homogeneity_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item) %>% - summarise( - levene_p = leveneTest(Difference ~ TimePerspective)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance results:\n") -print(homogeneity_results) - -# STEP 3: DESCRIPTIVE STATISTICS -cat("\n", "="*80, "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat("="*80, "\n") - -desc_stats <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - median = median(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - min = min(Difference), - max = max(Difference), - .groups = 'drop' - ) - -cat("Descriptive statistics:\n") -print(desc_stats) - -# STEP 4: MIXED ANOVA ANALYSES -cat("\n", "="*80, "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat("="*80, "\n") - -# 4.1 Overall analysis across all domains -cat("\n4.1 Overall Mixed ANOVA (all domains combined):\n") -cat("-" * 50, "\n") - -tryCatch({ - overall_anova <- ezANOVA( - data = long_data_clean, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Type), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Overall ANOVA Results:\n") - print(overall_anova) - - # Check sphericity - if 
(!is.null(overall_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(overall_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in overall ANOVA:", e$message, "\n") -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat("-" * 50, "\n") - -domain_results <- list() - -for (domain_type in unique(long_data_clean$Domain_Type)) { - cat("\nAnalyzing domain type:", domain_type, "\n") - - domain_data <- long_data_clean[long_data_clean$Domain_Type == domain_type, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Item), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_type, ":\n") - print(domain_anova) - - domain_results[[domain_type]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_type, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain_type]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Individual domain item analyses -cat("\n4.3 Individual Domain Item Analyses:\n") -cat("-" * 50, "\n") - -item_results <- list() - -for (domain_item in unique(long_data_clean$Domain_Item)) { - cat("\nAnalyzing individual item:", domain_item, "\n") - - item_data <- long_data_clean[long_data_clean$Domain_Item == domain_item, ] - - tryCatch({ - item_anova <- ezANOVA( - data = item_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_item, ":\n") - print(item_anova) - - item_results[[domain_item]] <- item_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_item, ":", e$message, "\n") - - # Fallback to paired t-test - past_vals <- item_data$Difference[item_data$TimePerspective == "Past"] - fut_vals <- item_data$Difference[item_data$TimePerspective == "Future"] - - if (length(past_vals) > 1 && length(fut_vals) > 1) { - t_test <- t.test(past_vals, fut_vals, paired = TRUE) - cat("Fallback paired t-test for", domain_item, ":\n") - cat("t =", round(t_test$statistic, 3), - ", df =", t_test$parameter, - ", p =", round(t_test$p.value, 5), "\n") - - item_results[[domain_item]] <- t_test - } - }) -} - -cat("\n", "="*80, "\n") -cat("ANALYSIS COMPLETE!\n") -cat("="*80, "\n") -cat("Summary:\n") -cat("- Total domains analyzed:", length(unique(long_data_clean$Domain_Item)), "\n") -cat("- Domain types analyzed:", length(unique(long_data_clean$Domain_Type)), "\n") -cat("- Individual item analyses completed:", length(item_results), "\n") diff --git a/.history/mixed anova_20250912112327.r b/.history/mixed anova_20250912112327.r deleted file mode 100644 index 83e7818..0000000 --- a/.history/mixed anova_20250912112327.r +++ /dev/null @@ -1,347 +0,0 @@ -# Mixed ANOVA Analysis for Past vs Future Differences -# EOHI Experiment Data Analysis - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", 
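The normality step guards shapiro.test() only against group sizes outside 3..5000, but the test also stops when all values in a group are identical, and any such error aborts the whole summarise() pipeline. A guarded sketch; safe_shapiro_p() is a hypothetical wrapper, and the same pattern works for nortest::ad.test:

safe_shapiro_p <- function(x) {
  x <- x[!is.na(x)]
  # Return NA rather than erroring for groups the test cannot handle:
  # wrong sample size, or zero variance.
  if (length(x) < 3 || length(x) > 5000 || length(unique(x)) < 2) return(NA_real_)
  tryCatch(shapiro.test(x)$p.value, error = function(e) NA_real_)
}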
dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO) - -# STEP 1: PROPER DATA RESHAPING -# Define domains with their categories -domain_info <- data.frame( - domain = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"), - domain_type = c(rep("Preferences", 5), - rep("Personality", 5), - rep("Values", 5), - rep("Life_Satisfaction", 5)), - stringsAsFactors = FALSE -) - -# Display domain_info -cat("\nDomain Information:\n") -print(domain_info) -cat("\nDomain type summary:\n") -print(table(domain_info$domain_type)) - -# Function to reshape ALL domains at once with domain information -reshape_all_domains <- function(data, domain_info) { - all_long_data <- data.frame() - - for (i in 1:nrow(domain_info)) { - domain_name <- domain_info$domain[i] - domain_type <- domain_info$domain_type[i] - - past_col <- paste0("NPastDiff_", domain_name) - fut_col <- paste0("NFutDiff_", domain_name) - - # Check if columns exist - if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) { - cat("Warning: Columns", past_col, "or", fut_col, "not found\n") - next - } - - # Create long format data for this domain - past_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(past_col)) %>% - mutate( - TimePerspective = "Past", - Difference = .data[[past_col]], - Domain_Type = domain_type, # e.g., "Preferences" - Domain_Item = domain_name # e.g., "pref_read" - ) %>% - select(-all_of(past_col)) - - fut_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(fut_col)) %>% - mutate( - TimePerspective = "Future", - Difference = .data[[fut_col]], - Domain_Type = domain_type, - Domain_Item = domain_name - ) %>% - select(-all_of(fut_col)) - - # Combine past and future data for this domain - domain_long_data <- rbind(past_data, fut_data) - all_long_data <- rbind(all_long_data, domain_long_data) - } - - # Convert to factors - all_long_data <- all_long_data %>% - mutate( - TimePerspective = as.factor(TimePerspective), - Domain_Type = as.factor(Domain_Type), - Domain_Item = as.factor(Domain_Item), - pID = as.factor(pID) - ) - - return(all_long_data) -} - -# Reshape all data to long format -cat("\nReshaping data to long format...\n") -long_data <- reshape_all_domains(data, domain_info) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique domains:", length(unique(long_data$Domain_Item)), "\n") -cat("Domain types:", unique(long_data$Domain_Type), "\n") - -# STEP 2: ASSUMPTION CHECKING -cat("\n", "="*80, "\n") -cat("STEP 2: CHECKING ASSUMPTIONS\n") -cat("="*80, "\n") - -# 2.1 Check for missing values -missing_summary <- long_data %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(Difference)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("\nMissing values by domain and time perspective:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$Difference), ] -cat("\nData after 
removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\nChecking for outliers...\n") -outlier_summary <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(Difference < lower_bound | Difference > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\nTesting normality...\n") -normality_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(Difference)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(Difference)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\nTesting homogeneity of variance...\n") -homogeneity_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item) %>% - summarise( - levene_p = leveneTest(Difference ~ TimePerspective)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance results:\n") -print(homogeneity_results) - -# STEP 3: DESCRIPTIVE STATISTICS -cat("\n", "="*80, "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat("="*80, "\n") - -desc_stats <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - median = median(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - min = min(Difference), - max = max(Difference), - .groups = 'drop' - ) - -cat("Descriptive statistics:\n") -print(desc_stats) - -# STEP 4: MIXED ANOVA ANALYSES -cat("\n", "="*80, "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat("="*80, "\n") - -# 4.1 Overall analysis across all domains -cat("\n4.1 Overall Mixed ANOVA (all domains combined):\n") -cat("-" * 50, "\n") - -tryCatch({ - overall_anova <- ezANOVA( - data = long_data_clean, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Type), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Overall ANOVA Results:\n") - print(overall_anova) - - # Check sphericity - if (!is.null(overall_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(overall_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in overall ANOVA:", e$message, "\n") -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat("-" * 50, "\n") - -domain_results <- list() - -for (domain_type in unique(long_data_clean$Domain_Type)) { - cat("\nAnalyzing domain type:", domain_type, "\n") - - domain_data <- long_data_clean[long_data_clean$Domain_Type == domain_type, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Item), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - 
) - - cat("ANOVA results for", domain_type, ":\n") - print(domain_anova) - - domain_results[[domain_type]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_type, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain_type]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Individual domain item analyses -cat("\n4.3 Individual Domain Item Analyses:\n") -cat("-" * 50, "\n") - -item_results <- list() - -for (domain_item in unique(long_data_clean$Domain_Item)) { - cat("\nAnalyzing individual item:", domain_item, "\n") - - item_data <- long_data_clean[long_data_clean$Domain_Item == domain_item, ] - - tryCatch({ - item_anova <- ezANOVA( - data = item_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_item, ":\n") - print(item_anova) - - item_results[[domain_item]] <- item_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_item, ":", e$message, "\n") - - # Fallback to paired t-test - past_vals <- item_data$Difference[item_data$TimePerspective == "Past"] - fut_vals <- item_data$Difference[item_data$TimePerspective == "Future"] - - if (length(past_vals) > 1 && length(fut_vals) > 1) { - t_test <- t.test(past_vals, fut_vals, paired = TRUE) - cat("Fallback paired t-test for", domain_item, ":\n") - cat("t =", round(t_test$statistic, 3), - ", df =", t_test$parameter, - ", p =", round(t_test$p.value, 5), "\n") - - item_results[[domain_item]] <- t_test - } - }) -} - -cat("\n", "="*80, "\n") -cat("ANALYSIS COMPLETE!\n") -cat("="*80, "\n") -cat("Summary:\n") -cat("- Total domains analyzed:", length(unique(long_data_clean$Domain_Item)), "\n") -cat("- Domain types analyzed:", length(unique(long_data_clean$Domain_Type)), "\n") -cat("- Individual item analyses completed:", length(item_results), "\n") diff --git a/.history/mixed anova_20250912112334.r b/.history/mixed anova_20250912112334.r deleted file mode 100644 index 83e7818..0000000 --- a/.history/mixed anova_20250912112334.r +++ /dev/null @@ -1,347 +0,0 @@ -# Mixed ANOVA Analysis for Past vs Future Differences -# EOHI Experiment Data Analysis - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO) - -# STEP 1: PROPER DATA RESHAPING -# Define domains with their categories -domain_info <- data.frame( - domain = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"), - domain_type = c(rep("Preferences", 5), - rep("Personality", 5), - rep("Values", 5), - rep("Life_Satisfaction", 5)), - 
diff --git a/.history/mixed anova_20250912112620.r b/.history/mixed anova_20250912112620.r
deleted file mode 100644
index 63b2b54..0000000
--- a/.history/mixed anova_20250912112620.r
+++ /dev/null
@@ -1,349 +0,0 @@
diff --git a/.history/mixed anova_20250912112624.r b/.history/mixed anova_20250912112624.r
deleted file mode 100644
index e5bb71b..0000000
--- a/.history/mixed anova_20250912112624.r
+++ /dev/null
@@ -1,349 +0,0 @@
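A smaller trap visible across the snapshots: cat() writes a factor's underlying integer codes rather than its labels, so cat("Domain types:", unique(long_data$Domain_Type), "\n") prints numbers once Domain_Type has been converted with as.factor(). The paste(..., collapse = ", ") form that appears in the later snapshots avoids this by coercing to character first; an explicit sketch:

# as.character() makes the coercion visible before cat() sees the values.
cat("Domain types:",
    paste(unique(as.character(long_data$Domain_Type)), collapse = ", "), "\n")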
domains analyzed:", length(unique(long_data_clean$Domain_Item)), "\n") -cat("- Domain types analyzed:", length(unique(long_data_clean$Domain_Type)), "\n") -cat("- Individual item analyses completed:", length(item_results), "\n") diff --git a/.history/mixed anova_20250912112628.r b/.history/mixed anova_20250912112628.r deleted file mode 100644 index 3f9b63f..0000000 --- a/.history/mixed anova_20250912112628.r +++ /dev/null @@ -1,349 +0,0 @@ -# Mixed ANOVA Analysis for Past vs Future Differences -# EOHI Experiment Data Analysis - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO) - -# STEP 1: PROPER DATA RESHAPING -# Define domains with their categories -domain_info <- data.frame( - domain = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"), - domain_type = c(rep("Preferences", 5), - rep("Personality", 5), - rep("Values", 5), - rep("Life_Satisfaction", 5)), - stringsAsFactors = FALSE -) - -# Display domain_info -cat("\nDomain Information:\n") -print(domain_info) -cat("\nDomain type summary:\n") -print(table(domain_info$domain_type)) - -# Function to reshape ALL domains at once with domain information -reshape_all_domains <- function(data, domain_info) { - all_long_data <- data.frame() - - for (i in 1:nrow(domain_info)) { - domain_name <- domain_info$domain[i] - domain_type <- domain_info$domain_type[i] - - past_col <- paste0("NPastDiff_", domain_name) - fut_col <- paste0("NFutDiff_", domain_name) - - # Check if columns exist - if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) { - cat("Warning: Columns", past_col, "or", fut_col, "not found\n") - next - } - - # Create long format data for this domain - past_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(past_col)) %>% - mutate( - TimePerspective = "Past", - Difference = .data[[past_col]], - Domain_Type = domain_type, # e.g., "Preferences" - Domain_Item = domain_name # e.g., "pref_read" - ) %>% - select(-all_of(past_col)) - - fut_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(fut_col)) %>% - mutate( - TimePerspective = "Future", - Difference = .data[[fut_col]], - Domain_Type = domain_type, - Domain_Item = domain_name - ) %>% - select(-all_of(fut_col)) - - # Combine past and future data for this domain - domain_long_data <- rbind(past_data, fut_data) - all_long_data <- rbind(all_long_data, domain_long_data) - } - - # Convert to factors - all_long_data <- all_long_data %>% - mutate( - TimePerspective = as.factor(TimePerspective), - Domain_Type = as.factor(Domain_Type), - Domain_Item = as.factor(Domain_Item), - pID = as.factor(pID) - ) - - return(all_long_data) -} - -# Reshape all data to long format -cat("\nReshaping data to long format...\n") -long_data <- reshape_all_domains(data, 
domain_info) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique domains:", length(unique(long_data$Domain_Item)), "\n") -cat("Domain types:", paste(unique(long_data$Domain_Type), collapse = ", "), "\n") -cat("Domain type counts:\n") -print(table(long_data$Domain_Type)) - -# STEP 2: ASSUMPTION CHECKING -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 2: CHECKING ASSUMPTIONS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 2.1 Check for missing values -missing_summary <- long_data %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(Difference)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("\nMissing values by domain and time perspective:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$Difference), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\nChecking for outliers...\n") -outlier_summary <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(Difference < lower_bound | Difference > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\nTesting normality...\n") -normality_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(Difference)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(Difference)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\nTesting homogeneity of variance...\n") -homogeneity_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item) %>% - summarise( - levene_p = leveneTest(Difference ~ TimePerspective)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance results:\n") -print(homogeneity_results) - -# STEP 3: DESCRIPTIVE STATISTICS -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -desc_stats <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - median = median(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - min = min(Difference), - max = max(Difference), - .groups = 'drop' - ) - -cat("Descriptive statistics:\n") -print(desc_stats) - -# STEP 4: MIXED ANOVA ANALYSES -cat("\n", "="*80, "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat("="*80, "\n") - -# 4.1 Overall analysis across all domains -cat("\n4.1 Overall Mixed ANOVA (all domains combined):\n") -cat("-" * 50, "\n") - -tryCatch({ - overall_anova <- ezANOVA( - data = long_data_clean, - dv = Difference, - wid = pID, 
- within = c(TimePerspective, Domain_Type), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Overall ANOVA Results:\n") - print(overall_anova) - - # Check sphericity - if (!is.null(overall_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(overall_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in overall ANOVA:", e$message, "\n") -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat("-" * 50, "\n") - -domain_results <- list() - -for (domain_type in unique(long_data_clean$Domain_Type)) { - cat("\nAnalyzing domain type:", domain_type, "\n") - - domain_data <- long_data_clean[long_data_clean$Domain_Type == domain_type, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Item), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_type, ":\n") - print(domain_anova) - - domain_results[[domain_type]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_type, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain_type]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Individual domain item analyses -cat("\n4.3 Individual Domain Item Analyses:\n") -cat("-" * 50, "\n") - -item_results <- list() - -for (domain_item in unique(long_data_clean$Domain_Item)) { - cat("\nAnalyzing individual item:", domain_item, "\n") - - item_data <- long_data_clean[long_data_clean$Domain_Item == domain_item, ] - - tryCatch({ - item_anova <- ezANOVA( - data = item_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_item, ":\n") - print(item_anova) - - item_results[[domain_item]] <- item_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_item, ":", e$message, "\n") - - # Fallback to paired t-test - past_vals <- item_data$Difference[item_data$TimePerspective == "Past"] - fut_vals <- item_data$Difference[item_data$TimePerspective == "Future"] - - if (length(past_vals) > 1 && length(fut_vals) > 1) { - t_test <- t.test(past_vals, fut_vals, paired = TRUE) - cat("Fallback paired t-test for", domain_item, ":\n") - cat("t =", round(t_test$statistic, 3), - ", df =", t_test$parameter, - ", p =", round(t_test$p.value, 5), "\n") - - item_results[[domain_item]] <- t_test - } - }) -} - -cat("\n", "="*80, "\n") -cat("ANALYSIS COMPLETE!\n") -cat("="*80, "\n") -cat("Summary:\n") -cat("- Total domains analyzed:", length(unique(long_data_clean$Domain_Item)), "\n") -cat("- Domain types analyzed:", length(unique(long_data_clean$Domain_Type)), "\n") -cat("- Individual item analyses completed:", length(item_results), "\n") diff --git a/.history/mixed anova_20250912112631.r b/.history/mixed anova_20250912112631.r deleted file mode 100644 index f87b7a5..0000000 --- a/.history/mixed anova_20250912112631.r +++ /dev/null @@ -1,349 +0,0 @@ -# Mixed ANOVA Analysis for Past vs Future Differences -# EOHI Experiment Data Analysis - -# Load required libraries 
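# Side note (not part of the deleted snapshot): several snapshots below still use the
# Python-style `"="*80` / `"-" * 50`, which errors in R with "non-numeric argument to
# binary operator". A minimal sketch of two working equivalents (strrep() needs base
# R >= 3.3.0):
rule_eq   <- paste(rep("=", 80), collapse = "")  # the idiom the later snapshots adopt
rule_dash <- strrep("-", 50)                     # shorter base-R alternative
cat("\n", rule_eq, "\n")                         # prints an 80-character rule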
-library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO) - -# STEP 1: PROPER DATA RESHAPING -# Define domains with their categories -domain_info <- data.frame( - domain = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"), - domain_type = c(rep("Preferences", 5), - rep("Personality", 5), - rep("Values", 5), - rep("Life_Satisfaction", 5)), - stringsAsFactors = FALSE -) - -# Display domain_info -cat("\nDomain Information:\n") -print(domain_info) -cat("\nDomain type summary:\n") -print(table(domain_info$domain_type)) - -# Function to reshape ALL domains at once with domain information -reshape_all_domains <- function(data, domain_info) { - all_long_data <- data.frame() - - for (i in 1:nrow(domain_info)) { - domain_name <- domain_info$domain[i] - domain_type <- domain_info$domain_type[i] - - past_col <- paste0("NPastDiff_", domain_name) - fut_col <- paste0("NFutDiff_", domain_name) - - # Check if columns exist - if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) { - cat("Warning: Columns", past_col, "or", fut_col, "not found\n") - next - } - - # Create long format data for this domain - past_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(past_col)) %>% - mutate( - TimePerspective = "Past", - Difference = .data[[past_col]], - Domain_Type = domain_type, # e.g., "Preferences" - Domain_Item = domain_name # e.g., "pref_read" - ) %>% - select(-all_of(past_col)) - - fut_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(fut_col)) %>% - mutate( - TimePerspective = "Future", - Difference = .data[[fut_col]], - Domain_Type = domain_type, - Domain_Item = domain_name - ) %>% - select(-all_of(fut_col)) - - # Combine past and future data for this domain - domain_long_data <- rbind(past_data, fut_data) - all_long_data <- rbind(all_long_data, domain_long_data) - } - - # Convert to factors - all_long_data <- all_long_data %>% - mutate( - TimePerspective = as.factor(TimePerspective), - Domain_Type = as.factor(Domain_Type), - Domain_Item = as.factor(Domain_Item), - pID = as.factor(pID) - ) - - return(all_long_data) -} - -# Reshape all data to long format -cat("\nReshaping data to long format...\n") -long_data <- reshape_all_domains(data, domain_info) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique domains:", length(unique(long_data$Domain_Item)), "\n") -cat("Domain types:", paste(unique(long_data$Domain_Type), collapse = ", "), "\n") -cat("Domain type counts:\n") -print(table(long_data$Domain_Type)) - -# STEP 2: ASSUMPTION CHECKING -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 2: CHECKING ASSUMPTIONS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 2.1 Check for missing values -missing_summary <- long_data %>% - group_by(Domain_Type, 
Domain_Item, TimePerspective) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(Difference)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("\nMissing values by domain and time perspective:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$Difference), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\nChecking for outliers...\n") -outlier_summary <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(Difference < lower_bound | Difference > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\nTesting normality...\n") -normality_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(Difference)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(Difference)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\nTesting homogeneity of variance...\n") -homogeneity_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item) %>% - summarise( - levene_p = leveneTest(Difference ~ TimePerspective)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance results:\n") -print(homogeneity_results) - -# STEP 3: DESCRIPTIVE STATISTICS -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -desc_stats <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - median = median(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - min = min(Difference), - max = max(Difference), - .groups = 'drop' - ) - -cat("Descriptive statistics:\n") -print(desc_stats) - -# STEP 4: MIXED ANOVA ANALYSES -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Overall analysis across all domains -cat("\n4.1 Overall Mixed ANOVA (all domains combined):\n") -cat("-" * 50, "\n") - -tryCatch({ - overall_anova <- ezANOVA( - data = long_data_clean, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Type), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Overall ANOVA Results:\n") - print(overall_anova) - - # Check sphericity - if (!is.null(overall_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(overall_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in overall ANOVA:", e$message, "\n") -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 
Domain-specific Mixed ANOVAs:\n") -cat("-" * 50, "\n") - -domain_results <- list() - -for (domain_type in unique(long_data_clean$Domain_Type)) { - cat("\nAnalyzing domain type:", domain_type, "\n") - - domain_data <- long_data_clean[long_data_clean$Domain_Type == domain_type, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Item), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_type, ":\n") - print(domain_anova) - - domain_results[[domain_type]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_type, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain_type]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Individual domain item analyses -cat("\n4.3 Individual Domain Item Analyses:\n") -cat("-" * 50, "\n") - -item_results <- list() - -for (domain_item in unique(long_data_clean$Domain_Item)) { - cat("\nAnalyzing individual item:", domain_item, "\n") - - item_data <- long_data_clean[long_data_clean$Domain_Item == domain_item, ] - - tryCatch({ - item_anova <- ezANOVA( - data = item_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_item, ":\n") - print(item_anova) - - item_results[[domain_item]] <- item_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_item, ":", e$message, "\n") - - # Fallback to paired t-test - past_vals <- item_data$Difference[item_data$TimePerspective == "Past"] - fut_vals <- item_data$Difference[item_data$TimePerspective == "Future"] - - if (length(past_vals) > 1 && length(fut_vals) > 1) { - t_test <- t.test(past_vals, fut_vals, paired = TRUE) - cat("Fallback paired t-test for", domain_item, ":\n") - cat("t =", round(t_test$statistic, 3), - ", df =", t_test$parameter, - ", p =", round(t_test$p.value, 5), "\n") - - item_results[[domain_item]] <- t_test - } - }) -} - -cat("\n", "="*80, "\n") -cat("ANALYSIS COMPLETE!\n") -cat("="*80, "\n") -cat("Summary:\n") -cat("- Total domains analyzed:", length(unique(long_data_clean$Domain_Item)), "\n") -cat("- Domain types analyzed:", length(unique(long_data_clean$Domain_Type)), "\n") -cat("- Individual item analyses completed:", length(item_results), "\n") diff --git a/.history/mixed anova_20250912112632.r b/.history/mixed anova_20250912112632.r deleted file mode 100644 index f87b7a5..0000000 --- a/.history/mixed anova_20250912112632.r +++ /dev/null @@ -1,349 +0,0 @@ -# Mixed ANOVA Analysis for Past vs Future Differences -# EOHI Experiment Data Analysis - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO) - -# STEP 1: PROPER DATA RESHAPING -# Define domains with their categories 
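# Side note (not part of the deleted snapshot): reshape_all_domains() below builds the
# long table by rbind()-ing one domain at a time. A hedged alternative sketch, assuming
# the NPastDiff_*/NFutDiff_* column naming and the domain_info lookup table defined just
# below; all non-difference columns (pID, GROUP, ...) are carried along as identifiers:
reshape_via_pivot <- function(data, domain_info) {
  data %>%
    tidyr::pivot_longer(
      cols = matches("^N(Past|Fut)Diff_"),
      names_to = c("TimePerspective", "Domain_Item"),
      names_pattern = "^N(Past|Fut)Diff_(.*)$",
      values_to = "Difference"
    ) %>%
    mutate(
      TimePerspective = factor(ifelse(TimePerspective == "Past", "Past", "Future")),
      Domain_Type = factor(domain_info$domain_type[match(Domain_Item, domain_info$domain)]),
      Domain_Item = factor(Domain_Item),
      pID = factor(pID)
    )
}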
-domain_info <- data.frame( - domain = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"), - domain_type = c(rep("Preferences", 5), - rep("Personality", 5), - rep("Values", 5), - rep("Life_Satisfaction", 5)), - stringsAsFactors = FALSE -) - -# Display domain_info -cat("\nDomain Information:\n") -print(domain_info) -cat("\nDomain type summary:\n") -print(table(domain_info$domain_type)) - -# Function to reshape ALL domains at once with domain information -reshape_all_domains <- function(data, domain_info) { - all_long_data <- data.frame() - - for (i in 1:nrow(domain_info)) { - domain_name <- domain_info$domain[i] - domain_type <- domain_info$domain_type[i] - - past_col <- paste0("NPastDiff_", domain_name) - fut_col <- paste0("NFutDiff_", domain_name) - - # Check if columns exist - if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) { - cat("Warning: Columns", past_col, "or", fut_col, "not found\n") - next - } - - # Create long format data for this domain - past_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(past_col)) %>% - mutate( - TimePerspective = "Past", - Difference = .data[[past_col]], - Domain_Type = domain_type, # e.g., "Preferences" - Domain_Item = domain_name # e.g., "pref_read" - ) %>% - select(-all_of(past_col)) - - fut_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(fut_col)) %>% - mutate( - TimePerspective = "Future", - Difference = .data[[fut_col]], - Domain_Type = domain_type, - Domain_Item = domain_name - ) %>% - select(-all_of(fut_col)) - - # Combine past and future data for this domain - domain_long_data <- rbind(past_data, fut_data) - all_long_data <- rbind(all_long_data, domain_long_data) - } - - # Convert to factors - all_long_data <- all_long_data %>% - mutate( - TimePerspective = as.factor(TimePerspective), - Domain_Type = as.factor(Domain_Type), - Domain_Item = as.factor(Domain_Item), - pID = as.factor(pID) - ) - - return(all_long_data) -} - -# Reshape all data to long format -cat("\nReshaping data to long format...\n") -long_data <- reshape_all_domains(data, domain_info) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique domains:", length(unique(long_data$Domain_Item)), "\n") -cat("Domain types:", paste(unique(long_data$Domain_Type), collapse = ", "), "\n") -cat("Domain type counts:\n") -print(table(long_data$Domain_Type)) - -# STEP 2: ASSUMPTION CHECKING -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 2: CHECKING ASSUMPTIONS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 2.1 Check for missing values -missing_summary <- long_data %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(Difference)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("\nMissing values by domain and time perspective:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$Difference), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\nChecking for outliers...\n") 
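# Side note (not part of the deleted snapshot): the 1.5 * IQR fences computed below,
# demonstrated on a toy vector so the rule can be checked by hand (sketch only; the
# values are made up):
x <- c(1, 2, 2, 3, 3, 3, 4, 50)
q <- quantile(x, c(0.25, 0.75))          # 2.00 and 3.25 under R's default type-7 quantiles
fence <- 1.5 * diff(q)                   # 1.875
x[x < q[1] - fence | x > q[2] + fence]   # flags only the extreme value, 50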
-outlier_summary <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(Difference < lower_bound | Difference > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\nTesting normality...\n") -normality_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(Difference)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(Difference)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\nTesting homogeneity of variance...\n") -homogeneity_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item) %>% - summarise( - levene_p = leveneTest(Difference ~ TimePerspective)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance results:\n") -print(homogeneity_results) - -# STEP 3: DESCRIPTIVE STATISTICS -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -desc_stats <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - median = median(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - min = min(Difference), - max = max(Difference), - .groups = 'drop' - ) - -cat("Descriptive statistics:\n") -print(desc_stats) - -# STEP 4: MIXED ANOVA ANALYSES -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Overall analysis across all domains -cat("\n4.1 Overall Mixed ANOVA (all domains combined):\n") -cat("-" * 50, "\n") - -tryCatch({ - overall_anova <- ezANOVA( - data = long_data_clean, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Type), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Overall ANOVA Results:\n") - print(overall_anova) - - # Check sphericity - if (!is.null(overall_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(overall_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in overall ANOVA:", e$message, "\n") -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat("-" * 50, "\n") - -domain_results <- list() - -for (domain_type in unique(long_data_clean$Domain_Type)) { - cat("\nAnalyzing domain type:", domain_type, "\n") - - domain_data <- long_data_clean[long_data_clean$Domain_Type == domain_type, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Item), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - 
cat("ANOVA results for", domain_type, ":\n") - print(domain_anova) - - domain_results[[domain_type]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_type, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain_type]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Individual domain item analyses -cat("\n4.3 Individual Domain Item Analyses:\n") -cat("-" * 50, "\n") - -item_results <- list() - -for (domain_item in unique(long_data_clean$Domain_Item)) { - cat("\nAnalyzing individual item:", domain_item, "\n") - - item_data <- long_data_clean[long_data_clean$Domain_Item == domain_item, ] - - tryCatch({ - item_anova <- ezANOVA( - data = item_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_item, ":\n") - print(item_anova) - - item_results[[domain_item]] <- item_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_item, ":", e$message, "\n") - - # Fallback to paired t-test - past_vals <- item_data$Difference[item_data$TimePerspective == "Past"] - fut_vals <- item_data$Difference[item_data$TimePerspective == "Future"] - - if (length(past_vals) > 1 && length(fut_vals) > 1) { - t_test <- t.test(past_vals, fut_vals, paired = TRUE) - cat("Fallback paired t-test for", domain_item, ":\n") - cat("t =", round(t_test$statistic, 3), - ", df =", t_test$parameter, - ", p =", round(t_test$p.value, 5), "\n") - - item_results[[domain_item]] <- t_test - } - }) -} - -cat("\n", "="*80, "\n") -cat("ANALYSIS COMPLETE!\n") -cat("="*80, "\n") -cat("Summary:\n") -cat("- Total domains analyzed:", length(unique(long_data_clean$Domain_Item)), "\n") -cat("- Domain types analyzed:", length(unique(long_data_clean$Domain_Type)), "\n") -cat("- Individual item analyses completed:", length(item_results), "\n") diff --git a/.history/mixed anova_20250912112635.r b/.history/mixed anova_20250912112635.r deleted file mode 100644 index 9594e48..0000000 --- a/.history/mixed anova_20250912112635.r +++ /dev/null @@ -1,349 +0,0 @@ -# Mixed ANOVA Analysis for Past vs Future Differences -# EOHI Experiment Data Analysis - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO) - -# STEP 1: PROPER DATA RESHAPING -# Define domains with their categories -domain_info <- data.frame( - domain = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"), - domain_type = c(rep("Preferences", 5), - rep("Personality", 5), - rep("Values", 5), - rep("Life_Satisfaction", 5)), - stringsAsFactors = 
FALSE -) - -# Display domain_info -cat("\nDomain Information:\n") -print(domain_info) -cat("\nDomain type summary:\n") -print(table(domain_info$domain_type)) - -# Function to reshape ALL domains at once with domain information -reshape_all_domains <- function(data, domain_info) { - all_long_data <- data.frame() - - for (i in 1:nrow(domain_info)) { - domain_name <- domain_info$domain[i] - domain_type <- domain_info$domain_type[i] - - past_col <- paste0("NPastDiff_", domain_name) - fut_col <- paste0("NFutDiff_", domain_name) - - # Check if columns exist - if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) { - cat("Warning: Columns", past_col, "or", fut_col, "not found\n") - next - } - - # Create long format data for this domain - past_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(past_col)) %>% - mutate( - TimePerspective = "Past", - Difference = .data[[past_col]], - Domain_Type = domain_type, # e.g., "Preferences" - Domain_Item = domain_name # e.g., "pref_read" - ) %>% - select(-all_of(past_col)) - - fut_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(fut_col)) %>% - mutate( - TimePerspective = "Future", - Difference = .data[[fut_col]], - Domain_Type = domain_type, - Domain_Item = domain_name - ) %>% - select(-all_of(fut_col)) - - # Combine past and future data for this domain - domain_long_data <- rbind(past_data, fut_data) - all_long_data <- rbind(all_long_data, domain_long_data) - } - - # Convert to factors - all_long_data <- all_long_data %>% - mutate( - TimePerspective = as.factor(TimePerspective), - Domain_Type = as.factor(Domain_Type), - Domain_Item = as.factor(Domain_Item), - pID = as.factor(pID) - ) - - return(all_long_data) -} - -# Reshape all data to long format -cat("\nReshaping data to long format...\n") -long_data <- reshape_all_domains(data, domain_info) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique domains:", length(unique(long_data$Domain_Item)), "\n") -cat("Domain types:", paste(unique(long_data$Domain_Type), collapse = ", "), "\n") -cat("Domain type counts:\n") -print(table(long_data$Domain_Type)) - -# STEP 2: ASSUMPTION CHECKING -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 2: CHECKING ASSUMPTIONS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 2.1 Check for missing values -missing_summary <- long_data %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(Difference)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("\nMissing values by domain and time perspective:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$Difference), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\nChecking for outliers...\n") -outlier_summary <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(Difference < lower_bound | Difference > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality 
tests -cat("\nTesting normality...\n") -normality_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(Difference)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(Difference)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\nTesting homogeneity of variance...\n") -homogeneity_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item) %>% - summarise( - levene_p = leveneTest(Difference ~ TimePerspective)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance results:\n") -print(homogeneity_results) - -# STEP 3: DESCRIPTIVE STATISTICS -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -desc_stats <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - median = median(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - min = min(Difference), - max = max(Difference), - .groups = 'drop' - ) - -cat("Descriptive statistics:\n") -print(desc_stats) - -# STEP 4: MIXED ANOVA ANALYSES -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Overall analysis across all domains -cat("\n4.1 Overall Mixed ANOVA (all domains combined):\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - overall_anova <- ezANOVA( - data = long_data_clean, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Type), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Overall ANOVA Results:\n") - print(overall_anova) - - # Check sphericity - if (!is.null(overall_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(overall_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in overall ANOVA:", e$message, "\n") -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat("-" * 50, "\n") - -domain_results <- list() - -for (domain_type in unique(long_data_clean$Domain_Type)) { - cat("\nAnalyzing domain type:", domain_type, "\n") - - domain_data <- long_data_clean[long_data_clean$Domain_Type == domain_type, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Item), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_type, ":\n") - print(domain_anova) - - domain_results[[domain_type]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_type, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = 
TRUE - ) - print(simple_anova) - domain_results[[domain_type]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Individual domain item analyses -cat("\n4.3 Individual Domain Item Analyses:\n") -cat("-" * 50, "\n") - -item_results <- list() - -for (domain_item in unique(long_data_clean$Domain_Item)) { - cat("\nAnalyzing individual item:", domain_item, "\n") - - item_data <- long_data_clean[long_data_clean$Domain_Item == domain_item, ] - - tryCatch({ - item_anova <- ezANOVA( - data = item_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_item, ":\n") - print(item_anova) - - item_results[[domain_item]] <- item_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_item, ":", e$message, "\n") - - # Fallback to paired t-test - past_vals <- item_data$Difference[item_data$TimePerspective == "Past"] - fut_vals <- item_data$Difference[item_data$TimePerspective == "Future"] - - if (length(past_vals) > 1 && length(fut_vals) > 1) { - t_test <- t.test(past_vals, fut_vals, paired = TRUE) - cat("Fallback paired t-test for", domain_item, ":\n") - cat("t =", round(t_test$statistic, 3), - ", df =", t_test$parameter, - ", p =", round(t_test$p.value, 5), "\n") - - item_results[[domain_item]] <- t_test - } - }) -} - -cat("\n", "="*80, "\n") -cat("ANALYSIS COMPLETE!\n") -cat("="*80, "\n") -cat("Summary:\n") -cat("- Total domains analyzed:", length(unique(long_data_clean$Domain_Item)), "\n") -cat("- Domain types analyzed:", length(unique(long_data_clean$Domain_Type)), "\n") -cat("- Individual item analyses completed:", length(item_results), "\n") diff --git a/.history/mixed anova_20250912112638.r b/.history/mixed anova_20250912112638.r deleted file mode 100644 index 31c3881..0000000 --- a/.history/mixed anova_20250912112638.r +++ /dev/null @@ -1,349 +0,0 @@ -# Mixed ANOVA Analysis for Past vs Future Differences -# EOHI Experiment Data Analysis - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO) - -# STEP 1: PROPER DATA RESHAPING -# Define domains with their categories -domain_info <- data.frame( - domain = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"), - domain_type = c(rep("Preferences", 5), - rep("Personality", 5), - rep("Values", 5), - rep("Life_Satisfaction", 5)), - stringsAsFactors = FALSE -) - -# Display domain_info -cat("\nDomain Information:\n") -print(domain_info) -cat("\nDomain type summary:\n") -print(table(domain_info$domain_type)) - -# Function to reshape ALL domains at once with domain information -reshape_all_domains <- function(data, domain_info) { - all_long_data <- data.frame() - - for (i in 1:nrow(domain_info)) { - domain_name <- domain_info$domain[i] - domain_type <- domain_info$domain_type[i] - - past_col <- paste0("NPastDiff_", 
domain_name) - fut_col <- paste0("NFutDiff_", domain_name) - - # Check if columns exist - if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) { - cat("Warning: Columns", past_col, "or", fut_col, "not found\n") - next - } - - # Create long format data for this domain - past_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(past_col)) %>% - mutate( - TimePerspective = "Past", - Difference = .data[[past_col]], - Domain_Type = domain_type, # e.g., "Preferences" - Domain_Item = domain_name # e.g., "pref_read" - ) %>% - select(-all_of(past_col)) - - fut_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(fut_col)) %>% - mutate( - TimePerspective = "Future", - Difference = .data[[fut_col]], - Domain_Type = domain_type, - Domain_Item = domain_name - ) %>% - select(-all_of(fut_col)) - - # Combine past and future data for this domain - domain_long_data <- rbind(past_data, fut_data) - all_long_data <- rbind(all_long_data, domain_long_data) - } - - # Convert to factors - all_long_data <- all_long_data %>% - mutate( - TimePerspective = as.factor(TimePerspective), - Domain_Type = as.factor(Domain_Type), - Domain_Item = as.factor(Domain_Item), - pID = as.factor(pID) - ) - - return(all_long_data) -} - -# Reshape all data to long format -cat("\nReshaping data to long format...\n") -long_data <- reshape_all_domains(data, domain_info) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique domains:", length(unique(long_data$Domain_Item)), "\n") -cat("Domain types:", paste(unique(long_data$Domain_Type), collapse = ", "), "\n") -cat("Domain type counts:\n") -print(table(long_data$Domain_Type)) - -# STEP 2: ASSUMPTION CHECKING -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 2: CHECKING ASSUMPTIONS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 2.1 Check for missing values -missing_summary <- long_data %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(Difference)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("\nMissing values by domain and time perspective:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$Difference), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\nChecking for outliers...\n") -outlier_summary <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(Difference < lower_bound | Difference > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\nTesting normality...\n") -normality_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(Difference)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(Difference)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - 
!is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\nTesting homogeneity of variance...\n") -homogeneity_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item) %>% - summarise( - levene_p = leveneTest(Difference ~ TimePerspective)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance results:\n") -print(homogeneity_results) - -# STEP 3: DESCRIPTIVE STATISTICS -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -desc_stats <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - median = median(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - min = min(Difference), - max = max(Difference), - .groups = 'drop' - ) - -cat("Descriptive statistics:\n") -print(desc_stats) - -# STEP 4: MIXED ANOVA ANALYSES -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Overall analysis across all domains -cat("\n4.1 Overall Mixed ANOVA (all domains combined):\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - overall_anova <- ezANOVA( - data = long_data_clean, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Type), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Overall ANOVA Results:\n") - print(overall_anova) - - # Check sphericity - if (!is.null(overall_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(overall_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in overall ANOVA:", e$message, "\n") -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -domain_results <- list() - -for (domain_type in unique(long_data_clean$Domain_Type)) { - cat("\nAnalyzing domain type:", domain_type, "\n") - - domain_data <- long_data_clean[long_data_clean$Domain_Type == domain_type, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Item), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_type, ":\n") - print(domain_anova) - - domain_results[[domain_type]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_type, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain_type]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Individual domain item analyses -cat("\n4.3 Individual Domain Item Analyses:\n") -cat("-" * 50, "\n") - -item_results <- list() - -for (domain_item in unique(long_data_clean$Domain_Item)) { - cat("\nAnalyzing individual item:", domain_item, "\n") - - 
item_data <- long_data_clean[long_data_clean$Domain_Item == domain_item, ] - - tryCatch({ - item_anova <- ezANOVA( - data = item_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_item, ":\n") - print(item_anova) - - item_results[[domain_item]] <- item_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_item, ":", e$message, "\n") - - # Fallback to paired t-test - past_vals <- item_data$Difference[item_data$TimePerspective == "Past"] - fut_vals <- item_data$Difference[item_data$TimePerspective == "Future"] - - if (length(past_vals) > 1 && length(fut_vals) > 1) { - t_test <- t.test(past_vals, fut_vals, paired = TRUE) - cat("Fallback paired t-test for", domain_item, ":\n") - cat("t =", round(t_test$statistic, 3), - ", df =", t_test$parameter, - ", p =", round(t_test$p.value, 5), "\n") - - item_results[[domain_item]] <- t_test - } - }) -} - -cat("\n", "="*80, "\n") -cat("ANALYSIS COMPLETE!\n") -cat("="*80, "\n") -cat("Summary:\n") -cat("- Total domains analyzed:", length(unique(long_data_clean$Domain_Item)), "\n") -cat("- Domain types analyzed:", length(unique(long_data_clean$Domain_Type)), "\n") -cat("- Individual item analyses completed:", length(item_results), "\n") diff --git a/.history/mixed anova_20250912112641.r b/.history/mixed anova_20250912112641.r deleted file mode 100644 index e3d2e56..0000000 --- a/.history/mixed anova_20250912112641.r +++ /dev/null @@ -1,349 +0,0 @@ -# Mixed ANOVA Analysis for Past vs Future Differences -# EOHI Experiment Data Analysis - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO) - -# STEP 1: PROPER DATA RESHAPING -# Define domains with their categories -domain_info <- data.frame( - domain = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"), - domain_type = c(rep("Preferences", 5), - rep("Personality", 5), - rep("Values", 5), - rep("Life_Satisfaction", 5)), - stringsAsFactors = FALSE -) - -# Display domain_info -cat("\nDomain Information:\n") -print(domain_info) -cat("\nDomain type summary:\n") -print(table(domain_info$domain_type)) - -# Function to reshape ALL domains at once with domain information -reshape_all_domains <- function(data, domain_info) { - all_long_data <- data.frame() - - for (i in 1:nrow(domain_info)) { - domain_name <- domain_info$domain[i] - domain_type <- domain_info$domain_type[i] - - past_col <- paste0("NPastDiff_", domain_name) - fut_col <- paste0("NFutDiff_", domain_name) - - # Check if columns exist - if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) { - cat("Warning: Columns", past_col, "or", fut_col, "not found\n") - next - } - - # Create long format data for this domain - past_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, 
all_of(past_col)) %>% - mutate( - TimePerspective = "Past", - Difference = .data[[past_col]], - Domain_Type = domain_type, # e.g., "Preferences" - Domain_Item = domain_name # e.g., "pref_read" - ) %>% - select(-all_of(past_col)) - - fut_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(fut_col)) %>% - mutate( - TimePerspective = "Future", - Difference = .data[[fut_col]], - Domain_Type = domain_type, - Domain_Item = domain_name - ) %>% - select(-all_of(fut_col)) - - # Combine past and future data for this domain - domain_long_data <- rbind(past_data, fut_data) - all_long_data <- rbind(all_long_data, domain_long_data) - } - - # Convert to factors - all_long_data <- all_long_data %>% - mutate( - TimePerspective = as.factor(TimePerspective), - Domain_Type = as.factor(Domain_Type), - Domain_Item = as.factor(Domain_Item), - pID = as.factor(pID) - ) - - return(all_long_data) -} - -# Reshape all data to long format -cat("\nReshaping data to long format...\n") -long_data <- reshape_all_domains(data, domain_info) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique domains:", length(unique(long_data$Domain_Item)), "\n") -cat("Domain types:", paste(unique(long_data$Domain_Type), collapse = ", "), "\n") -cat("Domain type counts:\n") -print(table(long_data$Domain_Type)) - -# STEP 2: ASSUMPTION CHECKING -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 2: CHECKING ASSUMPTIONS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 2.1 Check for missing values -missing_summary <- long_data %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(Difference)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("\nMissing values by domain and time perspective:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$Difference), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\nChecking for outliers...\n") -outlier_summary <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(Difference < lower_bound | Difference > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\nTesting normality...\n") -normality_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(Difference)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(Difference)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\nTesting homogeneity of variance...\n") -homogeneity_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item) %>% - summarise( - levene_p = 
leveneTest(Difference ~ TimePerspective)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance results:\n") -print(homogeneity_results) - -# STEP 3: DESCRIPTIVE STATISTICS -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -desc_stats <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - median = median(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - min = min(Difference), - max = max(Difference), - .groups = 'drop' - ) - -cat("Descriptive statistics:\n") -print(desc_stats) - -# STEP 4: MIXED ANOVA ANALYSES -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Overall analysis across all domains -cat("\n4.1 Overall Mixed ANOVA (all domains combined):\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - overall_anova <- ezANOVA( - data = long_data_clean, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Type), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Overall ANOVA Results:\n") - print(overall_anova) - - # Check sphericity - if (!is.null(overall_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(overall_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in overall ANOVA:", e$message, "\n") -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -domain_results <- list() - -for (domain_type in unique(long_data_clean$Domain_Type)) { - cat("\nAnalyzing domain type:", domain_type, "\n") - - domain_data <- long_data_clean[long_data_clean$Domain_Type == domain_type, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Item), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_type, ":\n") - print(domain_anova) - - domain_results[[domain_type]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_type, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain_type]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Individual domain item analyses -cat("\n4.3 Individual Domain Item Analyses:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -item_results <- list() - -for (domain_item in unique(long_data_clean$Domain_Item)) { - cat("\nAnalyzing individual item:", domain_item, "\n") - - item_data <- long_data_clean[long_data_clean$Domain_Item == domain_item, ] - - tryCatch({ - item_anova <- ezANOVA( - data = item_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_item, ":\n") - print(item_anova) - - item_results[[domain_item]] <- item_anova - - }, error = function(e) { - cat("Error 
in ANOVA for", domain_item, ":", e$message, "\n") - - # Fallback to paired t-test - past_vals <- item_data$Difference[item_data$TimePerspective == "Past"] - fut_vals <- item_data$Difference[item_data$TimePerspective == "Future"] - - if (length(past_vals) > 1 && length(fut_vals) > 1) { - t_test <- t.test(past_vals, fut_vals, paired = TRUE) - cat("Fallback paired t-test for", domain_item, ":\n") - cat("t =", round(t_test$statistic, 3), - ", df =", t_test$parameter, - ", p =", round(t_test$p.value, 5), "\n") - - item_results[[domain_item]] <- t_test - } - }) -} - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("ANALYSIS COMPLETE!\n") -cat(paste(rep("=", 80), collapse = ""), "\n") -cat("Summary:\n") -cat("- Total domains analyzed:", length(unique(long_data_clean$Domain_Item)), "\n") -cat("- Domain types analyzed:", length(unique(long_data_clean$Domain_Type)), "\n") -cat("- Individual item analyses completed:", length(item_results), "\n") diff --git a/.history/mixed anova_20250912112645.r b/.history/mixed anova_20250912112645.r deleted file mode 100644 index 47d837c..0000000 --- a/.history/mixed anova_20250912112645.r +++ /dev/null @@ -1,349 +0,0 @@ -# Mixed ANOVA Analysis for Past vs Future Differences -# EOHI Experiment Data Analysis - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO) - -# STEP 1: PROPER DATA RESHAPING -# Define domains with their categories -domain_info <- data.frame( - domain = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"), - domain_type = c(rep("Preferences", 5), - rep("Personality", 5), - rep("Values", 5), - rep("Life_Satisfaction", 5)), - stringsAsFactors = FALSE -) - -# Display domain_info -cat("\nDomain Information:\n") -print(domain_info) -cat("\nDomain type summary:\n") -print(table(domain_info$domain_type)) - -# Function to reshape ALL domains at once with domain information -reshape_all_domains <- function(data, domain_info) { - all_long_data <- data.frame() - - for (i in 1:nrow(domain_info)) { - domain_name <- domain_info$domain[i] - domain_type <- domain_info$domain_type[i] - - past_col <- paste0("NPastDiff_", domain_name) - fut_col <- paste0("NFutDiff_", domain_name) - - # Check if columns exist - if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) { - cat("Warning: Columns", past_col, "or", fut_col, "not found\n") - next - } - - # Create long format data for this domain - past_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(past_col)) %>% - mutate( - TimePerspective = "Past", - Difference = .data[[past_col]], - Domain_Type = domain_type, # e.g., "Preferences" - Domain_Item = domain_name # e.g., "pref_read" - ) %>% - select(-all_of(past_col)) - - fut_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(fut_col)) %>% - mutate( - TimePerspective = 
"Future", - Difference = .data[[fut_col]], - Domain_Type = domain_type, - Domain_Item = domain_name - ) %>% - select(-all_of(fut_col)) - - # Combine past and future data for this domain - domain_long_data <- rbind(past_data, fut_data) - all_long_data <- rbind(all_long_data, domain_long_data) - } - - # Convert to factors - all_long_data <- all_long_data %>% - mutate( - TimePerspective = as.factor(TimePerspective), - Domain_Type = as.factor(Domain_Type), - Domain_Item = as.factor(Domain_Item), - pID = as.factor(pID) - ) - - return(all_long_data) -} - -# Reshape all data to long format -cat("\nReshaping data to long format...\n") -long_data <- reshape_all_domains(data, domain_info) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique domains:", length(unique(long_data$Domain_Item)), "\n") -cat("Domain types:", paste(unique(long_data$Domain_Type), collapse = ", "), "\n") -cat("Domain type counts:\n") -print(table(long_data$Domain_Type)) - -# STEP 2: ASSUMPTION CHECKING -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 2: CHECKING ASSUMPTIONS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 2.1 Check for missing values -missing_summary <- long_data %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(Difference)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("\nMissing values by domain and time perspective:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$Difference), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\nChecking for outliers...\n") -outlier_summary <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(Difference < lower_bound | Difference > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\nTesting normality...\n") -normality_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(Difference)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(Difference)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\nTesting homogeneity of variance...\n") -homogeneity_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item) %>% - summarise( - levene_p = leveneTest(Difference ~ TimePerspective)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance results:\n") -print(homogeneity_results) - -# STEP 3: DESCRIPTIVE STATISTICS -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -desc_stats <- long_data_clean %>% - group_by(Domain_Type, 
Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - median = median(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - min = min(Difference), - max = max(Difference), - .groups = 'drop' - ) - -cat("Descriptive statistics:\n") -print(desc_stats) - -# STEP 4: MIXED ANOVA ANALYSES -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Overall analysis across all domains -cat("\n4.1 Overall Mixed ANOVA (all domains combined):\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - overall_anova <- ezANOVA( - data = long_data_clean, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Type), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Overall ANOVA Results:\n") - print(overall_anova) - - # Check sphericity - if (!is.null(overall_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(overall_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in overall ANOVA:", e$message, "\n") -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -domain_results <- list() - -for (domain_type in unique(long_data_clean$Domain_Type)) { - cat("\nAnalyzing domain type:", domain_type, "\n") - - domain_data <- long_data_clean[long_data_clean$Domain_Type == domain_type, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Item), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_type, ":\n") - print(domain_anova) - - domain_results[[domain_type]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_type, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain_type]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Individual domain item analyses -cat("\n4.3 Individual Domain Item Analyses:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -item_results <- list() - -for (domain_item in unique(long_data_clean$Domain_Item)) { - cat("\nAnalyzing individual item:", domain_item, "\n") - - item_data <- long_data_clean[long_data_clean$Domain_Item == domain_item, ] - - tryCatch({ - item_anova <- ezANOVA( - data = item_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_item, ":\n") - print(item_anova) - - item_results[[domain_item]] <- item_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_item, ":", e$message, "\n") - - # Fallback to paired t-test - past_vals <- item_data$Difference[item_data$TimePerspective == "Past"] - fut_vals <- item_data$Difference[item_data$TimePerspective == "Future"] - - if (length(past_vals) > 1 && length(fut_vals) > 1) { - t_test <- t.test(past_vals, fut_vals, paired = TRUE) - cat("Fallback paired t-test for", domain_item, ":\n") - cat("t =", 
round(t_test$statistic, 3), - ", df =", t_test$parameter, - ", p =", round(t_test$p.value, 5), "\n") - - item_results[[domain_item]] <- t_test - } - }) -} - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("ANALYSIS COMPLETE!\n") -cat(paste(rep("=", 80), collapse = ""), "\n") -cat("Summary:\n") -cat("- Total domains analyzed:", length(unique(long_data_clean$Domain_Item)), "\n") -cat("- Domain types analyzed:", length(unique(long_data_clean$Domain_Type)), "\n") -cat("- Individual item analyses completed:", length(item_results), "\n") diff --git a/.history/mixed anova_20250912112651.r b/.history/mixed anova_20250912112651.r deleted file mode 100644 index 47d837c..0000000 --- a/.history/mixed anova_20250912112651.r +++ /dev/null @@ -1,349 +0,0 @@ -# Mixed ANOVA Analysis for Past vs Future Differences -# EOHI Experiment Data Analysis - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO) - -# STEP 1: PROPER DATA RESHAPING -# Define domains with their categories -domain_info <- data.frame( - domain = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"), - domain_type = c(rep("Preferences", 5), - rep("Personality", 5), - rep("Values", 5), - rep("Life_Satisfaction", 5)), - stringsAsFactors = FALSE -) - -# Display domain_info -cat("\nDomain Information:\n") -print(domain_info) -cat("\nDomain type summary:\n") -print(table(domain_info$domain_type)) - -# Function to reshape ALL domains at once with domain information -reshape_all_domains <- function(data, domain_info) { - all_long_data <- data.frame() - - for (i in 1:nrow(domain_info)) { - domain_name <- domain_info$domain[i] - domain_type <- domain_info$domain_type[i] - - past_col <- paste0("NPastDiff_", domain_name) - fut_col <- paste0("NFutDiff_", domain_name) - - # Check if columns exist - if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) { - cat("Warning: Columns", past_col, "or", fut_col, "not found\n") - next - } - - # Create long format data for this domain - past_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(past_col)) %>% - mutate( - TimePerspective = "Past", - Difference = .data[[past_col]], - Domain_Type = domain_type, # e.g., "Preferences" - Domain_Item = domain_name # e.g., "pref_read" - ) %>% - select(-all_of(past_col)) - - fut_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(fut_col)) %>% - mutate( - TimePerspective = "Future", - Difference = .data[[fut_col]], - Domain_Type = domain_type, - Domain_Item = domain_name - ) %>% - select(-all_of(fut_col)) - - # Combine past and future data for this domain - domain_long_data <- rbind(past_data, fut_data) - all_long_data <- rbind(all_long_data, domain_long_data) - } - - # Convert to factors - all_long_data <- all_long_data 
%>% - mutate( - TimePerspective = as.factor(TimePerspective), - Domain_Type = as.factor(Domain_Type), - Domain_Item = as.factor(Domain_Item), - pID = as.factor(pID) - ) - - return(all_long_data) -} - -# Reshape all data to long format -cat("\nReshaping data to long format...\n") -long_data <- reshape_all_domains(data, domain_info) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique domains:", length(unique(long_data$Domain_Item)), "\n") -cat("Domain types:", paste(unique(long_data$Domain_Type), collapse = ", "), "\n") -cat("Domain type counts:\n") -print(table(long_data$Domain_Type)) - -# STEP 2: ASSUMPTION CHECKING -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 2: CHECKING ASSUMPTIONS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 2.1 Check for missing values -missing_summary <- long_data %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(Difference)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("\nMissing values by domain and time perspective:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$Difference), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\nChecking for outliers...\n") -outlier_summary <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(Difference < lower_bound | Difference > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\nTesting normality...\n") -normality_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(Difference)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(Difference)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\nTesting homogeneity of variance...\n") -homogeneity_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item) %>% - summarise( - levene_p = leveneTest(Difference ~ TimePerspective)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance results:\n") -print(homogeneity_results) - -# STEP 3: DESCRIPTIVE STATISTICS -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -desc_stats <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - median = median(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - min = min(Difference), - max = max(Difference), - .groups = 'drop' - ) - -cat("Descriptive statistics:\n") -print(desc_stats) - -# STEP 4: MIXED 
ANOVA ANALYSES -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Overall analysis across all domains -cat("\n4.1 Overall Mixed ANOVA (all domains combined):\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - overall_anova <- ezANOVA( - data = long_data_clean, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Type), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Overall ANOVA Results:\n") - print(overall_anova) - - # Check sphericity - if (!is.null(overall_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(overall_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in overall ANOVA:", e$message, "\n") -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -domain_results <- list() - -for (domain_type in unique(long_data_clean$Domain_Type)) { - cat("\nAnalyzing domain type:", domain_type, "\n") - - domain_data <- long_data_clean[long_data_clean$Domain_Type == domain_type, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Item), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_type, ":\n") - print(domain_anova) - - domain_results[[domain_type]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_type, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain_type]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Individual domain item analyses -cat("\n4.3 Individual Domain Item Analyses:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -item_results <- list() - -for (domain_item in unique(long_data_clean$Domain_Item)) { - cat("\nAnalyzing individual item:", domain_item, "\n") - - item_data <- long_data_clean[long_data_clean$Domain_Item == domain_item, ] - - tryCatch({ - item_anova <- ezANOVA( - data = item_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_item, ":\n") - print(item_anova) - - item_results[[domain_item]] <- item_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_item, ":", e$message, "\n") - - # Fallback to paired t-test - past_vals <- item_data$Difference[item_data$TimePerspective == "Past"] - fut_vals <- item_data$Difference[item_data$TimePerspective == "Future"] - - if (length(past_vals) > 1 && length(fut_vals) > 1) { - t_test <- t.test(past_vals, fut_vals, paired = TRUE) - cat("Fallback paired t-test for", domain_item, ":\n") - cat("t =", round(t_test$statistic, 3), - ", df =", t_test$parameter, - ", p =", round(t_test$p.value, 5), "\n") - - item_results[[domain_item]] <- t_test - } - }) -} - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("ANALYSIS COMPLETE!\n") -cat(paste(rep("=", 80), collapse = ""), "\n") -cat("Summary:\n") -cat("- Total domains analyzed:", 
length(unique(long_data_clean$Domain_Item)), "\n") -cat("- Domain types analyzed:", length(unique(long_data_clean$Domain_Type)), "\n") -cat("- Individual item analyses completed:", length(item_results), "\n") diff --git a/.history/mixed anova_20250912112700.r b/.history/mixed anova_20250912112700.r deleted file mode 100644 index 47d837c..0000000 --- a/.history/mixed anova_20250912112700.r +++ /dev/null @@ -1,349 +0,0 @@ -# Mixed ANOVA Analysis for Past vs Future Differences -# EOHI Experiment Data Analysis - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO) - -# STEP 1: PROPER DATA RESHAPING -# Define domains with their categories -domain_info <- data.frame( - domain = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"), - domain_type = c(rep("Preferences", 5), - rep("Personality", 5), - rep("Values", 5), - rep("Life_Satisfaction", 5)), - stringsAsFactors = FALSE -) - -# Display domain_info -cat("\nDomain Information:\n") -print(domain_info) -cat("\nDomain type summary:\n") -print(table(domain_info$domain_type)) - -# Function to reshape ALL domains at once with domain information -reshape_all_domains <- function(data, domain_info) { - all_long_data <- data.frame() - - for (i in 1:nrow(domain_info)) { - domain_name <- domain_info$domain[i] - domain_type <- domain_info$domain_type[i] - - past_col <- paste0("NPastDiff_", domain_name) - fut_col <- paste0("NFutDiff_", domain_name) - - # Check if columns exist - if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) { - cat("Warning: Columns", past_col, "or", fut_col, "not found\n") - next - } - - # Create long format data for this domain - past_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(past_col)) %>% - mutate( - TimePerspective = "Past", - Difference = .data[[past_col]], - Domain_Type = domain_type, # e.g., "Preferences" - Domain_Item = domain_name # e.g., "pref_read" - ) %>% - select(-all_of(past_col)) - - fut_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(fut_col)) %>% - mutate( - TimePerspective = "Future", - Difference = .data[[fut_col]], - Domain_Type = domain_type, - Domain_Item = domain_name - ) %>% - select(-all_of(fut_col)) - - # Combine past and future data for this domain - domain_long_data <- rbind(past_data, fut_data) - all_long_data <- rbind(all_long_data, domain_long_data) - } - - # Convert to factors - all_long_data <- all_long_data %>% - mutate( - TimePerspective = as.factor(TimePerspective), - Domain_Type = as.factor(Domain_Type), - Domain_Item = as.factor(Domain_Item), - pID = as.factor(pID) - ) - - return(all_long_data) -} - -# Reshape all data to long format -cat("\nReshaping data to long format...\n") -long_data <- reshape_all_domains(data, domain_info) - 
-cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique domains:", length(unique(long_data$Domain_Item)), "\n") -cat("Domain types:", paste(unique(long_data$Domain_Type), collapse = ", "), "\n") -cat("Domain type counts:\n") -print(table(long_data$Domain_Type)) - -# STEP 2: ASSUMPTION CHECKING -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 2: CHECKING ASSUMPTIONS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 2.1 Check for missing values -missing_summary <- long_data %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(Difference)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("\nMissing values by domain and time perspective:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$Difference), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\nChecking for outliers...\n") -outlier_summary <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(Difference < lower_bound | Difference > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\nTesting normality...\n") -normality_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(Difference)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(Difference)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\nTesting homogeneity of variance...\n") -homogeneity_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item) %>% - summarise( - levene_p = leveneTest(Difference ~ TimePerspective)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance results:\n") -print(homogeneity_results) - -# STEP 3: DESCRIPTIVE STATISTICS -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -desc_stats <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - median = median(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - min = min(Difference), - max = max(Difference), - .groups = 'drop' - ) - -cat("Descriptive statistics:\n") -print(desc_stats) - -# STEP 4: MIXED ANOVA ANALYSES -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Overall analysis across all domains -cat("\n4.1 Overall Mixed ANOVA (all domains combined):\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - overall_anova <- 
ezANOVA( - data = long_data_clean, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Type), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Overall ANOVA Results:\n") - print(overall_anova) - - # Check sphericity - if (!is.null(overall_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(overall_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in overall ANOVA:", e$message, "\n") -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -domain_results <- list() - -for (domain_type in unique(long_data_clean$Domain_Type)) { - cat("\nAnalyzing domain type:", domain_type, "\n") - - domain_data <- long_data_clean[long_data_clean$Domain_Type == domain_type, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Item), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_type, ":\n") - print(domain_anova) - - domain_results[[domain_type]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_type, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain_type]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Individual domain item analyses -cat("\n4.3 Individual Domain Item Analyses:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -item_results <- list() - -for (domain_item in unique(long_data_clean$Domain_Item)) { - cat("\nAnalyzing individual item:", domain_item, "\n") - - item_data <- long_data_clean[long_data_clean$Domain_Item == domain_item, ] - - tryCatch({ - item_anova <- ezANOVA( - data = item_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_item, ":\n") - print(item_anova) - - item_results[[domain_item]] <- item_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_item, ":", e$message, "\n") - - # Fallback to paired t-test - past_vals <- item_data$Difference[item_data$TimePerspective == "Past"] - fut_vals <- item_data$Difference[item_data$TimePerspective == "Future"] - - if (length(past_vals) > 1 && length(fut_vals) > 1) { - t_test <- t.test(past_vals, fut_vals, paired = TRUE) - cat("Fallback paired t-test for", domain_item, ":\n") - cat("t =", round(t_test$statistic, 3), - ", df =", t_test$parameter, - ", p =", round(t_test$p.value, 5), "\n") - - item_results[[domain_item]] <- t_test - } - }) -} - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("ANALYSIS COMPLETE!\n") -cat(paste(rep("=", 80), collapse = ""), "\n") -cat("Summary:\n") -cat("- Total domains analyzed:", length(unique(long_data_clean$Domain_Item)), "\n") -cat("- Domain types analyzed:", length(unique(long_data_clean$Domain_Type)), "\n") -cat("- Individual item analyses completed:", length(item_results), "\n") diff --git a/.history/mixed anova_20250912113047.r b/.history/mixed anova_20250912113047.r deleted file mode 100644 index 4e8abe9..0000000 --- 
a/.history/mixed anova_20250912113047.r +++ /dev/null @@ -1,359 +0,0 @@ -# Mixed ANOVA Analysis for Past vs Future Differences -# EOHI Experiment Data Analysis - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO) - -# STEP 1: PROPER DATA RESHAPING -# Define domains with their categories -domain_info <- data.frame( - domain = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"), - domain_type = c(rep("Preferences", 5), - rep("Personality", 5), - rep("Values", 5), - rep("Life_Satisfaction", 5)), - stringsAsFactors = FALSE -) - -# Display domain_info -cat("\nDomain Information:\n") -print(domain_info) -cat("\nDomain type summary:\n") -print(table(domain_info$domain_type)) - -# Function to reshape ALL domains at once with domain information -reshape_all_domains <- function(data, domain_info) { - all_long_data <- data.frame() - - for (i in 1:nrow(domain_info)) { - domain_name <- domain_info$domain[i] - domain_type <- domain_info$domain_type[i] - - past_col <- paste0("NPastDiff_", domain_name) - fut_col <- paste0("NFutDiff_", domain_name) - - # Check if columns exist - if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) { - cat("Warning: Columns", past_col, "or", fut_col, "not found\n") - next - } - - # Create long format data for this domain - past_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(past_col)) %>% - mutate( - TimePerspective = "Past", - Difference = .data[[past_col]], - Domain_Type = domain_type, # e.g., "Preferences" - Domain_Item = domain_name # e.g., "pref_read" - ) %>% - select(-all_of(past_col)) - - fut_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(fut_col)) %>% - mutate( - TimePerspective = "Future", - Difference = .data[[fut_col]], - Domain_Type = domain_type, - Domain_Item = domain_name - ) %>% - select(-all_of(fut_col)) - - # Combine past and future data for this domain - domain_long_data <- rbind(past_data, fut_data) - all_long_data <- rbind(all_long_data, domain_long_data) - } - - # Convert to factors - all_long_data <- all_long_data %>% - mutate( - TimePerspective = as.factor(TimePerspective), - Domain_Type = as.factor(Domain_Type), - Domain_Item = as.factor(Domain_Item), - pID = as.factor(pID) - ) - - return(all_long_data) -} - -# Reshape all data to long format -cat("\nReshaping data to long format...\n") -long_data <- reshape_all_domains(data, domain_info) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique domains:", length(unique(long_data$Domain_Item)), "\n") -cat("Domain types:", paste(unique(long_data$Domain_Type), collapse = ", "), "\n") -cat("Domain type counts:\n") -print(table(long_data$Domain_Type)) - -# Display structure and sample of long_data -cat("\nLong data structure:\n") 
-str(long_data) - -cat("\nFirst 10 rows of long_data:\n") -print(head(long_data, 10)) - -cat("\nColumn names:\n") -print(colnames(long_data)) - -# STEP 2: ASSUMPTION CHECKING -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 2: CHECKING ASSUMPTIONS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 2.1 Check for missing values -missing_summary <- long_data %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(Difference)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("\nMissing values by domain and time perspective:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$Difference), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\nChecking for outliers...\n") -outlier_summary <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(Difference < lower_bound | Difference > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\nTesting normality...\n") -normality_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(Difference)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(Difference)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\nTesting homogeneity of variance...\n") -homogeneity_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item) %>% - summarise( - levene_p = leveneTest(Difference ~ TimePerspective)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance results:\n") -print(homogeneity_results) - -# STEP 3: DESCRIPTIVE STATISTICS -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -desc_stats <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - median = median(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - min = min(Difference), - max = max(Difference), - .groups = 'drop' - ) - -cat("Descriptive statistics:\n") -print(desc_stats) - -# STEP 4: MIXED ANOVA ANALYSES -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Overall analysis across all domains -cat("\n4.1 Overall Mixed ANOVA (all domains combined):\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - overall_anova <- ezANOVA( - data = long_data_clean, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Type), - between = c(GROUP, 
TASK_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Overall ANOVA Results:\n") - print(overall_anova) - - # Check sphericity - if (!is.null(overall_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(overall_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in overall ANOVA:", e$message, "\n") -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -domain_results <- list() - -for (domain_type in unique(long_data_clean$Domain_Type)) { - cat("\nAnalyzing domain type:", domain_type, "\n") - - domain_data <- long_data_clean[long_data_clean$Domain_Type == domain_type, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Item), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_type, ":\n") - print(domain_anova) - - domain_results[[domain_type]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_type, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain_type]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Individual domain item analyses -cat("\n4.3 Individual Domain Item Analyses:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -item_results <- list() - -for (domain_item in unique(long_data_clean$Domain_Item)) { - cat("\nAnalyzing individual item:", domain_item, "\n") - - item_data <- long_data_clean[long_data_clean$Domain_Item == domain_item, ] - - tryCatch({ - item_anova <- ezANOVA( - data = item_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_item, ":\n") - print(item_anova) - - item_results[[domain_item]] <- item_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_item, ":", e$message, "\n") - - # Fallback to paired t-test - past_vals <- item_data$Difference[item_data$TimePerspective == "Past"] - fut_vals <- item_data$Difference[item_data$TimePerspective == "Future"] - - if (length(past_vals) > 1 && length(fut_vals) > 1) { - t_test <- t.test(past_vals, fut_vals, paired = TRUE) - cat("Fallback paired t-test for", domain_item, ":\n") - cat("t =", round(t_test$statistic, 3), - ", df =", t_test$parameter, - ", p =", round(t_test$p.value, 5), "\n") - - item_results[[domain_item]] <- t_test - } - }) -} - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("ANALYSIS COMPLETE!\n") -cat(paste(rep("=", 80), collapse = ""), "\n") -cat("Summary:\n") -cat("- Total domains analyzed:", length(unique(long_data_clean$Domain_Item)), "\n") -cat("- Domain types analyzed:", length(unique(long_data_clean$Domain_Type)), "\n") -cat("- Individual item analyses completed:", length(item_results), "\n") diff --git a/.history/mixed anova_20250912113055.r b/.history/mixed anova_20250912113055.r deleted file mode 100644 index 4e8abe9..0000000 --- a/.history/mixed anova_20250912113055.r +++ /dev/null @@ -1,359 +0,0 @@ -# Mixed ANOVA Analysis for Past vs Future Differences -# EOHI Experiment 
Data Analysis - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO) - -# STEP 1: PROPER DATA RESHAPING -# Define domains with their categories -domain_info <- data.frame( - domain = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"), - domain_type = c(rep("Preferences", 5), - rep("Personality", 5), - rep("Values", 5), - rep("Life_Satisfaction", 5)), - stringsAsFactors = FALSE -) - -# Display domain_info -cat("\nDomain Information:\n") -print(domain_info) -cat("\nDomain type summary:\n") -print(table(domain_info$domain_type)) - -# Function to reshape ALL domains at once with domain information -reshape_all_domains <- function(data, domain_info) { - all_long_data <- data.frame() - - for (i in 1:nrow(domain_info)) { - domain_name <- domain_info$domain[i] - domain_type <- domain_info$domain_type[i] - - past_col <- paste0("NPastDiff_", domain_name) - fut_col <- paste0("NFutDiff_", domain_name) - - # Check if columns exist - if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) { - cat("Warning: Columns", past_col, "or", fut_col, "not found\n") - next - } - - # Create long format data for this domain - past_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(past_col)) %>% - mutate( - TimePerspective = "Past", - Difference = .data[[past_col]], - Domain_Type = domain_type, # e.g., "Preferences" - Domain_Item = domain_name # e.g., "pref_read" - ) %>% - select(-all_of(past_col)) - - fut_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(fut_col)) %>% - mutate( - TimePerspective = "Future", - Difference = .data[[fut_col]], - Domain_Type = domain_type, - Domain_Item = domain_name - ) %>% - select(-all_of(fut_col)) - - # Combine past and future data for this domain - domain_long_data <- rbind(past_data, fut_data) - all_long_data <- rbind(all_long_data, domain_long_data) - } - - # Convert to factors - all_long_data <- all_long_data %>% - mutate( - TimePerspective = as.factor(TimePerspective), - Domain_Type = as.factor(Domain_Type), - Domain_Item = as.factor(Domain_Item), - pID = as.factor(pID) - ) - - return(all_long_data) -} - -# Reshape all data to long format -cat("\nReshaping data to long format...\n") -long_data <- reshape_all_domains(data, domain_info) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique domains:", length(unique(long_data$Domain_Item)), "\n") -cat("Domain types:", paste(unique(long_data$Domain_Type), collapse = ", "), "\n") -cat("Domain type counts:\n") -print(table(long_data$Domain_Type)) - -# Display structure and sample of long_data -cat("\nLong data structure:\n") -str(long_data) - -cat("\nFirst 10 rows of long_data:\n") -print(head(long_data, 10)) - -cat("\nColumn names:\n") -print(colnames(long_data)) - -# 
STEP 2: ASSUMPTION CHECKING -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 2: CHECKING ASSUMPTIONS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 2.1 Check for missing values -missing_summary <- long_data %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(Difference)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("\nMissing values by domain and time perspective:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$Difference), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\nChecking for outliers...\n") -outlier_summary <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(Difference < lower_bound | Difference > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\nTesting normality...\n") -normality_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(Difference)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(Difference)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\nTesting homogeneity of variance...\n") -homogeneity_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item) %>% - summarise( - levene_p = leveneTest(Difference ~ TimePerspective)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance results:\n") -print(homogeneity_results) - -# STEP 3: DESCRIPTIVE STATISTICS -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -desc_stats <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - median = median(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - min = min(Difference), - max = max(Difference), - .groups = 'drop' - ) - -cat("Descriptive statistics:\n") -print(desc_stats) - -# STEP 4: MIXED ANOVA ANALYSES -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Overall analysis across all domains -cat("\n4.1 Overall Mixed ANOVA (all domains combined):\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - overall_anova <- ezANOVA( - data = long_data_clean, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Type), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Overall ANOVA Results:\n") - print(overall_anova) - - # Check sphericity - if 
(!is.null(overall_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(overall_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in overall ANOVA:", e$message, "\n") -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -domain_results <- list() - -for (domain_type in unique(long_data_clean$Domain_Type)) { - cat("\nAnalyzing domain type:", domain_type, "\n") - - domain_data <- long_data_clean[long_data_clean$Domain_Type == domain_type, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Item), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_type, ":\n") - print(domain_anova) - - domain_results[[domain_type]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_type, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain_type]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Individual domain item analyses -cat("\n4.3 Individual Domain Item Analyses:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -item_results <- list() - -for (domain_item in unique(long_data_clean$Domain_Item)) { - cat("\nAnalyzing individual item:", domain_item, "\n") - - item_data <- long_data_clean[long_data_clean$Domain_Item == domain_item, ] - - tryCatch({ - item_anova <- ezANOVA( - data = item_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_item, ":\n") - print(item_anova) - - item_results[[domain_item]] <- item_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_item, ":", e$message, "\n") - - # Fallback to paired t-test - past_vals <- item_data$Difference[item_data$TimePerspective == "Past"] - fut_vals <- item_data$Difference[item_data$TimePerspective == "Future"] - - if (length(past_vals) > 1 && length(fut_vals) > 1) { - t_test <- t.test(past_vals, fut_vals, paired = TRUE) - cat("Fallback paired t-test for", domain_item, ":\n") - cat("t =", round(t_test$statistic, 3), - ", df =", t_test$parameter, - ", p =", round(t_test$p.value, 5), "\n") - - item_results[[domain_item]] <- t_test - } - }) -} - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("ANALYSIS COMPLETE!\n") -cat(paste(rep("=", 80), collapse = ""), "\n") -cat("Summary:\n") -cat("- Total domains analyzed:", length(unique(long_data_clean$Domain_Item)), "\n") -cat("- Domain types analyzed:", length(unique(long_data_clean$Domain_Type)), "\n") -cat("- Individual item analyses completed:", length(item_results), "\n") diff --git a/.history/mixed anova_20250912113119.r b/.history/mixed anova_20250912113119.r deleted file mode 100644 index 4e8abe9..0000000 --- a/.history/mixed anova_20250912113119.r +++ /dev/null @@ -1,359 +0,0 @@ -# Mixed ANOVA Analysis for Past vs Future Differences -# EOHI Experiment Data Analysis - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests - -# Read the data 
-data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO) - -# STEP 1: PROPER DATA RESHAPING -# Define domains with their categories -domain_info <- data.frame( - domain = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"), - domain_type = c(rep("Preferences", 5), - rep("Personality", 5), - rep("Values", 5), - rep("Life_Satisfaction", 5)), - stringsAsFactors = FALSE -) - -# Display domain_info -cat("\nDomain Information:\n") -print(domain_info) -cat("\nDomain type summary:\n") -print(table(domain_info$domain_type)) - -# Function to reshape ALL domains at once with domain information -reshape_all_domains <- function(data, domain_info) { - all_long_data <- data.frame() - - for (i in 1:nrow(domain_info)) { - domain_name <- domain_info$domain[i] - domain_type <- domain_info$domain_type[i] - - past_col <- paste0("NPastDiff_", domain_name) - fut_col <- paste0("NFutDiff_", domain_name) - - # Check if columns exist - if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) { - cat("Warning: Columns", past_col, "or", fut_col, "not found\n") - next - } - - # Create long format data for this domain - past_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(past_col)) %>% - mutate( - TimePerspective = "Past", - Difference = .data[[past_col]], - Domain_Type = domain_type, # e.g., "Preferences" - Domain_Item = domain_name # e.g., "pref_read" - ) %>% - select(-all_of(past_col)) - - fut_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(fut_col)) %>% - mutate( - TimePerspective = "Future", - Difference = .data[[fut_col]], - Domain_Type = domain_type, - Domain_Item = domain_name - ) %>% - select(-all_of(fut_col)) - - # Combine past and future data for this domain - domain_long_data <- rbind(past_data, fut_data) - all_long_data <- rbind(all_long_data, domain_long_data) - } - - # Convert to factors - all_long_data <- all_long_data %>% - mutate( - TimePerspective = as.factor(TimePerspective), - Domain_Type = as.factor(Domain_Type), - Domain_Item = as.factor(Domain_Item), - pID = as.factor(pID) - ) - - return(all_long_data) -} - -# Reshape all data to long format -cat("\nReshaping data to long format...\n") -long_data <- reshape_all_domains(data, domain_info) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique domains:", length(unique(long_data$Domain_Item)), "\n") -cat("Domain types:", paste(unique(long_data$Domain_Type), collapse = ", "), "\n") -cat("Domain type counts:\n") -print(table(long_data$Domain_Type)) - -# Display structure and sample of long_data -cat("\nLong data structure:\n") -str(long_data) - -cat("\nFirst 10 rows of long_data:\n") -print(head(long_data, 10)) - -cat("\nColumn names:\n") -print(colnames(long_data)) - -# STEP 2: ASSUMPTION CHECKING -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 2: CHECKING ASSUMPTIONS\n") -cat(paste(rep("=", 80), 
collapse = ""), "\n") - -# 2.1 Check for missing values -missing_summary <- long_data %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(Difference)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("\nMissing values by domain and time perspective:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$Difference), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\nChecking for outliers...\n") -outlier_summary <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(Difference < lower_bound | Difference > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\nTesting normality...\n") -normality_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(Difference)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(Difference)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\nTesting homogeneity of variance...\n") -homogeneity_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item) %>% - summarise( - levene_p = leveneTest(Difference ~ TimePerspective)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance results:\n") -print(homogeneity_results) - -# STEP 3: DESCRIPTIVE STATISTICS -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -desc_stats <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - median = median(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - min = min(Difference), - max = max(Difference), - .groups = 'drop' - ) - -cat("Descriptive statistics:\n") -print(desc_stats) - -# STEP 4: MIXED ANOVA ANALYSES -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Overall analysis across all domains -cat("\n4.1 Overall Mixed ANOVA (all domains combined):\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - overall_anova <- ezANOVA( - data = long_data_clean, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Type), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Overall ANOVA Results:\n") - print(overall_anova) - - # Check sphericity - if (!is.null(overall_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(overall_anova$`Mauchly's Test for 
Sphericity`) - } - -}, error = function(e) { - cat("Error in overall ANOVA:", e$message, "\n") -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -domain_results <- list() - -for (domain_type in unique(long_data_clean$Domain_Type)) { - cat("\nAnalyzing domain type:", domain_type, "\n") - - domain_data <- long_data_clean[long_data_clean$Domain_Type == domain_type, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Item), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_type, ":\n") - print(domain_anova) - - domain_results[[domain_type]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_type, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain_type]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Individual domain item analyses -cat("\n4.3 Individual Domain Item Analyses:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -item_results <- list() - -for (domain_item in unique(long_data_clean$Domain_Item)) { - cat("\nAnalyzing individual item:", domain_item, "\n") - - item_data <- long_data_clean[long_data_clean$Domain_Item == domain_item, ] - - tryCatch({ - item_anova <- ezANOVA( - data = item_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_item, ":\n") - print(item_anova) - - item_results[[domain_item]] <- item_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_item, ":", e$message, "\n") - - # Fallback to paired t-test - past_vals <- item_data$Difference[item_data$TimePerspective == "Past"] - fut_vals <- item_data$Difference[item_data$TimePerspective == "Future"] - - if (length(past_vals) > 1 && length(fut_vals) > 1) { - t_test <- t.test(past_vals, fut_vals, paired = TRUE) - cat("Fallback paired t-test for", domain_item, ":\n") - cat("t =", round(t_test$statistic, 3), - ", df =", t_test$parameter, - ", p =", round(t_test$p.value, 5), "\n") - - item_results[[domain_item]] <- t_test - } - }) -} - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("ANALYSIS COMPLETE!\n") -cat(paste(rep("=", 80), collapse = ""), "\n") -cat("Summary:\n") -cat("- Total domains analyzed:", length(unique(long_data_clean$Domain_Item)), "\n") -cat("- Domain types analyzed:", length(unique(long_data_clean$Domain_Type)), "\n") -cat("- Individual item analyses completed:", length(item_results), "\n") diff --git a/.history/mixed anova_20250912113345.r b/.history/mixed anova_20250912113345.r deleted file mode 100644 index be77f6e..0000000 --- a/.history/mixed anova_20250912113345.r +++ /dev/null @@ -1,374 +0,0 @@ -# Mixed ANOVA Analysis for Past vs Future Differences -# EOHI Experiment Data Analysis - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number 
of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO) - -# STEP 1: PROPER DATA RESHAPING -# Define domains with their categories -domain_info <- data.frame( - domain = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"), - domain_type = c(rep("Preferences", 5), - rep("Personality", 5), - rep("Values", 5), - rep("Life_Satisfaction", 5)), - stringsAsFactors = FALSE -) - -# Display domain_info -cat("\nDomain Information:\n") -print(domain_info) -cat("\nDomain type summary:\n") -print(table(domain_info$domain_type)) - -# Function to reshape ALL domains at once with domain information -reshape_all_domains <- function(data, domain_info) { - all_long_data <- data.frame() - - for (i in 1:nrow(domain_info)) { - domain_name <- domain_info$domain[i] - domain_type <- domain_info$domain_type[i] - - past_col <- paste0("NPastDiff_", domain_name) - fut_col <- paste0("NFutDiff_", domain_name) - - # Check if columns exist - if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) { - cat("Warning: Columns", past_col, "or", fut_col, "not found\n") - next - } - - # Create long format data for this domain - past_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(past_col)) %>% - mutate( - TimePerspective = "Past", - Difference = .data[[past_col]], - Domain_Type = domain_type, # e.g., "Preferences" - Domain_Item = domain_name # e.g., "pref_read" - ) %>% - select(-all_of(past_col)) - - fut_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(fut_col)) %>% - mutate( - TimePerspective = "Future", - Difference = .data[[fut_col]], - Domain_Type = domain_type, - Domain_Item = domain_name - ) %>% - select(-all_of(fut_col)) - - # Combine past and future data for this domain - domain_long_data <- rbind(past_data, fut_data) - all_long_data <- rbind(all_long_data, domain_long_data) - } - - # Convert to factors - all_long_data <- all_long_data %>% - mutate( - TimePerspective = as.factor(TimePerspective), - Domain_Type = as.factor(Domain_Type), - Domain_Item = as.factor(Domain_Item), - pID = as.factor(pID) - ) - - return(all_long_data) -} - -# Reshape all data to long format -cat("\nReshaping data to long format...\n") -long_data <- reshape_all_domains(data, domain_info) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique domains:", length(unique(long_data$Domain_Item)), "\n") -cat("Domain types:", paste(unique(long_data$Domain_Type), collapse = ", "), "\n") -cat("Domain type counts:\n") -print(table(long_data$Domain_Type)) - -# Display structure and sample of long_data -cat("\nLong data structure:\n") -str(long_data) - -cat("\nFirst 10 rows of long_data:\n") -print(head(long_data, 10)) - -cat("\nColumn names:\n") -print(colnames(long_data)) - -# Show factor levels for domain variables -cat("\nDomain_Type factor levels:\n") -print(levels(long_data$Domain_Type)) - -cat("\nDomain_Item factor levels:\n") -print(levels(long_data$Domain_Item)) - -cat("\nTimePerspective factor levels:\n") -print(levels(long_data$TimePerspective)) - -# 
Show a sample with actual names instead of numbers -cat("\nSample data with actual names (first 6 rows):\n") -sample_data <- long_data[1:6, c("pID", "Domain_Type", "Domain_Item", "TimePerspective", "Difference")] -print(sample_data) - -# STEP 2: ASSUMPTION CHECKING -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 2: CHECKING ASSUMPTIONS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") -head(long_data) -# 2.1 Check for missing values -missing_summary <- long_data %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(Difference)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("\nMissing values by domain and time perspective:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$Difference), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\nChecking for outliers...\n") -outlier_summary <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(Difference < lower_bound | Difference > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\nTesting normality...\n") -normality_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(Difference)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(Difference)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\nTesting homogeneity of variance...\n") -homogeneity_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item) %>% - summarise( - levene_p = leveneTest(Difference ~ TimePerspective)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance results:\n") -print(homogeneity_results) - -# STEP 3: DESCRIPTIVE STATISTICS -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -desc_stats <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - median = median(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - min = min(Difference), - max = max(Difference), - .groups = 'drop' - ) - -cat("Descriptive statistics:\n") -print(desc_stats) - -# STEP 4: MIXED ANOVA ANALYSES -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Overall analysis across all domains -cat("\n4.1 Overall Mixed ANOVA (all domains combined):\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - overall_anova <- ezANOVA( - data = 
long_data_clean, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Type), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Overall ANOVA Results:\n") - print(overall_anova) - - # Check sphericity - if (!is.null(overall_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(overall_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in overall ANOVA:", e$message, "\n") -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -domain_results <- list() - -for (domain_type in unique(long_data_clean$Domain_Type)) { - cat("\nAnalyzing domain type:", domain_type, "\n") - - domain_data <- long_data_clean[long_data_clean$Domain_Type == domain_type, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Item), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_type, ":\n") - print(domain_anova) - - domain_results[[domain_type]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_type, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain_type]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Individual domain item analyses -cat("\n4.3 Individual Domain Item Analyses:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -item_results <- list() - -for (domain_item in unique(long_data_clean$Domain_Item)) { - cat("\nAnalyzing individual item:", domain_item, "\n") - - item_data <- long_data_clean[long_data_clean$Domain_Item == domain_item, ] - - tryCatch({ - item_anova <- ezANOVA( - data = item_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_item, ":\n") - print(item_anova) - - item_results[[domain_item]] <- item_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_item, ":", e$message, "\n") - - # Fallback to paired t-test - past_vals <- item_data$Difference[item_data$TimePerspective == "Past"] - fut_vals <- item_data$Difference[item_data$TimePerspective == "Future"] - - if (length(past_vals) > 1 && length(fut_vals) > 1) { - t_test <- t.test(past_vals, fut_vals, paired = TRUE) - cat("Fallback paired t-test for", domain_item, ":\n") - cat("t =", round(t_test$statistic, 3), - ", df =", t_test$parameter, - ", p =", round(t_test$p.value, 5), "\n") - - item_results[[domain_item]] <- t_test - } - }) -} - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("ANALYSIS COMPLETE!\n") -cat(paste(rep("=", 80), collapse = ""), "\n") -cat("Summary:\n") -cat("- Total domains analyzed:", length(unique(long_data_clean$Domain_Item)), "\n") -cat("- Domain types analyzed:", length(unique(long_data_clean$Domain_Type)), "\n") -cat("- Individual item analyses completed:", length(item_results), "\n") diff --git a/.history/mixed anova_20250912113353.r b/.history/mixed anova_20250912113353.r deleted file mode 100644 index be77f6e..0000000 --- a/.history/mixed 
anova_20250912113353.r +++ /dev/null @@ -1,374 +0,0 @@ -# Mixed ANOVA Analysis for Past vs Future Differences -# EOHI Experiment Data Analysis - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO) - -# STEP 1: PROPER DATA RESHAPING -# Define domains with their categories -domain_info <- data.frame( - domain = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"), - domain_type = c(rep("Preferences", 5), - rep("Personality", 5), - rep("Values", 5), - rep("Life_Satisfaction", 5)), - stringsAsFactors = FALSE -) - -# Display domain_info -cat("\nDomain Information:\n") -print(domain_info) -cat("\nDomain type summary:\n") -print(table(domain_info$domain_type)) - -# Function to reshape ALL domains at once with domain information -reshape_all_domains <- function(data, domain_info) { - all_long_data <- data.frame() - - for (i in 1:nrow(domain_info)) { - domain_name <- domain_info$domain[i] - domain_type <- domain_info$domain_type[i] - - past_col <- paste0("NPastDiff_", domain_name) - fut_col <- paste0("NFutDiff_", domain_name) - - # Check if columns exist - if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) { - cat("Warning: Columns", past_col, "or", fut_col, "not found\n") - next - } - - # Create long format data for this domain - past_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(past_col)) %>% - mutate( - TimePerspective = "Past", - Difference = .data[[past_col]], - Domain_Type = domain_type, # e.g., "Preferences" - Domain_Item = domain_name # e.g., "pref_read" - ) %>% - select(-all_of(past_col)) - - fut_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(fut_col)) %>% - mutate( - TimePerspective = "Future", - Difference = .data[[fut_col]], - Domain_Type = domain_type, - Domain_Item = domain_name - ) %>% - select(-all_of(fut_col)) - - # Combine past and future data for this domain - domain_long_data <- rbind(past_data, fut_data) - all_long_data <- rbind(all_long_data, domain_long_data) - } - - # Convert to factors - all_long_data <- all_long_data %>% - mutate( - TimePerspective = as.factor(TimePerspective), - Domain_Type = as.factor(Domain_Type), - Domain_Item = as.factor(Domain_Item), - pID = as.factor(pID) - ) - - return(all_long_data) -} - -# Reshape all data to long format -cat("\nReshaping data to long format...\n") -long_data <- reshape_all_domains(data, domain_info) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique domains:", length(unique(long_data$Domain_Item)), "\n") -cat("Domain types:", paste(unique(long_data$Domain_Type), collapse = ", "), "\n") -cat("Domain type counts:\n") -print(table(long_data$Domain_Type)) - -# Display structure and sample of long_data -cat("\nLong data structure:\n") -str(long_data) - 
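The reshape loop above builds the long table by repeated rbind() calls, which copies the accumulated data frame on every iteration. An equivalent sketch using tidyr::pivot_longer(), assuming the NPastDiff_*/NFutDiff_* column naming used above (long_data2 and id_cols are illustrative names, not part of the original script):

    library(tidyverse)

    # identifier columns carried along unchanged
    id_cols <- c("pID", "ResponseId", "GROUP", "TASK_DO", "TEMPORAL_DO",
                 "ITEM_DO", "COC_DO", "demo_sex", "demo_age_1",
                 "AOT_total", "CRT_correct")

    long_data2 <- data %>%
      select(all_of(id_cols), matches("^N(Past|Fut)Diff_")) %>%
      # split each column name into a time-perspective part and a domain part
      pivot_longer(
        cols = matches("^N(Past|Fut)Diff_"),
        names_to = c("TimePerspective", "Domain_Item"),
        names_pattern = "^N(Past|Fut)Diff_(.*)$",
        values_to = "Difference"
      ) %>%
      # attach the domain-type lookup defined in domain_info
      left_join(domain_info, by = c("Domain_Item" = "domain")) %>%
      mutate(
        TimePerspective = factor(TimePerspective, levels = c("Past", "Fut"),
                                 labels = c("Past", "Future")),
        Domain_Type = factor(domain_type),
        Domain_Item = factor(Domain_Item, levels = domain_info$domain),
        pID = as.factor(pID)
      ) %>%
      select(-domain_type)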
-cat("\nFirst 10 rows of long_data:\n") -print(head(long_data, 10)) - -cat("\nColumn names:\n") -print(colnames(long_data)) - -# Show factor levels for domain variables -cat("\nDomain_Type factor levels:\n") -print(levels(long_data$Domain_Type)) - -cat("\nDomain_Item factor levels:\n") -print(levels(long_data$Domain_Item)) - -cat("\nTimePerspective factor levels:\n") -print(levels(long_data$TimePerspective)) - -# Show a sample with actual names instead of numbers -cat("\nSample data with actual names (first 6 rows):\n") -sample_data <- long_data[1:6, c("pID", "Domain_Type", "Domain_Item", "TimePerspective", "Difference")] -print(sample_data) - -# STEP 2: ASSUMPTION CHECKING -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 2: CHECKING ASSUMPTIONS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") -head(long_data) -# 2.1 Check for missing values -missing_summary <- long_data %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(Difference)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("\nMissing values by domain and time perspective:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$Difference), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\nChecking for outliers...\n") -outlier_summary <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(Difference < lower_bound | Difference > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\nTesting normality...\n") -normality_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(Difference)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(Difference)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\nTesting homogeneity of variance...\n") -homogeneity_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item) %>% - summarise( - levene_p = leveneTest(Difference ~ TimePerspective)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance results:\n") -print(homogeneity_results) - -# STEP 3: DESCRIPTIVE STATISTICS -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -desc_stats <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - median = median(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - min = min(Difference), - max = max(Difference), - .groups = 'drop' - ) - -cat("Descriptive 
statistics:\n") -print(desc_stats) - -# STEP 4: MIXED ANOVA ANALYSES -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Overall analysis across all domains -cat("\n4.1 Overall Mixed ANOVA (all domains combined):\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - overall_anova <- ezANOVA( - data = long_data_clean, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Type), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Overall ANOVA Results:\n") - print(overall_anova) - - # Check sphericity - if (!is.null(overall_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(overall_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in overall ANOVA:", e$message, "\n") -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -domain_results <- list() - -for (domain_type in unique(long_data_clean$Domain_Type)) { - cat("\nAnalyzing domain type:", domain_type, "\n") - - domain_data <- long_data_clean[long_data_clean$Domain_Type == domain_type, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Item), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_type, ":\n") - print(domain_anova) - - domain_results[[domain_type]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_type, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain_type]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Individual domain item analyses -cat("\n4.3 Individual Domain Item Analyses:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -item_results <- list() - -for (domain_item in unique(long_data_clean$Domain_Item)) { - cat("\nAnalyzing individual item:", domain_item, "\n") - - item_data <- long_data_clean[long_data_clean$Domain_Item == domain_item, ] - - tryCatch({ - item_anova <- ezANOVA( - data = item_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_item, ":\n") - print(item_anova) - - item_results[[domain_item]] <- item_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_item, ":", e$message, "\n") - - # Fallback to paired t-test - past_vals <- item_data$Difference[item_data$TimePerspective == "Past"] - fut_vals <- item_data$Difference[item_data$TimePerspective == "Future"] - - if (length(past_vals) > 1 && length(fut_vals) > 1) { - t_test <- t.test(past_vals, fut_vals, paired = TRUE) - cat("Fallback paired t-test for", domain_item, ":\n") - cat("t =", round(t_test$statistic, 3), - ", df =", t_test$parameter, - ", p =", round(t_test$p.value, 5), "\n") - - item_results[[domain_item]] <- t_test - } - }) -} - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("ANALYSIS COMPLETE!\n") -cat(paste(rep("=", 80), collapse = ""), "\n") -cat("Summary:\n") -cat("- 
Total domains analyzed:", length(unique(long_data_clean$Domain_Item)), "\n") -cat("- Domain types analyzed:", length(unique(long_data_clean$Domain_Type)), "\n") -cat("- Individual item analyses completed:", length(item_results), "\n") diff --git a/.history/mixed anova_20250912113448.r b/.history/mixed anova_20250912113448.r deleted file mode 100644 index 1a98479..0000000 --- a/.history/mixed anova_20250912113448.r +++ /dev/null @@ -1,377 +0,0 @@ -# Mixed ANOVA Analysis for Past vs Future Differences -# EOHI Experiment Data Analysis - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO) - -# STEP 1: PROPER DATA RESHAPING -# Define domains with their categories -domain_info <- data.frame( - domain = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"), - domain_type = c(rep("Preferences", 5), - rep("Personality", 5), - rep("Values", 5), - rep("Life_Satisfaction", 5)), - stringsAsFactors = FALSE -) - -# Display domain_info -cat("\nDomain Information:\n") -print(domain_info) -cat("\nDomain type summary:\n") -print(table(domain_info$domain_type)) - -# Function to reshape ALL domains at once with domain information -reshape_all_domains <- function(data, domain_info) { - all_long_data <- data.frame() - - for (i in 1:nrow(domain_info)) { - domain_name <- domain_info$domain[i] - domain_type <- domain_info$domain_type[i] - - past_col <- paste0("NPastDiff_", domain_name) - fut_col <- paste0("NFutDiff_", domain_name) - - # Check if columns exist - if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) { - cat("Warning: Columns", past_col, "or", fut_col, "not found\n") - next - } - - # Create long format data for this domain - past_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(past_col)) %>% - mutate( - TimePerspective = "Past", - Difference = .data[[past_col]], - Domain_Type = domain_type, # e.g., "Preferences" - Domain_Item = domain_name # e.g., "pref_read" - ) %>% - select(-all_of(past_col)) - - fut_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(fut_col)) %>% - mutate( - TimePerspective = "Future", - Difference = .data[[fut_col]], - Domain_Type = domain_type, - Domain_Item = domain_name - ) %>% - select(-all_of(fut_col)) - - # Combine past and future data for this domain - domain_long_data <- rbind(past_data, fut_data) - all_long_data <- rbind(all_long_data, domain_long_data) - } - - # Convert to factors with proper levels - all_long_data <- all_long_data %>% - mutate( - TimePerspective = factor(TimePerspective, levels = c("Past", "Future")), - Domain_Type = factor(Domain_Type, levels = c("Preferences", "Personality", "Values", "Life_Satisfaction")), - Domain_Item = factor(Domain_Item, levels = c("pref_read", "pref_music", "pref_tv", "pref_nap", 
"pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change")), - pID = as.factor(pID) - ) - - return(all_long_data) -} - -# Reshape all data to long format -cat("\nReshaping data to long format...\n") -long_data <- reshape_all_domains(data, domain_info) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique domains:", length(unique(long_data$Domain_Item)), "\n") -cat("Domain types:", paste(unique(long_data$Domain_Type), collapse = ", "), "\n") -cat("Domain type counts:\n") -print(table(long_data$Domain_Type)) - -# Display structure and sample of long_data -cat("\nLong data structure:\n") -str(long_data) - -cat("\nFirst 10 rows of long_data:\n") -print(head(long_data, 10)) - -cat("\nColumn names:\n") -print(colnames(long_data)) - -# Show factor levels for domain variables -cat("\nDomain_Type factor levels:\n") -print(levels(long_data$Domain_Type)) - -cat("\nDomain_Item factor levels:\n") -print(levels(long_data$Domain_Item)) - -cat("\nTimePerspective factor levels:\n") -print(levels(long_data$TimePerspective)) - -# Show a sample with actual names instead of numbers -cat("\nSample data with actual names (first 6 rows):\n") -sample_data <- long_data[1:6, c("pID", "Domain_Type", "Domain_Item", "TimePerspective", "Difference")] -print(sample_data) - -# STEP 2: ASSUMPTION CHECKING -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 2: CHECKING ASSUMPTIONS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") -head(long_data) -# 2.1 Check for missing values -missing_summary <- long_data %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(Difference)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("\nMissing values by domain and time perspective:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$Difference), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\nChecking for outliers...\n") -outlier_summary <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(Difference < lower_bound | Difference > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\nTesting normality...\n") -normality_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(Difference)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(Difference)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\nTesting homogeneity of variance...\n") 
-homogeneity_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item) %>% - summarise( - levene_p = leveneTest(Difference ~ TimePerspective)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance results:\n") -print(homogeneity_results) - -# STEP 3: DESCRIPTIVE STATISTICS -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -desc_stats <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - median = median(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - min = min(Difference), - max = max(Difference), - .groups = 'drop' - ) - -cat("Descriptive statistics:\n") -print(desc_stats) - -# STEP 4: MIXED ANOVA ANALYSES -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Overall analysis across all domains -cat("\n4.1 Overall Mixed ANOVA (all domains combined):\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - overall_anova <- ezANOVA( - data = long_data_clean, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Type), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Overall ANOVA Results:\n") - print(overall_anova) - - # Check sphericity - if (!is.null(overall_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(overall_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in overall ANOVA:", e$message, "\n") -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -domain_results <- list() - -for (domain_type in unique(long_data_clean$Domain_Type)) { - cat("\nAnalyzing domain type:", domain_type, "\n") - - domain_data <- long_data_clean[long_data_clean$Domain_Type == domain_type, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Item), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_type, ":\n") - print(domain_anova) - - domain_results[[domain_type]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_type, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain_type]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Individual domain item analyses -cat("\n4.3 Individual Domain Item Analyses:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -item_results <- list() - -for (domain_item in unique(long_data_clean$Domain_Item)) { - cat("\nAnalyzing individual item:", domain_item, "\n") - - item_data <- long_data_clean[long_data_clean$Domain_Item == domain_item, ] - - tryCatch({ - item_anova <- ezANOVA( - data = item_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_item, 
":\n") - print(item_anova) - - item_results[[domain_item]] <- item_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_item, ":", e$message, "\n") - - # Fallback to paired t-test - past_vals <- item_data$Difference[item_data$TimePerspective == "Past"] - fut_vals <- item_data$Difference[item_data$TimePerspective == "Future"] - - if (length(past_vals) > 1 && length(fut_vals) > 1) { - t_test <- t.test(past_vals, fut_vals, paired = TRUE) - cat("Fallback paired t-test for", domain_item, ":\n") - cat("t =", round(t_test$statistic, 3), - ", df =", t_test$parameter, - ", p =", round(t_test$p.value, 5), "\n") - - item_results[[domain_item]] <- t_test - } - }) -} - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("ANALYSIS COMPLETE!\n") -cat(paste(rep("=", 80), collapse = ""), "\n") -cat("Summary:\n") -cat("- Total domains analyzed:", length(unique(long_data_clean$Domain_Item)), "\n") -cat("- Domain types analyzed:", length(unique(long_data_clean$Domain_Type)), "\n") -cat("- Individual item analyses completed:", length(item_results), "\n") diff --git a/.history/mixed anova_20250912113455.r b/.history/mixed anova_20250912113455.r deleted file mode 100644 index 1a98479..0000000 --- a/.history/mixed anova_20250912113455.r +++ /dev/null @@ -1,377 +0,0 @@ -# Mixed ANOVA Analysis for Past vs Future Differences -# EOHI Experiment Data Analysis - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO) - -# STEP 1: PROPER DATA RESHAPING -# Define domains with their categories -domain_info <- data.frame( - domain = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"), - domain_type = c(rep("Preferences", 5), - rep("Personality", 5), - rep("Values", 5), - rep("Life_Satisfaction", 5)), - stringsAsFactors = FALSE -) - -# Display domain_info -cat("\nDomain Information:\n") -print(domain_info) -cat("\nDomain type summary:\n") -print(table(domain_info$domain_type)) - -# Function to reshape ALL domains at once with domain information -reshape_all_domains <- function(data, domain_info) { - all_long_data <- data.frame() - - for (i in 1:nrow(domain_info)) { - domain_name <- domain_info$domain[i] - domain_type <- domain_info$domain_type[i] - - past_col <- paste0("NPastDiff_", domain_name) - fut_col <- paste0("NFutDiff_", domain_name) - - # Check if columns exist - if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) { - cat("Warning: Columns", past_col, "or", fut_col, "not found\n") - next - } - - # Create long format data for this domain - past_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(past_col)) %>% - mutate( - TimePerspective = "Past", - Difference = .data[[past_col]], - Domain_Type = domain_type, # e.g., "Preferences" - Domain_Item = domain_name # e.g., "pref_read" - ) %>% - select(-all_of(past_col)) - - fut_data <- data %>% - 
select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(fut_col)) %>% - mutate( - TimePerspective = "Future", - Difference = .data[[fut_col]], - Domain_Type = domain_type, - Domain_Item = domain_name - ) %>% - select(-all_of(fut_col)) - - # Combine past and future data for this domain - domain_long_data <- rbind(past_data, fut_data) - all_long_data <- rbind(all_long_data, domain_long_data) - } - - # Convert to factors with proper levels - all_long_data <- all_long_data %>% - mutate( - TimePerspective = factor(TimePerspective, levels = c("Past", "Future")), - Domain_Type = factor(Domain_Type, levels = c("Preferences", "Personality", "Values", "Life_Satisfaction")), - Domain_Item = factor(Domain_Item, levels = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change")), - pID = as.factor(pID) - ) - - return(all_long_data) -} - -# Reshape all data to long format -cat("\nReshaping data to long format...\n") -long_data <- reshape_all_domains(data, domain_info) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique domains:", length(unique(long_data$Domain_Item)), "\n") -cat("Domain types:", paste(unique(long_data$Domain_Type), collapse = ", "), "\n") -cat("Domain type counts:\n") -print(table(long_data$Domain_Type)) - -# Display structure and sample of long_data -cat("\nLong data structure:\n") -str(long_data) - -cat("\nFirst 10 rows of long_data:\n") -print(head(long_data, 10)) - -cat("\nColumn names:\n") -print(colnames(long_data)) - -# Show factor levels for domain variables -cat("\nDomain_Type factor levels:\n") -print(levels(long_data$Domain_Type)) - -cat("\nDomain_Item factor levels:\n") -print(levels(long_data$Domain_Item)) - -cat("\nTimePerspective factor levels:\n") -print(levels(long_data$TimePerspective)) - -# Show a sample with actual names instead of numbers -cat("\nSample data with actual names (first 6 rows):\n") -sample_data <- long_data[1:6, c("pID", "Domain_Type", "Domain_Item", "TimePerspective", "Difference")] -print(sample_data) - -# STEP 2: ASSUMPTION CHECKING -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 2: CHECKING ASSUMPTIONS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") -head(long_data) -# 2.1 Check for missing values -missing_summary <- long_data %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(Difference)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("\nMissing values by domain and time perspective:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$Difference), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\nChecking for outliers...\n") -outlier_summary <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(Difference < lower_bound | Difference > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") 
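The summary printed next counts outliers per cell but does not identify which rows they are. A short sketch that flags individual observations against their cell's 1.5 * IQR fences, for inspection or a sensitivity re-run (long_data_flagged is an illustrative name):

    library(dplyr)

    long_data_flagged <- long_data_clean %>%
      group_by(Domain_Type, Domain_Item, TimePerspective) %>%
      mutate(
        # Tukey fences computed within each cell
        lower_fence = quantile(Difference, 0.25) - 1.5 * IQR(Difference),
        upper_fence = quantile(Difference, 0.75) + 1.5 * IQR(Difference),
        is_outlier  = Difference < lower_fence | Difference > upper_fence
      ) %>%
      ungroup()

    # sensitivity check: repeat the descriptives without flagged rows
    # desc_sens <- filter(long_data_flagged, !is_outlier)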
-print(outlier_summary) - -# 2.3 Normality tests -cat("\nTesting normality...\n") -normality_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(Difference)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(Difference)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\nTesting homogeneity of variance...\n") -homogeneity_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item) %>% - summarise( - levene_p = leveneTest(Difference ~ TimePerspective)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance results:\n") -print(homogeneity_results) - -# STEP 3: DESCRIPTIVE STATISTICS -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -desc_stats <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - median = median(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - min = min(Difference), - max = max(Difference), - .groups = 'drop' - ) - -cat("Descriptive statistics:\n") -print(desc_stats) - -# STEP 4: MIXED ANOVA ANALYSES -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Overall analysis across all domains -cat("\n4.1 Overall Mixed ANOVA (all domains combined):\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - overall_anova <- ezANOVA( - data = long_data_clean, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Type), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Overall ANOVA Results:\n") - print(overall_anova) - - # Check sphericity - if (!is.null(overall_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(overall_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in overall ANOVA:", e$message, "\n") -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -domain_results <- list() - -for (domain_type in unique(long_data_clean$Domain_Type)) { - cat("\nAnalyzing domain type:", domain_type, "\n") - - domain_data <- long_data_clean[long_data_clean$Domain_Type == domain_type, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Item), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_type, ":\n") - print(domain_anova) - - domain_results[[domain_type]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_type, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = 
TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain_type]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Individual domain item analyses -cat("\n4.3 Individual Domain Item Analyses:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -item_results <- list() - -for (domain_item in unique(long_data_clean$Domain_Item)) { - cat("\nAnalyzing individual item:", domain_item, "\n") - - item_data <- long_data_clean[long_data_clean$Domain_Item == domain_item, ] - - tryCatch({ - item_anova <- ezANOVA( - data = item_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_item, ":\n") - print(item_anova) - - item_results[[domain_item]] <- item_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_item, ":", e$message, "\n") - - # Fallback to paired t-test - past_vals <- item_data$Difference[item_data$TimePerspective == "Past"] - fut_vals <- item_data$Difference[item_data$TimePerspective == "Future"] - - if (length(past_vals) > 1 && length(fut_vals) > 1) { - t_test <- t.test(past_vals, fut_vals, paired = TRUE) - cat("Fallback paired t-test for", domain_item, ":\n") - cat("t =", round(t_test$statistic, 3), - ", df =", t_test$parameter, - ", p =", round(t_test$p.value, 5), "\n") - - item_results[[domain_item]] <- t_test - } - }) -} - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("ANALYSIS COMPLETE!\n") -cat(paste(rep("=", 80), collapse = ""), "\n") -cat("Summary:\n") -cat("- Total domains analyzed:", length(unique(long_data_clean$Domain_Item)), "\n") -cat("- Domain types analyzed:", length(unique(long_data_clean$Domain_Type)), "\n") -cat("- Individual item analyses completed:", length(item_results), "\n") diff --git a/.history/mixed anova_20250912113502.r b/.history/mixed anova_20250912113502.r deleted file mode 100644 index 1a98479..0000000 --- a/.history/mixed anova_20250912113502.r +++ /dev/null @@ -1,377 +0,0 @@ -# Mixed ANOVA Analysis for Past vs Future Differences -# EOHI Experiment Data Analysis - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO) - -# STEP 1: PROPER DATA RESHAPING -# Define domains with their categories -domain_info <- data.frame( - domain = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"), - domain_type = c(rep("Preferences", 5), - rep("Personality", 5), - rep("Values", 5), - rep("Life_Satisfaction", 5)), - stringsAsFactors = FALSE -) - -# Display domain_info -cat("\nDomain Information:\n") -print(domain_info) -cat("\nDomain type summary:\n") -print(table(domain_info$domain_type)) - -# Function to reshape ALL domains at once with domain information -reshape_all_domains <- function(data, domain_info) { - all_long_data <- data.frame() - - for (i 
in 1:nrow(domain_info)) { - domain_name <- domain_info$domain[i] - domain_type <- domain_info$domain_type[i] - - past_col <- paste0("NPastDiff_", domain_name) - fut_col <- paste0("NFutDiff_", domain_name) - - # Check if columns exist - if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) { - cat("Warning: Columns", past_col, "or", fut_col, "not found\n") - next - } - - # Create long format data for this domain - past_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(past_col)) %>% - mutate( - TimePerspective = "Past", - Difference = .data[[past_col]], - Domain_Type = domain_type, # e.g., "Preferences" - Domain_Item = domain_name # e.g., "pref_read" - ) %>% - select(-all_of(past_col)) - - fut_data <- data %>% - select(pID, ResponseId, GROUP, TASK_DO, TEMPORAL_DO, ITEM_DO, COC_DO, - demo_sex, demo_age_1, AOT_total, CRT_correct, all_of(fut_col)) %>% - mutate( - TimePerspective = "Future", - Difference = .data[[fut_col]], - Domain_Type = domain_type, - Domain_Item = domain_name - ) %>% - select(-all_of(fut_col)) - - # Combine past and future data for this domain - domain_long_data <- rbind(past_data, fut_data) - all_long_data <- rbind(all_long_data, domain_long_data) - } - - # Convert to factors with proper levels - all_long_data <- all_long_data %>% - mutate( - TimePerspective = factor(TimePerspective, levels = c("Past", "Future")), - Domain_Type = factor(Domain_Type, levels = c("Preferences", "Personality", "Values", "Life_Satisfaction")), - Domain_Item = factor(Domain_Item, levels = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change")), - pID = as.factor(pID) - ) - - return(all_long_data) -} - -# Reshape all data to long format -cat("\nReshaping data to long format...\n") -long_data <- reshape_all_domains(data, domain_info) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique domains:", length(unique(long_data$Domain_Item)), "\n") -cat("Domain types:", paste(unique(long_data$Domain_Type), collapse = ", "), "\n") -cat("Domain type counts:\n") -print(table(long_data$Domain_Type)) - -# Display structure and sample of long_data -cat("\nLong data structure:\n") -str(long_data) - -cat("\nFirst 10 rows of long_data:\n") -print(head(long_data, 10)) - -cat("\nColumn names:\n") -print(colnames(long_data)) - -# Show factor levels for domain variables -cat("\nDomain_Type factor levels:\n") -print(levels(long_data$Domain_Type)) - -cat("\nDomain_Item factor levels:\n") -print(levels(long_data$Domain_Item)) - -cat("\nTimePerspective factor levels:\n") -print(levels(long_data$TimePerspective)) - -# Show a sample with actual names instead of numbers -cat("\nSample data with actual names (first 6 rows):\n") -sample_data <- long_data[1:6, c("pID", "Domain_Type", "Domain_Item", "TimePerspective", "Difference")] -print(sample_data) - -# STEP 2: ASSUMPTION CHECKING -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 2: CHECKING ASSUMPTIONS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") -head(long_data) -# 2.1 Check for missing values -missing_summary <- long_data %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(Difference)), - pct_missing = 
round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("\nMissing values by domain and time perspective:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$Difference), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\nChecking for outliers...\n") -outlier_summary <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(Difference < lower_bound | Difference > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\nTesting normality...\n") -normality_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(Difference)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(Difference)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\nTesting homogeneity of variance...\n") -homogeneity_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item) %>% - summarise( - levene_p = leveneTest(Difference ~ TimePerspective)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance results:\n") -print(homogeneity_results) - -# STEP 3: DESCRIPTIVE STATISTICS -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -desc_stats <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - median = median(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - min = min(Difference), - max = max(Difference), - .groups = 'drop' - ) - -cat("Descriptive statistics:\n") -print(desc_stats) - -# STEP 4: MIXED ANOVA ANALYSES -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Overall analysis across all domains -cat("\n4.1 Overall Mixed ANOVA (all domains combined):\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - overall_anova <- ezANOVA( - data = long_data_clean, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Type), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Overall ANOVA Results:\n") - print(overall_anova) - - # Check sphericity - if (!is.null(overall_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(overall_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in overall ANOVA:", e$message, "\n") -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - 
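The tryCatch() fallbacks in this step exist because ezANOVA() is strict about design balance and incomplete within-subject cells. If it keeps failing, one hedged alternative for the overall model above is afex::aov_ez() (this assumes the afex package is installed; it sets effects coding for type-3 tests and, with a warning, aggregates replicate rows per cell by their mean):

    library(afex)

    overall_afex <- aov_ez(
      id      = "pID",
      dv      = "Difference",
      data    = long_data_clean,
      between = c("GROUP", "TASK_DO"),
      within  = c("TimePerspective", "Domain_Type"),
      type    = 3
    )
    # ANOVA table; sphericity corrections are applied for
    # within factors with more than two levels
    print(overall_afex)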
-domain_results <- list() - -for (domain_type in unique(long_data_clean$Domain_Type)) { - cat("\nAnalyzing domain type:", domain_type, "\n") - - domain_data <- long_data_clean[long_data_clean$Domain_Type == domain_type, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Item), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_type, ":\n") - print(domain_anova) - - domain_results[[domain_type]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_type, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain_type]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Individual domain item analyses -cat("\n4.3 Individual Domain Item Analyses:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -item_results <- list() - -for (domain_item in unique(long_data_clean$Domain_Item)) { - cat("\nAnalyzing individual item:", domain_item, "\n") - - item_data <- long_data_clean[long_data_clean$Domain_Item == domain_item, ] - - tryCatch({ - item_anova <- ezANOVA( - data = item_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_item, ":\n") - print(item_anova) - - item_results[[domain_item]] <- item_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_item, ":", e$message, "\n") - - # Fallback to paired t-test - past_vals <- item_data$Difference[item_data$TimePerspective == "Past"] - fut_vals <- item_data$Difference[item_data$TimePerspective == "Future"] - - if (length(past_vals) > 1 && length(fut_vals) > 1) { - t_test <- t.test(past_vals, fut_vals, paired = TRUE) - cat("Fallback paired t-test for", domain_item, ":\n") - cat("t =", round(t_test$statistic, 3), - ", df =", t_test$parameter, - ", p =", round(t_test$p.value, 5), "\n") - - item_results[[domain_item]] <- t_test - } - }) -} - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("ANALYSIS COMPLETE!\n") -cat(paste(rep("=", 80), collapse = ""), "\n") -cat("Summary:\n") -cat("- Total domains analyzed:", length(unique(long_data_clean$Domain_Item)), "\n") -cat("- Domain types analyzed:", length(unique(long_data_clean$Domain_Type)), "\n") -cat("- Individual item analyses completed:", length(item_results), "\n") diff --git a/.history/mixed anova_20250912113654.r b/.history/mixed anova_20250912113654.r deleted file mode 100644 index 40c09f8..0000000 --- a/.history/mixed anova_20250912113654.r +++ /dev/null @@ -1,370 +0,0 @@ -# Mixed ANOVA Analysis for Past vs Future Differences -# EOHI Experiment Data Analysis - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO) - -# STEP 1: PROPER DATA RESHAPING -# Define domains 
with their categories -domain_info <- data.frame( - domain = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"), - domain_type = c(rep("Preferences", 5), - rep("Personality", 5), - rep("Values", 5), - rep("Life_Satisfaction", 5)), - stringsAsFactors = FALSE -) - -# Display domain_info -cat("\nDomain Information:\n") -print(domain_info) -cat("\nDomain type summary:\n") -print(table(domain_info$domain_type)) - -# Function to reshape ALL domains at once with domain information -reshape_all_domains <- function(data, domain_info) { - all_long_data <- data.frame() - - for (i in 1:nrow(domain_info)) { - domain_name <- domain_info$domain[i] - domain_type <- domain_info$domain_type[i] - - past_col <- paste0("NPastDiff_", domain_name) - fut_col <- paste0("NFutDiff_", domain_name) - - # Check if columns exist - if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) { - cat("Warning: Columns", past_col, "or", fut_col, "not found\n") - next - } - - # Create long format data for this domain - using base R approach to avoid issues - # Past data - past_data <- data[, c("pID", "ResponseId", "GROUP", "TASK_DO", "TEMPORAL_DO", "ITEM_DO", "COC_DO", - "demo_sex", "demo_age_1", "AOT_total", "CRT_correct", past_col)] - past_data$TimePerspective <- "Past" - past_data$Difference <- past_data[[past_col]] - past_data$Domain_Type <- domain_type - past_data$Domain_Item <- domain_name - past_data[[past_col]] <- NULL # Remove the original column - - # Future data - fut_data <- data[, c("pID", "ResponseId", "GROUP", "TASK_DO", "TEMPORAL_DO", "ITEM_DO", "COC_DO", - "demo_sex", "demo_age_1", "AOT_total", "CRT_correct", fut_col)] - fut_data$TimePerspective <- "Future" - fut_data$Difference <- fut_data[[fut_col]] - fut_data$Domain_Type <- domain_type - fut_data$Domain_Item <- domain_name - fut_data[[fut_col]] <- NULL # Remove the original column - - # Combine past and future data for this domain - domain_long_data <- rbind(past_data, fut_data) - all_long_data <- rbind(all_long_data, domain_long_data) - } - - # Convert to factors with proper levels - all_long_data$TimePerspective <- factor(all_long_data$TimePerspective, levels = c("Past", "Future")) - all_long_data$Domain_Type <- factor(all_long_data$Domain_Type, levels = c("Preferences", "Personality", "Values", "Life_Satisfaction")) - all_long_data$Domain_Item <- factor(all_long_data$Domain_Item, levels = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change")) - all_long_data$pID <- as.factor(all_long_data$pID) - - return(all_long_data) -} - -# Reshape all data to long format -cat("\nReshaping data to long format...\n") -long_data <- reshape_all_domains(data, domain_info) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique domains:", length(unique(long_data$Domain_Item)), "\n") -cat("Domain types:", paste(unique(long_data$Domain_Type), collapse = ", "), "\n") -cat("Domain type counts:\n") -print(table(long_data$Domain_Type)) - -# Display structure and sample of long_data -cat("\nLong data structure:\n") -str(long_data) - 
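# The base-R rbind loop above is explicit but verbose. For reference, the same
# long table can be sketched far more compactly with tidyr::pivot_longer. This
# is a sketch, not a drop-in replacement: it assumes every paired column
# follows the exact NPastDiff_*/NFutDiff_* naming pattern and that no other
# column matches it (the loop above instead skips missing columns).
long_data2 <- data %>%
  pivot_longer(
    cols = matches("^N(Past|Fut)Diff_"),
    names_to = c("TimePerspective", "Domain_Item"),
    names_pattern = "^N(Past|Fut)Diff_(.+)$",
    values_to = "Difference"
  ) %>%
  left_join(domain_info, by = c("Domain_Item" = "domain")) %>%
  mutate(
    TimePerspective = factor(TimePerspective, levels = c("Past", "Fut"),
                             labels = c("Past", "Future")),
    Domain_Type = factor(domain_type,
                         levels = c("Preferences", "Personality",
                                    "Values", "Life_Satisfaction")),
    Domain_Item = factor(Domain_Item, levels = domain_info$domain),
    pID = as.factor(pID)
  )
# Sanity check: both reshapes should produce the same number of rows
cat("Rows via pivot_longer:", nrow(long_data2), "vs loop:", nrow(long_data), "\n")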
-cat("\nFirst 10 rows of long_data:\n") -print(head(long_data, 10)) - -cat("\nColumn names:\n") -print(colnames(long_data)) - -# Show factor levels for domain variables -cat("\nDomain_Type factor levels:\n") -print(levels(long_data$Domain_Type)) - -cat("\nDomain_Item factor levels:\n") -print(levels(long_data$Domain_Item)) - -cat("\nTimePerspective factor levels:\n") -print(levels(long_data$TimePerspective)) - -# Show a sample with actual names instead of numbers -cat("\nSample data with actual names (first 6 rows):\n") -sample_data <- long_data[1:6, c("pID", "Domain_Type", "Domain_Item", "TimePerspective", "Difference")] -print(sample_data) - -# STEP 2: ASSUMPTION CHECKING -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 2: CHECKING ASSUMPTIONS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") -head(long_data) -# 2.1 Check for missing values -missing_summary <- long_data %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(Difference)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("\nMissing values by domain and time perspective:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$Difference), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\nChecking for outliers...\n") -outlier_summary <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(Difference < lower_bound | Difference > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\nTesting normality...\n") -normality_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(Difference)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(Difference)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\nTesting homogeneity of variance...\n") -homogeneity_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item) %>% - summarise( - levene_p = leveneTest(Difference ~ TimePerspective)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance results:\n") -print(homogeneity_results) - -# STEP 3: DESCRIPTIVE STATISTICS -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -desc_stats <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - median = median(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - min = min(Difference), - max = max(Difference), - .groups = 'drop' - ) - -cat("Descriptive 
statistics:\n") -print(desc_stats) - -# STEP 4: MIXED ANOVA ANALYSES -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Overall analysis across all domains -cat("\n4.1 Overall Mixed ANOVA (all domains combined):\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - overall_anova <- ezANOVA( - data = long_data_clean, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Type), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Overall ANOVA Results:\n") - print(overall_anova) - - # Check sphericity - if (!is.null(overall_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(overall_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in overall ANOVA:", e$message, "\n") -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -domain_results <- list() - -for (domain_type in unique(long_data_clean$Domain_Type)) { - cat("\nAnalyzing domain type:", domain_type, "\n") - - domain_data <- long_data_clean[long_data_clean$Domain_Type == domain_type, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Item), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_type, ":\n") - print(domain_anova) - - domain_results[[domain_type]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_type, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain_type]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Individual domain item analyses -cat("\n4.3 Individual Domain Item Analyses:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -item_results <- list() - -for (domain_item in unique(long_data_clean$Domain_Item)) { - cat("\nAnalyzing individual item:", domain_item, "\n") - - item_data <- long_data_clean[long_data_clean$Domain_Item == domain_item, ] - - tryCatch({ - item_anova <- ezANOVA( - data = item_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_item, ":\n") - print(item_anova) - - item_results[[domain_item]] <- item_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_item, ":", e$message, "\n") - - # Fallback to paired t-test - past_vals <- item_data$Difference[item_data$TimePerspective == "Past"] - fut_vals <- item_data$Difference[item_data$TimePerspective == "Future"] - - if (length(past_vals) > 1 && length(fut_vals) > 1) { - t_test <- t.test(past_vals, fut_vals, paired = TRUE) - cat("Fallback paired t-test for", domain_item, ":\n") - cat("t =", round(t_test$statistic, 3), - ", df =", t_test$parameter, - ", p =", round(t_test$p.value, 5), "\n") - - item_results[[domain_item]] <- t_test - } - }) -} - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("ANALYSIS COMPLETE!\n") -cat(paste(rep("=", 80), collapse = ""), "\n") -cat("Summary:\n") -cat("- 
Total domains analyzed:", length(unique(long_data_clean$Domain_Item)), "\n") -cat("- Domain types analyzed:", length(unique(long_data_clean$Domain_Type)), "\n") -cat("- Individual item analyses completed:", length(item_results), "\n")
diff --git a/.history/mixed anova_20250912113654.r b/.history/mixed anova_20250912113654.r deleted file mode 100644 index 40c09f8..0000000
diff --git a/.history/mixed anova_20250912113659.r b/.history/mixed anova_20250912113659.r deleted file mode 100644 index 40c09f8..0000000
diff --git a/.history/mixed anova_20250912113707.r b/.history/mixed anova_20250912113707.r deleted file mode 100644 index 40c09f8..0000000
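A recurring weak spot in these snapshots is the paired t-test fallback: it pairs the Past and Future vectors by position after NA rows have been dropped, so the two vectors can differ in length (t.test then errors) or, worse, line up on the wrong participants without any warning. A safer sketch is below; paired_by_pid is a hypothetical helper, not part of the original scripts, and it matches observations on pID before pairing.

# Pair Past/Future by participant ID rather than by position
paired_by_pid <- function(item_data) {
  wide <- merge(
    item_data[item_data$TimePerspective == "Past",   c("pID", "Difference")],
    item_data[item_data$TimePerspective == "Future", c("pID", "Difference")],
    by = "pID", suffixes = c("_past", "_fut")
  )
  t.test(wide$Difference_past, wide$Difference_fut, paired = TRUE)
}
# Usage inside the fallback branch would then be:
# item_results[[domain_item]] <- paired_by_pid(item_data)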
diff --git a/.history/mixed anova_20250912114051.r b/.history/mixed anova_20250912114051.r deleted file mode 100644 index 106251c..0000000
diff --git a/.history/mixed anova_20250912114100.r b/.history/mixed anova_20250912114100.r deleted file mode 100644 index 106251c..0000000
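One more fragility worth flagging before the last snapshot: leveneTest(Difference ~ TimePerspective) is called inside summarise() with no data argument, so it works only as long as dplyr's data mask happens to be visible to the formula's environment. A more explicit per-group sketch (same test, car and dplyr as loaded above; homogeneity_results2 is a name chosen here, not from the original):

homogeneity_results2 <- long_data_clean %>%
  group_by(Domain_Type, Domain_Item) %>%
  group_modify(~ {
    # .x is this group's data; pass it to leveneTest explicitly
    lev <- car::leveneTest(Difference ~ TimePerspective, data = .x)
    data.frame(levene_p = lev$`Pr(>F)`[1])
  }) %>%
  ungroup() %>%
  mutate(homogeneous = levene_p > 0.05)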
diff --git a/.history/mixed anova_20250912114105.r b/.history/mixed anova_20250912114105.r deleted file mode 100644 index 106251c..0000000 --- a/.history/mixed anova_20250912114105.r +++ /dev/null @@ -1,384 +0,0 @@ -# Mixed ANOVA Analysis for Past vs Future Differences -# EOHI Experiment Data Analysis - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO) - -# STEP 1: PROPER DATA RESHAPING -# Define domains with their categories -domain_info <- data.frame( - domain = c("pref_read", "pref_music", "pref_tv", "pref_nap",
"pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"), - domain_type = c(rep("Preferences", 5), - rep("Personality", 5), - rep("Values", 5), - rep("Life_Satisfaction", 5)), - stringsAsFactors = FALSE -) - -# Display domain_info -cat("\nDomain Information:\n") -print(domain_info) -cat("\nDomain type summary:\n") -print(table(domain_info$domain_type)) - -# Function to reshape ALL domains at once with domain information -reshape_all_domains <- function(data, domain_info) { - all_long_data <- data.frame() - - for (i in 1:nrow(domain_info)) { - domain_name <- domain_info$domain[i] - domain_type <- domain_info$domain_type[i] - - past_col <- paste0("NPastDiff_", domain_name) - fut_col <- paste0("NFutDiff_", domain_name) - - # Check if columns exist - if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) { - cat("Warning: Columns", past_col, "or", fut_col, "not found\n") - next - } - - # Create long format data for this domain - using base R approach to avoid issues - # Past data - past_data <- data[, c("pID", "ResponseId", "GROUP", "TASK_DO", "TEMPORAL_DO", "ITEM_DO", "COC_DO", - "demo_sex", "demo_age_1", "AOT_total", "CRT_correct", past_col)] - past_data$TimePerspective <- "Past" - past_data$Difference <- past_data[[past_col]] - past_data$Domain_Type <- domain_type - past_data$Domain_Item <- domain_name - past_data[[past_col]] <- NULL # Remove the original column - - # Future data - fut_data <- data[, c("pID", "ResponseId", "GROUP", "TASK_DO", "TEMPORAL_DO", "ITEM_DO", "COC_DO", - "demo_sex", "demo_age_1", "AOT_total", "CRT_correct", fut_col)] - fut_data$TimePerspective <- "Future" - fut_data$Difference <- fut_data[[fut_col]] - fut_data$Domain_Type <- domain_type - fut_data$Domain_Item <- domain_name - fut_data[[fut_col]] <- NULL # Remove the original column - - # Combine past and future data for this domain - domain_long_data <- rbind(past_data, fut_data) - all_long_data <- rbind(all_long_data, domain_long_data) - } - - # Convert to factors with proper levels - all_long_data$TimePerspective <- factor(all_long_data$TimePerspective, levels = c("Past", "Future")) - all_long_data$Domain_Type <- factor(all_long_data$Domain_Type, levels = c("Preferences", "Personality", "Values", "Life_Satisfaction")) - all_long_data$Domain_Item <- factor(all_long_data$Domain_Item, levels = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change")) - all_long_data$pID <- as.factor(all_long_data$pID) - - return(all_long_data) -} - -# Reshape all data to long format -cat("\nReshaping data to long format...\n") -long_data <- reshape_all_domains(data, domain_info) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique domains:", length(unique(long_data$Domain_Item)), "\n") -cat("Domain types:", paste(unique(long_data$Domain_Type), collapse = ", "), "\n") -cat("Domain type counts:\n") -print(table(long_data$Domain_Type)) - -# Display structure and sample of long_data -cat("\nLong data structure:\n") -str(long_data) - -cat("\nFirst 10 rows of long_data:\n") -print(head(long_data, 10)) - -cat("\nColumn names:\n") 
-print(colnames(long_data)) - -# Show factor levels for domain variables -cat("\nDomain_Type factor levels:\n") -print(levels(long_data$Domain_Type)) - -cat("\nDomain_Item factor levels:\n") -print(levels(long_data$Domain_Item)) - -cat("\nTimePerspective factor levels:\n") -print(levels(long_data$TimePerspective)) - -# Show a sample with actual names instead of numbers -cat("\nSample data with actual names (first 6 rows):\n") -sample_data <- long_data[1:6, c("pID", "Domain_Type", "Domain_Item", "TimePerspective", "Difference")] -print(sample_data) - -# Show a better example - one participant across multiple domains -cat("\nExample: Participant 1 across multiple domains (first 10 rows):\n") -participant_1_data <- long_data[long_data$pID == 1, c("pID", "Domain_Type", "Domain_Item", "TimePerspective", "Difference")] -print(participant_1_data) - -# Show structure explanation -cat("\nLong format explanation:\n") -cat("- Each participant appears", length(unique(long_data$Domain_Item)) * 2, "times total\n") -cat("- (", length(unique(long_data$Domain_Item)), "domains × 2 time perspectives)\n") -cat("- Total rows per participant:", length(unique(long_data$Domain_Item)) * 2, "\n") -cat("- Total participants:", length(unique(long_data$pID)), "\n") -cat("- Expected total rows:", length(unique(long_data$pID)) * length(unique(long_data$Domain_Item)) * 2, "\n") -cat("- Actual total rows:", nrow(long_data), "\n") - -# STEP 2: ASSUMPTION CHECKING -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 2: CHECKING ASSUMPTIONS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") -head(long_data) -# 2.1 Check for missing values -missing_summary <- long_data %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(Difference)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("\nMissing values by domain and time perspective:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$Difference), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\nChecking for outliers...\n") -outlier_summary <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(Difference < lower_bound | Difference > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\nTesting normality...\n") -normality_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(Difference)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(Difference)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\nTesting homogeneity of variance...\n") -homogeneity_results <- long_data_clean %>% - group_by(Domain_Type, 
Domain_Item) %>% - summarise( - levene_p = leveneTest(Difference ~ TimePerspective)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance results:\n") -print(homogeneity_results) - -# STEP 3: DESCRIPTIVE STATISTICS -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -desc_stats <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - median = median(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - min = min(Difference), - max = max(Difference), - .groups = 'drop' - ) - -cat("Descriptive statistics:\n") -print(desc_stats) - -# STEP 4: MIXED ANOVA ANALYSES -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Overall analysis across all domains -cat("\n4.1 Overall Mixed ANOVA (all domains combined):\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - overall_anova <- ezANOVA( - data = long_data_clean, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Type), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Overall ANOVA Results:\n") - print(overall_anova) - - # Check sphericity - if (!is.null(overall_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(overall_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in overall ANOVA:", e$message, "\n") -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -domain_results <- list() - -for (domain_type in unique(long_data_clean$Domain_Type)) { - cat("\nAnalyzing domain type:", domain_type, "\n") - - domain_data <- long_data_clean[long_data_clean$Domain_Type == domain_type, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Item), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_type, ":\n") - print(domain_anova) - - domain_results[[domain_type]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_type, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain_type]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Individual domain item analyses -cat("\n4.3 Individual Domain Item Analyses:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -item_results <- list() - -for (domain_item in unique(long_data_clean$Domain_Item)) { - cat("\nAnalyzing individual item:", domain_item, "\n") - - item_data <- long_data_clean[long_data_clean$Domain_Item == domain_item, ] - - tryCatch({ - item_anova <- ezANOVA( - data = item_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_item, ":\n") - print(item_anova) - - item_results[[domain_item]] <- item_anova 
- - }, error = function(e) { - cat("Error in ANOVA for", domain_item, ":", e$message, "\n") - - # Fallback to paired t-test - past_vals <- item_data$Difference[item_data$TimePerspective == "Past"] - fut_vals <- item_data$Difference[item_data$TimePerspective == "Future"] - - if (length(past_vals) > 1 && length(fut_vals) > 1) { - t_test <- t.test(past_vals, fut_vals, paired = TRUE) - cat("Fallback paired t-test for", domain_item, ":\n") - cat("t =", round(t_test$statistic, 3), - ", df =", t_test$parameter, - ", p =", round(t_test$p.value, 5), "\n") - - item_results[[domain_item]] <- t_test - } - }) -} - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("ANALYSIS COMPLETE!\n") -cat(paste(rep("=", 80), collapse = ""), "\n") -cat("Summary:\n") -cat("- Total domains analyzed:", length(unique(long_data_clean$Domain_Item)), "\n") -cat("- Domain types analyzed:", length(unique(long_data_clean$Domain_Type)), "\n") -cat("- Individual item analyses completed:", length(item_results), "\n") diff --git a/.history/mixed anova_20250912114140.r b/.history/mixed anova_20250912114140.r deleted file mode 100644 index 6d19bba..0000000 --- a/.history/mixed anova_20250912114140.r +++ /dev/null @@ -1,384 +0,0 @@ -# Mixed ANOVA Analysis for Past vs Future Differences -# EOHI Experiment Data Analysis - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO) - -# STEP 1: PROPER DATA RESHAPING -# Define domains with their categories -domain_info <- data.frame( - domain = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"), - domain_type = c(rep("Preferences", 5), - rep("Personality", 5), - rep("Values", 5), - rep("Life_Satisfaction", 5)), - stringsAsFactors = FALSE -) - -# Display domain_info -cat("\nDomain Information:\n") -print(domain_info) -cat("\nDomain type summary:\n") -print(table(domain_info$domain_type)) - -# Function to reshape ALL domains at once with domain information -reshape_all_domains <- function(data, domain_info) { - all_long_data <- data.frame() - - for (i in 1:nrow(domain_info)) { - domain_name <- domain_info$domain[i] - domain_type <- domain_info$domain_type[i] - - past_col <- paste0("NPastDiff_", domain_name) - fut_col <- paste0("NFutDiff_", domain_name) - - # Check if columns exist - if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) { - cat("Warning: Columns", past_col, "or", fut_col, "not found\n") - next - } - - # Create long format data for this domain - using base R approach to avoid issues - # Past data - past_data <- data[, c("pID", "ResponseId", "GROUP", "TASK_DO", "TEMPORAL_DO", "ITEM_DO", "COC_DO", - "demo_sex", "demo_age_1", "AOT_total", "CRT_correct", past_col)] - past_data$TimePerspective <- "Past" - past_data$Difference <- past_data[[past_col]] - past_data$Domain_Type <- domain_type - past_data$Domain_Item <- domain_name - past_data[[past_col]] <- NULL # Remove the original column - - # Future data - 
fut_data <- data[, c("pID", "ResponseId", "GROUP", "TASK_DO", "TEMPORAL_DO", "ITEM_DO", "COC_DO", - "demo_sex", "demo_age_1", "AOT_total", "CRT_correct", fut_col)] - fut_data$TimePerspective <- "Future" - fut_data$Difference <- fut_data[[fut_col]] - fut_data$Domain_Type <- domain_type - fut_data$Domain_Item <- domain_name - fut_data[[fut_col]] <- NULL # Remove the original column - - # Combine past and future data for this domain - domain_long_data <- rbind(past_data, fut_data) - all_long_data <- rbind(all_long_data, domain_long_data) - } - - # Convert to factors with proper levels - all_long_data$TimePerspective <- factor(all_long_data$TimePerspective, levels = c("Past", "Future")) - all_long_data$Domain_Type <- factor(all_long_data$Domain_Type, levels = c("Preferences", "Personality", "Values", "Life_Satisfaction")) - all_long_data$Domain_Item <- factor(all_long_data$Domain_Item, levels = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change")) - all_long_data$pID <- as.factor(all_long_data$pID) - - return(all_long_data) -} - -# Reshape all data to long format -cat("\nReshaping data to long format...\n") -long_data <- reshape_all_domains(data, domain_info) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique domains:", length(unique(long_data$Domain_Item)), "\n") -cat("Domain types:", paste(unique(long_data$Domain_Type), collapse = ", "), "\n") -cat("Domain type counts:\n") -print(table(long_data$Domain_Type)) - -# Display structure and sample of long_data -cat("\nLong data structure:\n") -str(long_data) - -cat("\nFirst 10 rows of long_data:\n") -print(head(long_data, 10)) - -cat("\nColumn names:\n") -print(colnames(long_data)) - -# Show factor levels for domain variables -cat("\nDomain_Type factor levels:\n") -print(levels(long_data$Domain_Type)) - -cat("\nDomain_Item factor levels:\n") -print(levels(long_data$Domain_Item)) - -cat("\nTimePerspective factor levels:\n") -print(levels(long_data$TimePerspective)) - -# Show a sample with actual names instead of numbers -cat("\nSample data with actual names (first 6 rows):\n") -sample_data <- long_data[1:6, c("pID", "GROUP", "TASK_DO", "TEMPORAL_DO", "Domain_Type", "Domain_Item", "TimePerspective", "Difference")] -print(sample_data) - -# Show a better example - one participant across multiple domains -cat("\nExample: Participant 1 across multiple domains (first 10 rows):\n") -participant_1_data <- long_data[long_data$pID == 1, c("pID", "GROUP", "TASK_DO", "TEMPORAL_DO", "Domain_Type", "Domain_Item", "TimePerspective", "Difference")] -print(participant_1_data) - -# Show structure explanation -cat("\nLong format explanation:\n") -cat("- Each participant appears", length(unique(long_data$Domain_Item)) * 2, "times total\n") -cat("- (", length(unique(long_data$Domain_Item)), "domains × 2 time perspectives)\n") -cat("- Total rows per participant:", length(unique(long_data$Domain_Item)) * 2, "\n") -cat("- Total participants:", length(unique(long_data$pID)), "\n") -cat("- Expected total rows:", length(unique(long_data$pID)) * length(unique(long_data$Domain_Item)) * 2, "\n") -cat("- Actual total rows:", nrow(long_data), "\n") - -# STEP 2: ASSUMPTION CHECKING -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 2: CHECKING ASSUMPTIONS\n") -cat(paste(rep("=", 80), 
collapse = ""), "\n") -head(long_data) -# 2.1 Check for missing values -missing_summary <- long_data %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(Difference)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("\nMissing values by domain and time perspective:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$Difference), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\nChecking for outliers...\n") -outlier_summary <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(Difference < lower_bound | Difference > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\nTesting normality...\n") -normality_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(Difference)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(Difference)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\nTesting homogeneity of variance...\n") -homogeneity_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item) %>% - summarise( - levene_p = leveneTest(Difference ~ TimePerspective)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance results:\n") -print(homogeneity_results) - -# STEP 3: DESCRIPTIVE STATISTICS -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -desc_stats <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - median = median(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - min = min(Difference), - max = max(Difference), - .groups = 'drop' - ) - -cat("Descriptive statistics:\n") -print(desc_stats) - -# STEP 4: MIXED ANOVA ANALYSES -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Overall analysis across all domains -cat("\n4.1 Overall Mixed ANOVA (all domains combined):\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - overall_anova <- ezANOVA( - data = long_data_clean, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Type), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Overall ANOVA Results:\n") - print(overall_anova) - - # Check sphericity - if (!is.null(overall_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - 
print(overall_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in overall ANOVA:", e$message, "\n") -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -domain_results <- list() - -for (domain_type in unique(long_data_clean$Domain_Type)) { - cat("\nAnalyzing domain type:", domain_type, "\n") - - domain_data <- long_data_clean[long_data_clean$Domain_Type == domain_type, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Item), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_type, ":\n") - print(domain_anova) - - domain_results[[domain_type]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_type, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain_type]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Individual domain item analyses -cat("\n4.3 Individual Domain Item Analyses:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -item_results <- list() - -for (domain_item in unique(long_data_clean$Domain_Item)) { - cat("\nAnalyzing individual item:", domain_item, "\n") - - item_data <- long_data_clean[long_data_clean$Domain_Item == domain_item, ] - - tryCatch({ - item_anova <- ezANOVA( - data = item_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_item, ":\n") - print(item_anova) - - item_results[[domain_item]] <- item_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_item, ":", e$message, "\n") - - # Fallback to paired t-test - past_vals <- item_data$Difference[item_data$TimePerspective == "Past"] - fut_vals <- item_data$Difference[item_data$TimePerspective == "Future"] - - if (length(past_vals) > 1 && length(fut_vals) > 1) { - t_test <- t.test(past_vals, fut_vals, paired = TRUE) - cat("Fallback paired t-test for", domain_item, ":\n") - cat("t =", round(t_test$statistic, 3), - ", df =", t_test$parameter, - ", p =", round(t_test$p.value, 5), "\n") - - item_results[[domain_item]] <- t_test - } - }) -} - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("ANALYSIS COMPLETE!\n") -cat(paste(rep("=", 80), collapse = ""), "\n") -cat("Summary:\n") -cat("- Total domains analyzed:", length(unique(long_data_clean$Domain_Item)), "\n") -cat("- Domain types analyzed:", length(unique(long_data_clean$Domain_Type)), "\n") -cat("- Individual item analyses completed:", length(item_results), "\n") diff --git a/.history/mixed anova_20250912114154.r b/.history/mixed anova_20250912114154.r deleted file mode 100644 index 6d19bba..0000000 --- a/.history/mixed anova_20250912114154.r +++ /dev/null @@ -1,384 +0,0 @@ -# Mixed ANOVA Analysis for Past vs Future Differences -# EOHI Experiment Data Analysis - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset 
dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO) - -# STEP 1: PROPER DATA RESHAPING -# Define domains with their categories -domain_info <- data.frame( - domain = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"), - domain_type = c(rep("Preferences", 5), - rep("Personality", 5), - rep("Values", 5), - rep("Life_Satisfaction", 5)), - stringsAsFactors = FALSE -) - -# Display domain_info -cat("\nDomain Information:\n") -print(domain_info) -cat("\nDomain type summary:\n") -print(table(domain_info$domain_type)) - -# Function to reshape ALL domains at once with domain information -reshape_all_domains <- function(data, domain_info) { - all_long_data <- data.frame() - - for (i in 1:nrow(domain_info)) { - domain_name <- domain_info$domain[i] - domain_type <- domain_info$domain_type[i] - - past_col <- paste0("NPastDiff_", domain_name) - fut_col <- paste0("NFutDiff_", domain_name) - - # Check if columns exist - if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) { - cat("Warning: Columns", past_col, "or", fut_col, "not found\n") - next - } - - # Create long format data for this domain - using base R approach to avoid issues - # Past data - past_data <- data[, c("pID", "ResponseId", "GROUP", "TASK_DO", "TEMPORAL_DO", "ITEM_DO", "COC_DO", - "demo_sex", "demo_age_1", "AOT_total", "CRT_correct", past_col)] - past_data$TimePerspective <- "Past" - past_data$Difference <- past_data[[past_col]] - past_data$Domain_Type <- domain_type - past_data$Domain_Item <- domain_name - past_data[[past_col]] <- NULL # Remove the original column - - # Future data - fut_data <- data[, c("pID", "ResponseId", "GROUP", "TASK_DO", "TEMPORAL_DO", "ITEM_DO", "COC_DO", - "demo_sex", "demo_age_1", "AOT_total", "CRT_correct", fut_col)] - fut_data$TimePerspective <- "Future" - fut_data$Difference <- fut_data[[fut_col]] - fut_data$Domain_Type <- domain_type - fut_data$Domain_Item <- domain_name - fut_data[[fut_col]] <- NULL # Remove the original column - - # Combine past and future data for this domain - domain_long_data <- rbind(past_data, fut_data) - all_long_data <- rbind(all_long_data, domain_long_data) - } - - # Convert to factors with proper levels - all_long_data$TimePerspective <- factor(all_long_data$TimePerspective, levels = c("Past", "Future")) - all_long_data$Domain_Type <- factor(all_long_data$Domain_Type, levels = c("Preferences", "Personality", "Values", "Life_Satisfaction")) - all_long_data$Domain_Item <- factor(all_long_data$Domain_Item, levels = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change")) - all_long_data$pID <- as.factor(all_long_data$pID) - - return(all_long_data) -} - -# Reshape all data to long format -cat("\nReshaping data to long format...\n") -long_data <- reshape_all_domains(data, domain_info) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique domains:", 
length(unique(long_data$Domain_Item)), "\n") -cat("Domain types:", paste(unique(long_data$Domain_Type), collapse = ", "), "\n") -cat("Domain type counts:\n") -print(table(long_data$Domain_Type)) - -# Display structure and sample of long_data -cat("\nLong data structure:\n") -str(long_data) - -cat("\nFirst 10 rows of long_data:\n") -print(head(long_data, 10)) - -cat("\nColumn names:\n") -print(colnames(long_data)) - -# Show factor levels for domain variables -cat("\nDomain_Type factor levels:\n") -print(levels(long_data$Domain_Type)) - -cat("\nDomain_Item factor levels:\n") -print(levels(long_data$Domain_Item)) - -cat("\nTimePerspective factor levels:\n") -print(levels(long_data$TimePerspective)) - -# Show a sample with actual names instead of numbers -cat("\nSample data with actual names (first 6 rows):\n") -sample_data <- long_data[1:6, c("pID", "GROUP", "TASK_DO", "TEMPORAL_DO", "Domain_Type", "Domain_Item", "TimePerspective", "Difference")] -print(sample_data) - -# Show a better example - one participant across multiple domains -cat("\nExample: Participant 1 across multiple domains (first 10 rows):\n") -participant_1_data <- long_data[long_data$pID == 1, c("pID", "GROUP", "TASK_DO", "TEMPORAL_DO", "Domain_Type", "Domain_Item", "TimePerspective", "Difference")] -print(participant_1_data) - -# Show structure explanation -cat("\nLong format explanation:\n") -cat("- Each participant appears", length(unique(long_data$Domain_Item)) * 2, "times total\n") -cat("- (", length(unique(long_data$Domain_Item)), "domains × 2 time perspectives)\n") -cat("- Total rows per participant:", length(unique(long_data$Domain_Item)) * 2, "\n") -cat("- Total participants:", length(unique(long_data$pID)), "\n") -cat("- Expected total rows:", length(unique(long_data$pID)) * length(unique(long_data$Domain_Item)) * 2, "\n") -cat("- Actual total rows:", nrow(long_data), "\n") - -# STEP 2: ASSUMPTION CHECKING -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 2: CHECKING ASSUMPTIONS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") -head(long_data) -# 2.1 Check for missing values -missing_summary <- long_data %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(Difference)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("\nMissing values by domain and time perspective:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$Difference), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\nChecking for outliers...\n") -outlier_summary <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(Difference < lower_bound | Difference > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\nTesting normality...\n") -normality_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(Difference)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(Difference)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - 
anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\nTesting homogeneity of variance...\n") -homogeneity_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item) %>% - summarise( - levene_p = leveneTest(Difference ~ TimePerspective)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance results:\n") -print(homogeneity_results) - -# STEP 3: DESCRIPTIVE STATISTICS -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -desc_stats <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - median = median(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - min = min(Difference), - max = max(Difference), - .groups = 'drop' - ) - -cat("Descriptive statistics:\n") -print(desc_stats) - -# STEP 4: MIXED ANOVA ANALYSES -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Overall analysis across all domains -cat("\n4.1 Overall Mixed ANOVA (all domains combined):\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - overall_anova <- ezANOVA( - data = long_data_clean, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Type), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Overall ANOVA Results:\n") - print(overall_anova) - - # Check sphericity - if (!is.null(overall_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(overall_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in overall ANOVA:", e$message, "\n") -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -domain_results <- list() - -for (domain_type in unique(long_data_clean$Domain_Type)) { - cat("\nAnalyzing domain type:", domain_type, "\n") - - domain_data <- long_data_clean[long_data_clean$Domain_Type == domain_type, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Item), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_type, ":\n") - print(domain_anova) - - domain_results[[domain_type]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_type, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain_type]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Individual domain item analyses -cat("\n4.3 Individual Domain Item Analyses:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -item_results <- list() - -for (domain_item in 
unique(long_data_clean$Domain_Item)) { - cat("\nAnalyzing individual item:", domain_item, "\n") - - item_data <- long_data_clean[long_data_clean$Domain_Item == domain_item, ] - - tryCatch({ - item_anova <- ezANOVA( - data = item_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_item, ":\n") - print(item_anova) - - item_results[[domain_item]] <- item_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_item, ":", e$message, "\n") - - # Fallback to paired t-test - past_vals <- item_data$Difference[item_data$TimePerspective == "Past"] - fut_vals <- item_data$Difference[item_data$TimePerspective == "Future"] - - if (length(past_vals) > 1 && length(fut_vals) > 1) { - t_test <- t.test(past_vals, fut_vals, paired = TRUE) - cat("Fallback paired t-test for", domain_item, ":\n") - cat("t =", round(t_test$statistic, 3), - ", df =", t_test$parameter, - ", p =", round(t_test$p.value, 5), "\n") - - item_results[[domain_item]] <- t_test - } - }) -} - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("ANALYSIS COMPLETE!\n") -cat(paste(rep("=", 80), collapse = ""), "\n") -cat("Summary:\n") -cat("- Total domains analyzed:", length(unique(long_data_clean$Domain_Item)), "\n") -cat("- Domain types analyzed:", length(unique(long_data_clean$Domain_Type)), "\n") -cat("- Individual item analyses completed:", length(item_results), "\n") diff --git a/.history/mixed anova_20250912114156.r b/.history/mixed anova_20250912114156.r deleted file mode 100644 index 6d19bba..0000000 --- a/.history/mixed anova_20250912114156.r +++ /dev/null @@ -1,384 +0,0 @@ -# Mixed ANOVA Analysis for Past vs Future Differences -# EOHI Experiment Data Analysis - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO) - -# STEP 1: PROPER DATA RESHAPING -# Define domains with their categories -domain_info <- data.frame( - domain = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"), - domain_type = c(rep("Preferences", 5), - rep("Personality", 5), - rep("Values", 5), - rep("Life_Satisfaction", 5)), - stringsAsFactors = FALSE -) - -# Display domain_info -cat("\nDomain Information:\n") -print(domain_info) -cat("\nDomain type summary:\n") -print(table(domain_info$domain_type)) - -# Function to reshape ALL domains at once with domain information -reshape_all_domains <- function(data, domain_info) { - all_long_data <- data.frame() - - for (i in 1:nrow(domain_info)) { - domain_name <- domain_info$domain[i] - domain_type <- domain_info$domain_type[i] - - past_col <- paste0("NPastDiff_", domain_name) - fut_col <- paste0("NFutDiff_", domain_name) - - # Check if columns exist - if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) { - cat("Warning: Columns", past_col, "or", fut_col, "not found\n") - next - } - - # Create long format data for this domain 
- using base R approach to avoid issues - # Past data - past_data <- data[, c("pID", "ResponseId", "GROUP", "TASK_DO", "TEMPORAL_DO", "ITEM_DO", "COC_DO", - "demo_sex", "demo_age_1", "AOT_total", "CRT_correct", past_col)] - past_data$TimePerspective <- "Past" - past_data$Difference <- past_data[[past_col]] - past_data$Domain_Type <- domain_type - past_data$Domain_Item <- domain_name - past_data[[past_col]] <- NULL # Remove the original column - - # Future data - fut_data <- data[, c("pID", "ResponseId", "GROUP", "TASK_DO", "TEMPORAL_DO", "ITEM_DO", "COC_DO", - "demo_sex", "demo_age_1", "AOT_total", "CRT_correct", fut_col)] - fut_data$TimePerspective <- "Future" - fut_data$Difference <- fut_data[[fut_col]] - fut_data$Domain_Type <- domain_type - fut_data$Domain_Item <- domain_name - fut_data[[fut_col]] <- NULL # Remove the original column - - # Combine past and future data for this domain - domain_long_data <- rbind(past_data, fut_data) - all_long_data <- rbind(all_long_data, domain_long_data) - } - - # Convert to factors with proper levels - all_long_data$TimePerspective <- factor(all_long_data$TimePerspective, levels = c("Past", "Future")) - all_long_data$Domain_Type <- factor(all_long_data$Domain_Type, levels = c("Preferences", "Personality", "Values", "Life_Satisfaction")) - all_long_data$Domain_Item <- factor(all_long_data$Domain_Item, levels = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change")) - all_long_data$pID <- as.factor(all_long_data$pID) - - return(all_long_data) -} - -# Reshape all data to long format -cat("\nReshaping data to long format...\n") -long_data <- reshape_all_domains(data, domain_info) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique domains:", length(unique(long_data$Domain_Item)), "\n") -cat("Domain types:", paste(unique(long_data$Domain_Type), collapse = ", "), "\n") -cat("Domain type counts:\n") -print(table(long_data$Domain_Type)) - -# Display structure and sample of long_data -cat("\nLong data structure:\n") -str(long_data) - -cat("\nFirst 10 rows of long_data:\n") -print(head(long_data, 10)) - -cat("\nColumn names:\n") -print(colnames(long_data)) - -# Show factor levels for domain variables -cat("\nDomain_Type factor levels:\n") -print(levels(long_data$Domain_Type)) - -cat("\nDomain_Item factor levels:\n") -print(levels(long_data$Domain_Item)) - -cat("\nTimePerspective factor levels:\n") -print(levels(long_data$TimePerspective)) - -# Show a sample with actual names instead of numbers -cat("\nSample data with actual names (first 6 rows):\n") -sample_data <- long_data[1:6, c("pID", "GROUP", "TASK_DO", "TEMPORAL_DO", "Domain_Type", "Domain_Item", "TimePerspective", "Difference")] -print(sample_data) - -# Show a better example - one participant across multiple domains -cat("\nExample: Participant 1 across multiple domains (first 10 rows):\n") -participant_1_data <- long_data[long_data$pID == 1, c("pID", "GROUP", "TASK_DO", "TEMPORAL_DO", "Domain_Type", "Domain_Item", "TimePerspective", "Difference")] -print(participant_1_data) - -# Show structure explanation -cat("\nLong format explanation:\n") -cat("- Each participant appears", length(unique(long_data$Domain_Item)) * 2, "times total\n") -cat("- (", length(unique(long_data$Domain_Item)), "domains × 2 time perspectives)\n") 
-cat("- Total rows per participant:", length(unique(long_data$Domain_Item)) * 2, "\n") -cat("- Total participants:", length(unique(long_data$pID)), "\n") -cat("- Expected total rows:", length(unique(long_data$pID)) * length(unique(long_data$Domain_Item)) * 2, "\n") -cat("- Actual total rows:", nrow(long_data), "\n") - -# STEP 2: ASSUMPTION CHECKING -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 2: CHECKING ASSUMPTIONS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") -head(long_data) -# 2.1 Check for missing values -missing_summary <- long_data %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(Difference)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("\nMissing values by domain and time perspective:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$Difference), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\nChecking for outliers...\n") -outlier_summary <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(Difference < lower_bound | Difference > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\nTesting normality...\n") -normality_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(Difference)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(Difference)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\nTesting homogeneity of variance...\n") -homogeneity_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item) %>% - summarise( - levene_p = leveneTest(Difference ~ TimePerspective)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance results:\n") -print(homogeneity_results) - -# STEP 3: DESCRIPTIVE STATISTICS -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -desc_stats <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - median = median(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - min = min(Difference), - max = max(Difference), - .groups = 'drop' - ) - -cat("Descriptive statistics:\n") -print(desc_stats) - -# STEP 4: MIXED ANOVA ANALYSES -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Overall analysis across all domains -cat("\n4.1 Overall Mixed ANOVA (all domains combined):\n") -cat(paste(rep("-", 50), 
collapse = ""), "\n") - -tryCatch({ - overall_anova <- ezANOVA( - data = long_data_clean, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Type), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Overall ANOVA Results:\n") - print(overall_anova) - - # Check sphericity - if (!is.null(overall_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(overall_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in overall ANOVA:", e$message, "\n") -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -domain_results <- list() - -for (domain_type in unique(long_data_clean$Domain_Type)) { - cat("\nAnalyzing domain type:", domain_type, "\n") - - domain_data <- long_data_clean[long_data_clean$Domain_Type == domain_type, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Item), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_type, ":\n") - print(domain_anova) - - domain_results[[domain_type]] <- domain_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_type, ":", e$message, "\n") - - # Fallback to simpler analysis - cat("Attempting simpler repeated measures ANOVA...\n") - tryCatch({ - simple_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - print(simple_anova) - domain_results[[domain_type]] <- simple_anova - }, error = function(e2) { - cat("Simple ANOVA also failed:", e2$message, "\n") - }) - }) -} - -# 4.3 Individual domain item analyses -cat("\n4.3 Individual Domain Item Analyses:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -item_results <- list() - -for (domain_item in unique(long_data_clean$Domain_Item)) { - cat("\nAnalyzing individual item:", domain_item, "\n") - - item_data <- long_data_clean[long_data_clean$Domain_Item == domain_item, ] - - tryCatch({ - item_anova <- ezANOVA( - data = item_data, - dv = Difference, - wid = pID, - within = TimePerspective, - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_item, ":\n") - print(item_anova) - - item_results[[domain_item]] <- item_anova - - }, error = function(e) { - cat("Error in ANOVA for", domain_item, ":", e$message, "\n") - - # Fallback to paired t-test - past_vals <- item_data$Difference[item_data$TimePerspective == "Past"] - fut_vals <- item_data$Difference[item_data$TimePerspective == "Future"] - - if (length(past_vals) > 1 && length(fut_vals) > 1) { - t_test <- t.test(past_vals, fut_vals, paired = TRUE) - cat("Fallback paired t-test for", domain_item, ":\n") - cat("t =", round(t_test$statistic, 3), - ", df =", t_test$parameter, - ", p =", round(t_test$p.value, 5), "\n") - - item_results[[domain_item]] <- t_test - } - }) -} - -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("ANALYSIS COMPLETE!\n") -cat(paste(rep("=", 80), collapse = ""), "\n") -cat("Summary:\n") -cat("- Total domains analyzed:", length(unique(long_data_clean$Domain_Item)), "\n") -cat("- Domain types analyzed:", length(unique(long_data_clean$Domain_Type)), "\n") -cat("- Individual item analyses completed:", length(item_results), "\n") diff --git a/.history/mixed anova_20250912114707.r b/.history/mixed anova_20250912114707.r deleted file 
mode 100644 index fc6e940..0000000 --- a/.history/mixed anova_20250912114707.r +++ /dev/null @@ -1,398 +0,0 @@ -# Mixed ANOVA Analysis for Past vs Future Differences -# EOHI Experiment Data Analysis - -# Load required libraries -library(tidyverse) -library(ez) -library(car) -library(nortest) # For normality tests - -# Read the data -data <- read.csv("eohi1/exp1.csv") - -# Display basic information about the dataset -cat("Dataset dimensions:", dim(data), "\n") -cat("Number of participants:", length(unique(data$pID)), "\n") - -# Check experimental conditions -cat("\nExperimental conditions:\n") -table(data$GROUP, data$TASK_DO, data$TEMPORAL_DO) - -# Check what columns are actually available -cat("\nChecking available columns with NPast and NFut prefixes:\n") -npast_cols <- colnames(data)[grepl("^NPast", colnames(data))] -nfut_cols <- colnames(data)[grepl("^NFut", colnames(data))] - -cat("NPast columns found:\n") -print(npast_cols) - -cat("\nNFut columns found:\n") -print(nfut_cols) - -cat("\nTotal NPast columns:", length(npast_cols), "\n") -cat("Total NFut columns:", length(nfut_cols), "\n") - -# STEP 1: PROPER DATA RESHAPING -# Define domains with their categories -domain_info <- data.frame( - domain = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change"), - domain_type = c(rep("Preferences", 5), - rep("Personality", 5), - rep("Values", 5), - rep("Life_Satisfaction", 5)), - stringsAsFactors = FALSE -) - -# Display domain_info -cat("\nDomain Information:\n") -print(domain_info) -cat("\nDomain type summary:\n") -print(table(domain_info$domain_type)) - -# Function to reshape ALL domains at once with domain information -reshape_all_domains <- function(data, domain_info) { - all_long_data <- data.frame() - - for (i in 1:nrow(domain_info)) { - domain_name <- domain_info$domain[i] - domain_type <- domain_info$domain_type[i] - - past_col <- paste0("NPastDiff_", domain_name) - fut_col <- paste0("NFutDiff_", domain_name) - - # Check if columns exist - if (!(past_col %in% colnames(data)) || !(fut_col %in% colnames(data))) { - cat("Warning: Columns", past_col, "or", fut_col, "not found\n") - next - } - - # Create long format data for this domain - using base R approach to avoid issues - # Past data - past_data <- data[, c("pID", "ResponseId", "GROUP", "TASK_DO", "TEMPORAL_DO", "ITEM_DO", "COC_DO", - "demo_sex", "demo_age_1", "AOT_total", "CRT_correct", past_col)] - past_data$TimePerspective <- "Past" - past_data$Difference <- past_data[[past_col]] - past_data$Domain_Type <- domain_type - past_data$Domain_Item <- domain_name - past_data[[past_col]] <- NULL # Remove the original column - - # Future data - fut_data <- data[, c("pID", "ResponseId", "GROUP", "TASK_DO", "TEMPORAL_DO", "ITEM_DO", "COC_DO", - "demo_sex", "demo_age_1", "AOT_total", "CRT_correct", fut_col)] - fut_data$TimePerspective <- "Future" - fut_data$Difference <- fut_data[[fut_col]] - fut_data$Domain_Type <- domain_type - fut_data$Domain_Item <- domain_name - fut_data[[fut_col]] <- NULL # Remove the original column - - # Combine past and future data for this domain - domain_long_data <- rbind(past_data, fut_data) - all_long_data <- rbind(all_long_data, domain_long_data) - } - - # Convert to factors with proper levels - all_long_data$TimePerspective <- 
factor(all_long_data$TimePerspective, levels = c("Past", "Future")) - all_long_data$Domain_Type <- factor(all_long_data$Domain_Type, levels = c("Preferences", "Personality", "Values", "Life_Satisfaction")) - all_long_data$Domain_Item <- factor(all_long_data$Domain_Item, levels = c("pref_read", "pref_music", "pref_tv", "pref_nap", "pref_travel", - "pers_extravert", "pers_critical", "pers_dependable", "pers_anxious", "pers_complex", - "val_obey", "val_trad", "val_opinion", "val_performance", "val_justice", - "life_ideal", "life_excellent", "life_satisfied", "life_important", "life_change")) - all_long_data$pID <- as.factor(all_long_data$pID) - - return(all_long_data) -} - -# Reshape all data to long format -cat("\nReshaping data to long format...\n") -long_data <- reshape_all_domains(data, domain_info) - -cat("Long format data dimensions:", dim(long_data), "\n") -cat("Unique domains:", length(unique(long_data$Domain_Item)), "\n") -cat("Domain types:", paste(unique(long_data$Domain_Type), collapse = ", "), "\n") -cat("Domain type counts:\n") -print(table(long_data$Domain_Type)) - -# Display structure and sample of long_data -cat("\nLong data structure:\n") -str(long_data) - -cat("\nFirst 10 rows of long_data:\n") -print(head(long_data, 10)) - -cat("\nColumn names:\n") -print(colnames(long_data)) - -# Show factor levels for domain variables -cat("\nDomain_Type factor levels:\n") -print(levels(long_data$Domain_Type)) - -cat("\nDomain_Item factor levels:\n") -print(levels(long_data$Domain_Item)) - -cat("\nTimePerspective factor levels:\n") -print(levels(long_data$TimePerspective)) - -# Show a sample with actual names instead of numbers -cat("\nSample data with actual names (first 6 rows):\n") -sample_data <- long_data[1:6, c("pID", "GROUP", "TASK_DO", "TEMPORAL_DO", "Domain_Type", "Domain_Item", "TimePerspective", "Difference")] -print(sample_data) - -# Show a better example - one participant across multiple domains -cat("\nExample: Participant 1 across multiple domains (first 10 rows):\n") -participant_1_data <- long_data[long_data$pID == 1, c("pID", "GROUP", "TASK_DO", "TEMPORAL_DO", "Domain_Type", "Domain_Item", "TimePerspective", "Difference")] -print(participant_1_data) - -# Show structure explanation -cat("\nLong format explanation:\n") -cat("- Each participant appears", length(unique(long_data$Domain_Item)) * 2, "times total\n") -cat("- (", length(unique(long_data$Domain_Item)), "domains × 2 time perspectives)\n") -cat("- Total rows per participant:", length(unique(long_data$Domain_Item)) * 2, "\n") -cat("- Total participants:", length(unique(long_data$pID)), "\n") -cat("- Expected total rows:", length(unique(long_data$pID)) * length(unique(long_data$Domain_Item)) * 2, "\n") -cat("- Actual total rows:", nrow(long_data), "\n") - -# STEP 2: ASSUMPTION CHECKING - -cat("STEP 2: CHECKING ASSUMPTIONS\n") - -head(long_data) -# 2.1 Check for missing values -missing_summary <- long_data %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n_total = n(), - n_missing = sum(is.na(Difference)), - pct_missing = round(100 * n_missing / n_total, 2), - .groups = 'drop' - ) - -cat("\nMissing values by domain and time perspective:\n") -print(missing_summary) - -# Remove missing values -long_data_clean <- long_data[!is.na(long_data$Difference), ] -cat("\nData after removing missing values:", dim(long_data_clean), "\n") - -# 2.2 Outlier detection -cat("\nChecking for outliers...\n") -outlier_summary <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - 
summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - iqr = q3 - q1, - lower_bound = q1 - 1.5 * iqr, - upper_bound = q3 + 1.5 * iqr, - n_outliers = sum(Difference < lower_bound | Difference > upper_bound), - .groups = 'drop' - ) - -cat("Outlier summary (IQR method):\n") -print(outlier_summary) - -# 2.3 Normality tests -cat("\nTesting normality...\n") -normality_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - shapiro_p = ifelse(n >= 3 & n <= 5000, - shapiro.test(Difference)$p.value, - NA), - anderson_p = ifelse(n >= 7, - ad.test(Difference)$p.value, - NA), - .groups = 'drop' - ) %>% - mutate( - shapiro_normal = shapiro_p > 0.05, - anderson_normal = anderson_p > 0.05, - overall_normal = case_when( - !is.na(shapiro_p) & !is.na(anderson_p) ~ shapiro_normal & anderson_normal, - !is.na(shapiro_p) ~ shapiro_normal, - !is.na(anderson_p) ~ anderson_normal, - TRUE ~ NA - ) - ) - -cat("Normality test results:\n") -print(normality_results) - -# 2.4 Homogeneity of variance (Levene's test) -cat("\nTesting homogeneity of variance...\n") -homogeneity_results <- long_data_clean %>% - group_by(Domain_Type, Domain_Item) %>% - summarise( - levene_p = leveneTest(Difference ~ TimePerspective)$`Pr(>F)`[1], - homogeneous = levene_p > 0.05, - .groups = 'drop' - ) - -cat("Homogeneity of variance results:\n") -print(homogeneity_results) - -# STEP 3: DESCRIPTIVE STATISTICS -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 3: DESCRIPTIVE STATISTICS\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -desc_stats <- long_data_clean %>% - group_by(Domain_Type, Domain_Item, TimePerspective) %>% - summarise( - n = n(), - mean = mean(Difference), - sd = sd(Difference), - median = median(Difference), - q1 = quantile(Difference, 0.25), - q3 = quantile(Difference, 0.75), - min = min(Difference), - max = max(Difference), - .groups = 'drop' - ) - -cat("Descriptive statistics:\n") -print(desc_stats) - -# STEP 4: MIXED ANOVA ANALYSES -cat("\n", paste(rep("=", 80), collapse = ""), "\n") -cat("STEP 4: MIXED ANOVA ANALYSES\n") -cat(paste(rep("=", 80), collapse = ""), "\n") - -# 4.1 Overall analysis across all domains -cat("\n4.1 Overall Mixed ANOVA (all domains combined):\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -tryCatch({ - overall_anova <- ezANOVA( - data = long_data_clean, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Type), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE, - return_aov = TRUE - ) - - cat("Overall ANOVA Results:\n") - print(overall_anova) - - # Check sphericity - if (!is.null(overall_anova$`Mauchly's Test for Sphericity`)) { - cat("\nSphericity test results:\n") - print(overall_anova$`Mauchly's Test for Sphericity`) - } - -}, error = function(e) { - cat("Error in overall ANOVA:", e$message, "\n") -}) - -# 4.2 Domain-specific analyses -cat("\n4.2 Domain-specific Mixed ANOVAs:\n") -cat(paste(rep("-", 50), collapse = ""), "\n") - -domain_results <- list() - -for (domain_type in unique(long_data_clean$Domain_Type)) { - cat("\nAnalyzing domain type:", domain_type, "\n") - - domain_data <- long_data_clean[long_data_clean$Domain_Type == domain_type, ] - - tryCatch({ - domain_anova <- ezANOVA( - data = domain_data, - dv = Difference, - wid = pID, - within = c(TimePerspective, Domain_Item), - between = c(GROUP, TASK_DO), - type = 3, - detailed = TRUE - ) - - cat("ANOVA results for", domain_type, ":\n") - 
-# STEP 4: MIXED ANOVA ANALYSES
-cat("\n", paste(rep("=", 80), collapse = ""), "\n")
-cat("STEP 4: MIXED ANOVA ANALYSES\n")
-cat(paste(rep("=", 80), collapse = ""), "\n")
-
-# 4.1 Overall analysis across all domains
-cat("\n4.1 Overall Mixed ANOVA (all domains combined):\n")
-cat(paste(rep("-", 50), collapse = ""), "\n")
-
-tryCatch({
-  overall_anova <- ezANOVA(
-    data = long_data_clean,
-    dv = Difference,
-    wid = pID,
-    within = c(TimePerspective, Domain_Type),
-    between = c(GROUP, TASK_DO),
-    type = 3,
-    detailed = TRUE,
-    return_aov = TRUE
-  )
-
-  cat("Overall ANOVA Results:\n")
-  print(overall_anova)
-
-  # Report sphericity when ezANOVA returns Mauchly's test
-  if (!is.null(overall_anova$`Mauchly's Test for Sphericity`)) {
-    cat("\nSphericity test results:\n")
-    print(overall_anova$`Mauchly's Test for Sphericity`)
-  }
-
-}, error = function(e) {
-  cat("Error in overall ANOVA:", e$message, "\n")
-})
-
-# 4.2 Domain-specific analyses
-cat("\n4.2 Domain-specific Mixed ANOVAs:\n")
-cat(paste(rep("-", 50), collapse = ""), "\n")
-
-domain_results <- list()
-
-for (domain_type in unique(long_data_clean$Domain_Type)) {
-  cat("\nAnalyzing domain type:", domain_type, "\n")
-
-  domain_data <- long_data_clean[long_data_clean$Domain_Type == domain_type, ]
-
-  tryCatch({
-    domain_anova <- ezANOVA(
-      data = domain_data,
-      dv = Difference,
-      wid = pID,
-      within = c(TimePerspective, Domain_Item),
-      between = c(GROUP, TASK_DO),
-      type = 3,
-      detailed = TRUE
-    )
-
-    cat("ANOVA results for", domain_type, ":\n")
-    print(domain_anova)
-
-    domain_results[[domain_type]] <- domain_anova
-
-  }, error = function(e) {
-    cat("Error in ANOVA for", domain_type, ":", e$message, "\n")
-
-    # Fallback to simpler analysis: drop Domain_Item, keep TimePerspective
-    cat("Attempting simpler repeated measures ANOVA...\n")
-    tryCatch({
-      simple_anova <- ezANOVA(
-        data = domain_data,
-        dv = Difference,
-        wid = pID,
-        within = TimePerspective,
-        between = c(GROUP, TASK_DO),
-        type = 3,
-        detailed = TRUE
-      )
-      print(simple_anova)
-      # <<- so the assignment escapes the error handler's local scope
-      domain_results[[domain_type]] <<- simple_anova
-    }, error = function(e2) {
-      cat("Simple ANOVA also failed:", e2$message, "\n")
-    })
-  })
-}
-
-# 4.3 Individual domain item analyses
-cat("\n4.3 Individual Domain Item Analyses:\n")
-cat(paste(rep("-", 50), collapse = ""), "\n")
-
-item_results <- list()
-
-for (domain_item in unique(long_data_clean$Domain_Item)) {
-  cat("\nAnalyzing individual item:", domain_item, "\n")
-
-  item_data <- long_data_clean[long_data_clean$Domain_Item == domain_item, ]
-
-  tryCatch({
-    item_anova <- ezANOVA(
-      data = item_data,
-      dv = Difference,
-      wid = pID,
-      within = TimePerspective,
-      between = c(GROUP, TASK_DO),
-      type = 3,
-      detailed = TRUE
-    )
-
-    cat("ANOVA results for", domain_item, ":\n")
-    print(item_anova)
-
-    item_results[[domain_item]] <- item_anova
-
-  }, error = function(e) {
-    cat("Error in ANOVA for", domain_item, ":", e$message, "\n")
-
-    # Fallback to a paired t-test; pairing requires equal-length vectors in
-    # matching participant order, so align both on the common pIDs first
-    past <- item_data[item_data$TimePerspective == "Past", ]
-    fut  <- item_data[item_data$TimePerspective == "Future", ]
-    common <- intersect(past$pID, fut$pID)
-    past_vals <- past$Difference[match(common, past$pID)]
-    fut_vals  <- fut$Difference[match(common, fut$pID)]
-
-    if (length(common) > 1) {
-      t_test <- t.test(past_vals, fut_vals, paired = TRUE)
-      cat("Fallback paired t-test for", domain_item, ":\n")
-      cat("t =", round(t_test$statistic, 3),
-          ", df =", t_test$parameter,
-          ", p =", round(t_test$p.value, 5), "\n")
-
-      # <<- so the assignment escapes the error handler's local scope
-      item_results[[domain_item]] <<- t_test
-    }
-  })
-}
-
-cat("\n", paste(rep("=", 80), collapse = ""), "\n")
-cat("ANALYSIS COMPLETE!\n")
-cat(paste(rep("=", 80), collapse = ""), "\n")
-cat("Summary:\n")
-cat("- Total domains analyzed:", length(unique(long_data_clean$Domain_Item)), "\n")
-cat("- Domain types analyzed:", length(unique(long_data_clean$Domain_Type)), "\n")
-cat("- Individual item analyses completed:", length(item_results), "\n")
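The 20 item-level analyses in 4.3 form one comparison family, so their unadjusted p-values overstate the evidence. A minimal sketch of a Holm correction over whatever fallback t-tests were collected above (assumes item_results as built in the loop; entries holding ezANOVA output are skipped as NA):

item_p <- vapply(item_results, function(res) {
  if (inherits(res, "htest")) res$p.value else NA_real_
}, numeric(1))
cat("Holm-adjusted p-values for fallback t-tests:\n")
print(round(p.adjust(item_p, method = "holm"), 4))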
diff --git a/.history/mixed anova_20250912114714.r b/.history/mixed anova_20250912114714.r
deleted file mode 100644
index fc6e940..0000000
--- a/.history/mixed anova_20250912114714.r
+++ /dev/null
diff --git a/.history/mixed anova_20250912124604.r b/.history/mixed anova_20250912124604.r
deleted file mode 100644
index fc6e940..0000000
--- a/.history/mixed anova_20250912124604.r
+++ /dev/null